From 086ed8172a5d47b701b56343bd8c4d191dcd3256 Mon Sep 17 00:00:00 2001 From: rohola Date: Mon, 6 Apr 2020 14:51:39 -0600 Subject: [PATCH] first commit --- .gitignore | 18 + README.md | 56 + config.py | 109 ++ configs/interact_config.json | 15 + configs/train_config.json | 23 + configs/train_emotion_recognition_config.json | 23 + configs/train_full_config.json | 23 + configs/train_multihead_config.json | 23 + eval_emotion_recognition.py | 208 +++ evaluate.py | 195 ++ interact.py | 161 ++ pytorch_pretrained_bert/__init__.py | 26 + pytorch_pretrained_bert/__main__.py | 83 + .../convert_gpt2_checkpoint_to_pytorch.py | 72 + .../convert_openai_checkpoint_to_pytorch.py | 72 + .../convert_tf_checkpoint_to_pytorch.py | 66 + ...onvert_transfo_xl_checkpoint_to_pytorch.py | 116 ++ pytorch_pretrained_bert/file_utils.py | 279 +++ pytorch_pretrained_bert/modeling.py | 1623 +++++++++++++++++ pytorch_pretrained_bert/modeling_gpt2.py | 821 +++++++++ pytorch_pretrained_bert/modeling_openai.py | 1089 +++++++++++ .../modeling_transfo_xl.py | 1392 ++++++++++++++ .../modeling_transfo_xl_utilities.py | 402 ++++ pytorch_pretrained_bert/optimization.py | 302 +++ .../optimization_openai.py | 127 ++ pytorch_pretrained_bert/tokenization.py | 434 +++++ pytorch_pretrained_bert/tokenization_gpt2.py | 311 ++++ .../tokenization_openai.py | 313 ++++ .../tokenization_transfo_xl.py | 586 ++++++ requirements.txt | 9 + train.py | 239 +++ train_emotion_recognition.py | 286 +++ train_full.py | 253 +++ train_multihead.py | 297 +++ utils.py | 225 +++ 35 files changed, 10277 insertions(+) create mode 100644 README.md create mode 100644 config.py create mode 100644 configs/interact_config.json create mode 100644 configs/train_config.json create mode 100644 configs/train_emotion_recognition_config.json create mode 100644 configs/train_full_config.json create mode 100644 configs/train_multihead_config.json create mode 100644 eval_emotion_recognition.py create mode 100644 evaluate.py create mode 100644 interact.py create mode 100644 pytorch_pretrained_bert/__init__.py create mode 100644 pytorch_pretrained_bert/__main__.py create mode 100755 pytorch_pretrained_bert/convert_gpt2_checkpoint_to_pytorch.py create mode 100755 pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py create mode 100755 pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py create mode 100755 pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py create mode 100644 pytorch_pretrained_bert/file_utils.py create mode 100644 pytorch_pretrained_bert/modeling.py create mode 100644 pytorch_pretrained_bert/modeling_gpt2.py create mode 100644 pytorch_pretrained_bert/modeling_openai.py create mode 100644 pytorch_pretrained_bert/modeling_transfo_xl.py create mode 100644 pytorch_pretrained_bert/modeling_transfo_xl_utilities.py create mode 100644 pytorch_pretrained_bert/optimization.py create mode 100644 pytorch_pretrained_bert/optimization_openai.py create mode 100644 pytorch_pretrained_bert/tokenization.py create mode 100644 pytorch_pretrained_bert/tokenization_gpt2.py create mode 100644 pytorch_pretrained_bert/tokenization_openai.py create mode 100644 pytorch_pretrained_bert/tokenization_transfo_xl.py create mode 100644 requirements.txt create mode 100644 train.py create mode 100644 train_emotion_recognition.py create mode 100644 train_full.py create mode 100644 train_multihead.py create mode 100644 utils.py diff --git a/.gitignore b/.gitignore index e69de29..dbaabc3 100644 --- a/.gitignore +++ b/.gitignore @@ -0,0 +1,18 @@ +.vscode +experiments/ 
+data/ +dataset_cache* +dataset1_cache* +daily_dialog_* +runs/ +ParlAI/ +__pycache__ +.idea/* +env/* +ParlAI/* +model/* +logs/* +caches/* +_OpenAIGPTTokenizer +out +emp_transfo_checkpoint/* \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..fff39f1 --- /dev/null +++ b/README.md @@ -0,0 +1,56 @@ +# EmpTransfo: A Multi-head Transformer Architecture for Creating Empathetic Dialog Systems + +The present repo contains the code for the paper https://arxiv.org/abs/2003.02958 +on empathetic dialog systems. The repository is heavily influenced by https://github.com/huggingface/transfer-learning-conv-ai + + +## Installation +To install and use the training and inference scripts, please clone the repo and install the requirements: + +```bash +git clone git@github.com:roholazandie/EmpTransfo.git +cd EmpTransfo +pip install -r requirements.txt + +``` + + +## Interact with the chatbot +You can download the checkpoint model [here](https://drive.google.com/open?id=1EjpK0YEVG1i9meLJzt7ZgODr0k65lTDi), extract it, and point to it via the "model_checkpoint" value in interact_config.json. +For example: +``` +"model_checkpoint" : "/home/rohola/codes/EmpTransfo/emp_transfo_checkpoint" +``` +Then run interact.py: +```bash +python interact.py +``` + +## Dataset +The original DailyDialog dataset is [here](https://www.aclweb.org/anthology/I17-1099/). We changed its format for our purposes, and the reformatted dataset can be downloaded +from [here](https://drive.google.com/open?id=1T4AdY7wku8srL_xWSxgt-OHqdLFVo3s3). + + +## Training + +The script train_multihead.py uses three heads with all features. + + +The script train_full.py uses two heads (next-sentence prediction and LM head), but uses all the features. + + +The script train_emotion_recognition.py trains the model to predict the next emotion (without no_emotion). + +The script train.py trains without any features of the dataset (the base model). + +For all training scripts, just change the dataset_path in the config file for that task, and then run the script +without any arguments.
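For example, to fine-tune the base model you would point the training config at your local copy of the reformatted dataset and then launch the script. This is a minimal sketch: it assumes train.py reads configs/train_config.json (in the same way the other scripts read their corresponding config files), and the dataset path shown is only a placeholder, not a required location:

```bash
# 1. Download and extract the reformatted DailyDialog JSON (link above), e.g. to ./data/
# 2. Edit configs/train_config.json so that "dataset_path" points to it:
#      "dataset_path": "data/daily_dialog.json"
# 3. Run the training script with no arguments
python train.py
```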
+ + + +## Citation +If you use this code in your research, you can cite our ANLP paper: + +``` + +``` \ No newline at end of file diff --git a/config.py b/config.py new file mode 100644 index 0000000..445f2a2 --- /dev/null +++ b/config.py @@ -0,0 +1,109 @@ +import json + + +class Config: + + def __init__(self, + dataset_path="", + dataset_cache="", + model_checkpoint="", + num_candidates=2, + do_lower_case=True, + max_history=2, + train_batch_size=4, + valid_batch_size=4, + gradient_accumulation_steps=8, + lr=5e-5, + warmup_proportion=0.1, + lm_coef=1, + mc_coef=1, + max_norm=10, + n_epochs=2, + personality_permutations=1, + eval_before_start=False, + device="cpu", + fp16="", + local_rank=-1, + log_dir="", + ): + self.dataset_path = dataset_path + self.dataset_cache = dataset_cache + self.model_checkpoint = model_checkpoint + self.num_candidates = num_candidates + self.do_lower_case = do_lower_case + self.max_history = max_history + self.train_batch_size = train_batch_size + self.valid_batch_size = valid_batch_size + self.gradient_accumulation_steps = gradient_accumulation_steps + self.lr = lr + self.warmup_proportion = warmup_proportion + self.lm_coef = lm_coef + self.mc_coef = mc_coef + self.max_norm = max_norm + self.n_epochs = n_epochs + self.personality_permutations = personality_permutations + self.eval_before_start = eval_before_start + self.device = device + self.fp16 = fp16 + self.local_rank = local_rank + self.log_dir = log_dir + + @classmethod + def from_dict(cls, json_object): + config = Config() + for key in json_object: + config.__dict__[key] = json_object[key] + return config + + @classmethod + def from_json_file(cls, json_file): + with open(json_file) as f: + config_json = f.read() + + return cls.from_dict(json.loads(config_json)) + + +class InteractConfig: + + def __init__(self, + dataset_path="", + model="", + dataset_cache="", + model_checkpoint="", + max_history="", + device="", + no_sample="", + max_length="", + min_length="", + seed="", + temperature="", + top_k="", + top_p="" + ): + self.dataset_path = dataset_path + self.model = model + self.dataset_cache = dataset_cache + self.model_checkpoint = model_checkpoint + self.max_history = max_history + self.device = device + self.no_sample = no_sample + self.max_length = max_length + self.min_length = min_length + self.seed = seed + self.temperature = temperature + self.top_k = top_k + self.top_p = top_p + + @classmethod + def from_dict(cls, json_object): + config = InteractConfig() + for key in json_object: + config.__dict__[key] = json_object[key] + return config + + @classmethod + def from_json_file(cls, json_file): + with open(json_file) as f: + config_json = f.read() + + return cls.from_dict(json.loads(config_json)) diff --git a/configs/interact_config.json b/configs/interact_config.json new file mode 100644 index 0000000..ab31a1b --- /dev/null +++ b/configs/interact_config.json @@ -0,0 +1,15 @@ +{ + "dataset_path" : "/home/rohola/data/daily_dialog_full/daily_dialog.json", + "model" : "openai-gpt", + "dataset_cache" : "./caches/dataset_cache_OpenAIGPTTokenizer", + "model_checkpoint" : "/home/rohola/codes/EmpTransfo/emp_transfo_checkpoint", + "max_history" : 2, + "device" : "cpu", + "no_sample" : true, + "max_length" : 20, + "min_length" : 1, + "seed" : 42, + "temperature" : 0.7, + "top_k" : 0, + "top_p" : 0.9 +} \ No newline at end of file diff --git a/configs/train_config.json b/configs/train_config.json new file mode 100644 index 0000000..c1b58f8 --- /dev/null +++ b/configs/train_config.json @@ -0,0 +1,23 @@ +{ + 
"dataset_path": "/home/rohola/data/daily_dialog.json" , + "dataset_cache": "./daily_dialog_dataset_cache", + "model_checkpoint": "openai-gpt", + "num_candidates": 2, + "do_lower_case": true, + "max_history": 2, + "train_batch_size": 1, + "valid_batch_size": 1, + "gradient_accumulation_steps": 8, + "lr": 6.25e-5, + "warmup_proportion": 0.1, + "lm_coef": 1.0, + "mc_coef": 1.0, + "max_norm": 1.0, + "n_epochs": 3, + "personality_permutations":1, + "eval_before_start": false, + "device": "cuda:0", + "fp16": "", + "local_rank": -1, + "log_dir": "" +} diff --git a/configs/train_emotion_recognition_config.json b/configs/train_emotion_recognition_config.json new file mode 100644 index 0000000..7a2fcbd --- /dev/null +++ b/configs/train_emotion_recognition_config.json @@ -0,0 +1,23 @@ +{ + "dataset_path": "/home/rohola/data/daily_dialog_full/daily_dialog.json" , + "dataset_cache": "./daily_dialog_dataset_cache", + "model_checkpoint": "openai-gpt", + "num_candidates": 2, + "do_lower_case": true, + "max_history": 2, + "train_batch_size": 1, + "valid_batch_size": 1, + "gradient_accumulation_steps": 8, + "lr": 6.25e-5, + "warmup_proportion": 0.1, + "lm_coef": 1.0, + "mc_coef": 1.0, + "max_norm": 1.0, + "n_epochs": 3, + "personality_permutations":1, + "eval_before_start": false, + "device": "cpu", + "fp16": "", + "local_rank": -1, + "log_dir": "" +} diff --git a/configs/train_full_config.json b/configs/train_full_config.json new file mode 100644 index 0000000..ddcaf33 --- /dev/null +++ b/configs/train_full_config.json @@ -0,0 +1,23 @@ +{ + "dataset_path": "/home/rohola/data/daily_dialog_full/daily_dialog.json" , + "dataset_cache": "./caches/daily_dialog_dataset_cache", + "model_checkpoint": "openai-gpt", + "num_candidates": 2, + "do_lower_case": true, + "max_history": 2, + "train_batch_size": 1, + "valid_batch_size": 1, + "gradient_accumulation_steps": 8, + "lr": 6.25e-5, + "warmup_proportion": 0.1, + "lm_coef": 1.0, + "mc_coef": 1.0, + "max_norm": 1.0, + "n_epochs": 3, + "personality_permutations":1, + "eval_before_start": false, + "device": "cuda:0", + "fp16": "", + "local_rank": -1, + "log_dir": "" +} diff --git a/configs/train_multihead_config.json b/configs/train_multihead_config.json new file mode 100644 index 0000000..abeb7d7 --- /dev/null +++ b/configs/train_multihead_config.json @@ -0,0 +1,23 @@ +{ + "dataset_path": "/home/rohola/data/daily_dialog_topic/daily_dialog.json" , + "dataset_cache": "caches/daily_dialog_multihead", + "model_checkpoint": "openai-gpt", + "num_candidates": 2, + "do_lower_case": true, + "max_history": 2, + "train_batch_size": 1, + "valid_batch_size": 1, + "gradient_accumulation_steps": 8, + "lr": 6.25e-5, + "warmup_proportion": 0.1, + "lm_coef": 1.0, + "mc_coef": 1.0, + "max_norm": 1.0, + "n_epochs": 3, + "personality_permutations":1, + "eval_before_start": false, + "device": "cuda:0", + "fp16": "", + "local_rank": -1, + "log_dir": "" +} diff --git a/eval_emotion_recognition.py b/eval_emotion_recognition.py new file mode 100644 index 0000000..5a3e41e --- /dev/null +++ b/eval_emotion_recognition.py @@ -0,0 +1,208 @@ +# Copyright (c) 2019-present, HuggingFace Inc. +# All rights reserved. This source code is licensed under the BSD-style license found in the LICENSE file in the root directory of this source tree. 
+import logging +from pprint import pformat +from collections import defaultdict +from itertools import chain + +import torch +from torch.nn.parallel import DistributedDataParallel +from torch.utils.data import DataLoader, TensorDataset + +from config import Config +from pytorch_pretrained_bert import (OpenAIAdam, OpenAIGPTDoubleHeadLMEmotionRecognitionModel, OpenAIGPTTokenizer, + GPT2DoubleHeadsModel, GPT2Tokenizer, WEIGHTS_NAME, CONFIG_NAME, + BertModel, BertTokenizer) + +from utils import get_dataset, get_dataset_for_daily_dialog + +SPECIAL_TOKENS = ["", "", "", "", + "", "", "", "", "", "", "", + "", "", "", "", + ""] +MODEL_INPUTS = ["input_ids", "mc_token_ids", "lm_labels", "mc_labels", "token_type_ids", "token_emotion_ids"] +PADDED_INPUTS = ["input_ids", "lm_labels", "token_type_ids", "token_emotion_ids"] + +logger = logging.getLogger(__file__) + +def average_distributed_scalar(scalar, config): + """ Average a scalar over the nodes if we are in distributed training. We use this for distributed evaluation. """ + if config.local_rank == -1: + return scalar + scalar_t = torch.tensor(scalar, dtype=torch.float, device=config.device) / torch.distributed.get_world_size() + torch.distributed.all_reduce(scalar_t, op=torch.distributed.ReduceOp.SUM) + return scalar_t.item() + + +def pad_dataset(dataset, padding=0): + """ Pad the dataset. This could be optimized by defining a Dataset class and padd only batches but this is simpler. """ + max_l = max(len(x) for x in dataset["input_ids"]) + for name in PADDED_INPUTS: + dataset[name] = [x + [padding if name != "lm_labels" else -1] * (max_l - len(x)) for x in dataset[name]] + return dataset + + +def get_emotion_label(tokenizer, candidate_emotion): + _, _, _, _, no_emotion_id, happiness_id, surprise_id, sadness_id, disgust_id, anger_id, fear_id, _, _, _, _, _ = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS) + if candidate_emotion == happiness_id: + return 0 + elif candidate_emotion == surprise_id: + return 1 + elif candidate_emotion == sadness_id: + return 2 + elif candidate_emotion == disgust_id: + return 3 + elif candidate_emotion == anger_id: + return 4 + elif candidate_emotion == fear_id: + return 5 + elif candidate_emotion == no_emotion_id: + return 6 + + +def build_input_from_segments(history, emotions, reply, true_emotion, tokenizer, with_eos=True): + """ Build a sequence of input from 3 segments: persona, history and last reply """ + bos, eos, speaker1, speaker2 = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[:4]) + #tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1]) + + instance = {} + # sequence = [[bos] + history[0] + list(chain(*history[1:]))] + [reply + ([eos] if with_eos else [])] #seq = [personas, history, reply] concatenate all persona sentences + sequence = [[bos] + history[0]] + history[1:] + [reply + ([eos] if with_eos else [])] + sequence = [[speaker2 if (len(sequence)-i) % 2 else speaker1] + s for i, s in enumerate(sequence)] + + instance["input_ids"] = list(chain(*sequence)) + instance["token_type_ids"] = [speaker2 if i % 2 else speaker1 for i, s in enumerate(sequence) for _ in s] # the last for is for repeating the speaker1 and speaker2 for all tokens + #instance["token_emotion_ids"] = [emotions[i] for i, s in enumerate(sequence[:-1]) for _ in s] + [true_emotion] * len(sequence[-1]) + instance["token_emotion_ids"] = [emotions[i] for i, s in enumerate(sequence[:-1]) for _ in s] + + instance["mc_token_ids"] = len(instance["input_ids"]) - 1 + instance["mc_labels"] = get_emotion_label(tokenizer, true_emotion) + 
instance["lm_labels"] = ([-1] * sum(len(s) for s in sequence[:-1])) + [-1] + sequence[-1][1:] #all -1 except for reply, reply is just the ids + return instance, sequence + + +def get_data_loaders(config, tokenizer): + """ Prepare the dataset for training and evaluation """ + personachat = get_dataset_for_daily_dialog(tokenizer, config.dataset_path, config.dataset_cache, SPECIAL_TOKENS) + + #personachat["train"] = personachat["train"][:100] + #personachat["valid"] = personachat["valid"][:10] + + logger.info("Build inputs and labels") + datasets = {"train": defaultdict(list), "valid": defaultdict(list)} + c = 0 + for dataset_name, dataset in personachat.items(): + num_candidates = 2#len(dataset[0]["utterances"][0]["candidates"]) + if config.num_candidates > 0 and dataset_name == 'train': + num_candidates = min(config.num_candidates, num_candidates) + for dialog in dataset: + for utterance in dialog["utterances"]: + history = utterance["history"][-(2 * config.max_history + 1):] + emotions = utterance["emotion"][-(2 * config.max_history + 1):] + reply = utterance["candidates"][-1] + true_emotion = utterance['candidates_emotions'][-1] + if true_emotion == tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS)[4]: + continue + instance, _ = build_input_from_segments(history, + emotions, + reply, + true_emotion, + tokenizer) + + if len(instance["input_ids"]) > 310: + truncated_history = [hist[:10] for hist in history] + truncated_candidate = reply[:10] + true_emotion = utterance['candidates_emotions'][-1] + instance, _ = build_input_from_segments(truncated_history, + emotions, + truncated_candidate, + true_emotion, + tokenizer) + c+=1 + + for input_name, input_array in instance.items(): + datasets[dataset_name][input_name].append(input_array) + + #datasets[dataset_name]["mc_labels"].append(num_candidates - 1) + datasets[dataset_name]["n_candidates"] = num_candidates + print(c) + logger.info("Pad inputs and convert to Tensor") + tensor_datasets = {"train": [], "valid": []} + for dataset_name, dataset in datasets.items(): + dataset = pad_dataset(dataset, padding=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1])) + for input_name in MODEL_INPUTS: + tensor = torch.tensor(dataset[input_name]) + #if input_name != "mc_labels": + # tensor = tensor.view((-1, datasets[dataset_name]["n_candidates"]) + tensor.shape[1:]) + tensor_datasets[dataset_name].append(tensor) + + logger.info("Build train and validation dataloaders") + train_dataset, valid_dataset = TensorDataset(*tensor_datasets["train"]), TensorDataset(*tensor_datasets["valid"]) + train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) if config.distributed else None + valid_sampler = torch.utils.data.distributed.DistributedSampler(valid_dataset) if config.distributed else None + train_loader = DataLoader(train_dataset, sampler=train_sampler, batch_size=config.train_batch_size, shuffle=False) + valid_loader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=config.valid_batch_size, shuffle=False) + + logger.info("Train dataset (Batch, Candidates, Seq length): {}".format(train_dataset.tensors[0].shape)) + logger.info("Valid dataset (Batch, Candidates, Seq length): {}".format(valid_dataset.tensors[0].shape)) + return train_loader, valid_loader, train_sampler, valid_sampler + + +def train(): + config_file = "configs/train_full_pipeline_config.json" + config = Config.from_json_file(config_file) + + # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. 
logger.info => log main process only, logger.warning => log all processes + logging.basicConfig(level=logging.INFO if config.local_rank in [-1, 0] else logging.WARN) + logger.warning("Running process %d", config.local_rank) # This is a logger.warning: it will be printed by all distributed processes + logger.info("Arguments: %s", pformat(config)) + + # Initialize distributed training if needed + config.distributed = (config.local_rank != -1) + if config.distributed: + torch.cuda.set_device(config.local_rank) + config.device = torch.device("cuda", config.local_rank) + torch.distributed.init_process_group(backend='nccl', init_method='env://') + + logger.info("Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning") + tokenizer_class = GPT2Tokenizer if "gpt2" in config.model_checkpoint else OpenAIGPTTokenizer + tokenizer = tokenizer_class.from_pretrained(config.model_checkpoint) + model_class = GPT2DoubleHeadsModel if "gpt2" in config.model_checkpoint else OpenAIGPTDoubleHeadLMEmotionRecognitionModel + model = model_class.from_pretrained(config.model_checkpoint) + tokenizer.set_special_tokens(SPECIAL_TOKENS) + model.set_num_special_tokens(len(SPECIAL_TOKENS)) + model.to(config.device) + optimizer = OpenAIAdam(model.parameters(), lr=config.lr) + + # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last) + if config.fp16: + from apex import amp # Apex is only required if we use fp16 training + model, optimizer = amp.initialize(model, optimizer, opt_level=config.fp16) + if config.distributed: + model = DistributedDataParallel(model, device_ids=[config.local_rank], output_device=config.local_rank) + + logger.info("Prepare datasets") + train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(config, tokenizer) + + # Evaluation function and evaluator (evaluator output is the input of the metrics) + model.eval() + num_correct = 0 + num_all = len(val_loader) + for batch in val_loader: + with torch.no_grad(): + batch = tuple(input_tensor.to(config.device) for input_tensor in batch) + input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids, token_emotion_ids = batch + + model_outputs = model(input_ids, mc_token_ids, token_type_ids=token_type_ids, token_emotion_ids=token_emotion_ids) + lm_logits, mc_logits = model_outputs[0], model_outputs[1] # So we can also use GPT2 outputs + + indices = torch.argmax(mc_logits, dim=1) + + correct = torch.eq(indices, mc_labels).view(-1) + num_correct += torch.sum(correct).item() + + print(num_correct / num_all) + + +if __name__ == "__main__": + train() diff --git a/evaluate.py b/evaluate.py new file mode 100644 index 0000000..3bcfffb --- /dev/null +++ b/evaluate.py @@ -0,0 +1,195 @@ +# # Copyright (c) 2019-present, HuggingFace Inc. +# All rights reserved. +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+import logging +import random +from argparse import ArgumentParser +from itertools import chain +from pprint import pformat +import numpy as np + +import torch +import torch.nn.functional as F +from tqdm import tqdm + +from config import InteractConfig +from pytorch_pretrained_bert import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer, GPT2LMHeadModel, GPT2Tokenizer +from utils import download_pretrained_model, get_dataset, _bleu, _f1_score + + + +def build_input_from_segments(persona, history, reply, tokenizer, SPECIAL_TOKENS, lm_labels=False, with_eos=True): + """ Build a sequence of input from 3 segments: persona, history and last reply """ + bos, eos, speaker1, speaker2 = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[:-1]) + + instance = {} + sequence = [[bos] + list(chain(*persona))] + history + [ + reply + ([eos] if with_eos else [])] # seq = [personas, history, reply] concatenate all persona sentences + sequence = [sequence[0]] + [[speaker2 if (len(sequence) - i) % 2 else speaker1] + s for i, s in + enumerate(sequence[1:])] + + instance["input_ids"] = list(chain(*sequence)) + instance["token_type_ids"] = [speaker2 if i % 2 else speaker1 for i, s in enumerate(sequence) for _ in + s] # the last for is for repeating the speaker1 and speaker2 for all tokens + instance["mc_token_ids"] = len(instance["input_ids"]) - 1 + instance["lm_labels"] = [-1] * len(instance["input_ids"]) + if lm_labels: + instance["lm_labels"] = ([-1] * sum(len(s) for s in sequence[:-1])) + [-1] + sequence[-1][1:] # all -1 except for reply, reply is just the ids + return instance, sequence + + + +def top_filtering(logits, top_k=0, top_p=0.0, threshold=-float('Inf'), filter_value=-float('Inf')): + """ Filter a distribution of logits using top-k, top-p (nucleus) and/or threshold filtering + Args: + logits: logits distribution shape (..., vocabulary size) + top_k: <=0: no filtering, >0: keep only top k tokens with highest probability. + top_p: <=0.0: no filtering, >0.0: keep only a subset S of candidates, where S is the smallest subset + whose total probability mass is greater than or equal to the threshold top_p. + In practice, we select the highest probability tokens whose cumulative probability mass exceeds + the threshold top_p. 
+ threshold: a minimal threshold to keep logits + """ + top_k = min(top_k, logits.size(-1)) + if top_k > 0: + # Remove all tokens with a probability less than the last token in the top-k tokens + indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None] + logits[indices_to_remove] = filter_value + + if top_p > 0.0: + # Compute cumulative probabilities of sorted tokens + sorted_logits, sorted_indices = torch.sort(logits, descending=True) + cumulative_probabilities = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) + + # Remove tokens with cumulative probability above the threshold + sorted_indices_to_remove = cumulative_probabilities > top_p + # Shift the indices to the right to keep also the first token above the threshold + sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() + sorted_indices_to_remove[..., 0] = 0 + + # Back to unsorted indices and set them to -infinity + indices_to_remove = sorted_indices[sorted_indices_to_remove] + logits[indices_to_remove] = filter_value + + indices_to_remove = logits < threshold + logits[indices_to_remove] = filter_value + + return logits + + +def get_emotions(dataset): + + + for data in tqdm(dataset['valid']): + utterances = data['utterances'] + + for utterance in utterances: + true_emotion = utterance["emotion"] + + +def calculate_metrics(args, model, tokenizer, dataset, special_tokens): + special_tokens_ids = tokenizer.convert_tokens_to_ids(special_tokens) + + all_blues = [] + all_f1_scores = [] + all_true_sentences = [] + all_predicted_sentences = [] + for data in tqdm(dataset['valid']): + personality = data['personality'] + utterances = data['utterances'] + + #utterance = utterances[-1] #only the longest conversaion + for utterance in utterances: + true_label = utterance['candidates'][-1] + history = utterance['history'] + predicted_output = [] + for i in range(args.max_length): + instance, _ = build_input_from_segments(personality, history, predicted_output, tokenizer, special_tokens, with_eos=False) + + try: + + if len(instance["input_ids"]) > 310: + truncated_history = [hist[:5] for hist in history] + instance, _ = build_input_from_segments(personality, truncated_history, predicted_output, tokenizer, special_tokens, with_eos=False) + + input_ids = torch.tensor(instance["input_ids"], device=args.device).unsqueeze(0) + token_type_ids = torch.tensor(instance["token_type_ids"], device=args.device).unsqueeze(0) + + logits = model(input_ids, token_type_ids=token_type_ids) + except: + print("exception") + continue + + if "gpt2" == args.model: + logits = logits[0] + logits = logits[0, -1, :] / args.temperature + logits = top_filtering(logits, top_k=args.top_k, top_p=args.top_p) + probs = F.softmax(logits, dim=-1) + + prev = torch.topk(probs, 1)[1] if args.no_sample else torch.multinomial(probs, 1) + # if i < args.min_length and prev.item() in special_tokens_ids: + # k=0 + # while prev.item() in special_tokens_ids and k < 100: + # prev = torch.multinomial(probs, num_samples=1) + # k+=1 + + if i < args.min_length: + prev = torch.multinomial(probs, num_samples=1) + + # if prev.item() in special_tokens_ids: + # break + predicted_output.append(prev.item()) + + predicted_sentence = tokenizer.decode(predicted_output, skip_special_tokens=True) + true_sentence = tokenizer.decode(true_label, skip_special_tokens=True) + #looks like zero gives the best results + + all_predicted_sentences.append(predicted_sentence) + all_true_sentences.append(true_sentence) + + bleus = [_bleu(predicted_sentence, [true_sentence], 
method="method"+str(i)) for i in [0,1,2,3,5]] + #bleu = _bleu(predicted_sentence, [true_sentence]) + f1_score = _f1_score(predicted_sentence, [true_sentence]) + #print(f1_score) + all_blues.append(bleus) + all_f1_scores.append(f1_score) + #compare predicted and label with bleu + + + print("avg bleu", np.array(all_blues).mean(axis=0)) + print("avg f1 score", np.mean(all_f1_scores)) + print("max bleu", np.array(all_blues).max(axis=0)) + + +def run(): + config_file = "configs/interact_config.json" + config = InteractConfig.from_json_file(config_file) + + logging.basicConfig(level=logging.INFO) + logger = logging.getLogger(__file__) + logger.info(pformat(config)) + + if config.model_checkpoint == "": + config.model_checkpoint = download_pretrained_model() + + random.seed(config.seed) + torch.random.manual_seed(config.seed) + torch.cuda.manual_seed(config.seed) + + logger.info("Get pretrained model and tokenizer") + tokenizer_class = GPT2Tokenizer if "gpt2" == config.model else OpenAIGPTTokenizer + tokenizer = tokenizer_class.from_pretrained(config.model_checkpoint) + model_class = GPT2LMHeadModel if "gpt2" == config.model else OpenAIGPTLMHeadModel + model = model_class.from_pretrained(config.model_checkpoint) + + model.to(config.device) + model.eval() + + dataset = get_dataset(tokenizer, config.dataset_path, config.dataset_cache) + + special_tokens = ["", "", "", "", ""] + calculate_metrics(config, model, tokenizer, dataset, special_tokens) + +if __name__ == "__main__": + run() diff --git a/interact.py b/interact.py new file mode 100644 index 0000000..c59e160 --- /dev/null +++ b/interact.py @@ -0,0 +1,161 @@ +# # Copyright (c) 2019-present, HuggingFace Inc. +# All rights reserved. +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+import logging +import random +from argparse import ArgumentParser +from itertools import chain +from pprint import pformat + +import torch +import torch.nn.functional as F + +from config import InteractConfig +from pytorch_pretrained_bert import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer, GPT2LMHeadModel, GPT2Tokenizer, \ + BertTokenizer +from pytorch_pretrained_bert.modeling import BertLMHeadModel +from utils import get_dataset_personalities, download_pretrained_model, get_dataset + + +def build_input_from_segments(history, reply, tokenizer, SPECIAL_TOKENS, lm_labels=False, with_eos=True): + """ Build a sequence of input from 3 segments: persona, history and last reply """ + bos, eos, speaker1, speaker2 = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[:-1]) + persona = [] + instance = {} + sequence = [[bos] + list(chain(*persona))] + history + [ + reply + ([eos] if with_eos else [])] # seq = [personas, history, reply] concatenate all persona sentences + sequence = [sequence[0]] + [[speaker2 if (len(sequence) - i) % 2 else speaker1] + s for i, s in + enumerate(sequence[1:])] + + instance["input_ids"] = list(chain(*sequence)) + instance["token_type_ids"] = [speaker2 if i % 2 else speaker1 for i, s in enumerate(sequence) for _ in + s] # the last for is for repeating the speaker1 and speaker2 for all tokens + instance["mc_token_ids"] = len(instance["input_ids"]) - 1 + instance["lm_labels"] = [-1] * len(instance["input_ids"]) + if lm_labels: + instance["lm_labels"] = ([-1] * sum(len(s) for s in sequence[:-1])) + [-1] + sequence[-1][1:] # all -1 except for reply, reply is just the ids + return instance, sequence + + +def top_filtering(logits, top_k=0, top_p=0.0, threshold=-float('Inf'), filter_value=-float('Inf')): + """ Filter a distribution of logits using top-k, top-p (nucleus) and/or threshold filtering + Args: + logits: logits distribution shape (..., vocabulary size) + top_k: <=0: no filtering, >0: keep only top k tokens with highest probability. + top_p: <=0.0: no filtering, >0.0: keep only a subset S of candidates, where S is the smallest subset + whose total probability mass is greater than or equal to the threshold top_p. + In practice, we select the highest probability tokens whose cumulative probability mass exceeds + the threshold top_p. 
+ threshold: a minimal threshold to keep logits + """ + top_k = min(top_k, logits.size(-1)) + if top_k > 0: + # Remove all tokens with a probability less than the last token in the top-k tokens + indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None] + logits[indices_to_remove] = filter_value + + if top_p > 0.0: + # Compute cumulative probabilities of sorted tokens + sorted_logits, sorted_indices = torch.sort(logits, descending=True) + cumulative_probabilities = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) + + # Remove tokens with cumulative probability above the threshold + sorted_indices_to_remove = cumulative_probabilities > top_p + # Shift the indices to the right to keep also the first token above the threshold + sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() + sorted_indices_to_remove[..., 0] = 0 + + # Back to unsorted indices and set them to -infinity + indices_to_remove = sorted_indices[sorted_indices_to_remove] + logits[indices_to_remove] = filter_value + + indices_to_remove = logits < threshold + logits[indices_to_remove] = filter_value + + return logits + + +def sample_sequence(history, tokenizer, model, args, SPECIAL_TOKENS, current_output=None): + special_tokens_ids = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS) + + if current_output is None: + current_output = [] + + for i in range(args.max_length): + instance, sequence = build_input_from_segments(history, current_output, tokenizer, SPECIAL_TOKENS, + with_eos=False) + + input_ids = torch.tensor(instance["input_ids"], device=args.device).unsqueeze(0) + token_type_ids = torch.tensor(instance["token_type_ids"], device=args.device).unsqueeze(0) + + logits = model(input_ids, token_type_ids=token_type_ids) + + if "gpt2" == args.model: + logits = logits[0] + logits = logits[0, -1, :] / args.temperature + logits = top_filtering(logits, top_k=args.top_k, top_p=args.top_p) + probs = F.softmax(logits, dim=-1) + + prev = torch.topk(probs, 1)[1] if args.no_sample else torch.multinomial(probs, 1) + if i < args.min_length and prev.item() in special_tokens_ids: + while prev.item() in special_tokens_ids: + prev = torch.multinomial(probs, num_samples=1) + + if prev.item() in special_tokens_ids: + break + current_output.append(prev.item()) + + return current_output + + +def run(): + config_file = "configs/interact_config.json" + config = InteractConfig.from_json_file(config_file) + + logging.basicConfig(level=logging.INFO) + logger = logging.getLogger(__file__) + logger.info(pformat(config)) + + if config.model_checkpoint == "": + config.model_checkpoint = download_pretrained_model() + + torch.random.manual_seed(config.seed) + torch.cuda.manual_seed(config.seed) + + logger.info("Get pretrained model and tokenizer") + if config.model == "bert": + tokenizer_class = BertTokenizer + model_class = BertLMHeadModel + elif config.model == "gpt2": + tokenizer_class = GPT2Tokenizer + model_class = GPT2LMHeadModel + else: + tokenizer_class = OpenAIGPTTokenizer + model_class = OpenAIGPTLMHeadModel + + SPECIAL_TOKENS = ["", "", "", "", ""] + + tokenizer = tokenizer_class.from_pretrained(config.model_checkpoint) + model = model_class.from_pretrained(config.model_checkpoint) + + model.to(config.device) + model.eval() + + history = [] + while True: + raw_text = input(">>> ") + while not raw_text: + print('Prompt should not be empty!') + raw_text = input(">>> ") + history.append(tokenizer.encode(raw_text)) + with torch.no_grad(): + out_ids = sample_sequence(history, tokenizer, model, config, 
SPECIAL_TOKENS) + history.append(out_ids) + history = history[-(2 * config.max_history + 1):] + out_text = tokenizer.decode(out_ids, skip_special_tokens=True) + print(out_text) + + +if __name__ == "__main__": + run() diff --git a/pytorch_pretrained_bert/__init__.py b/pytorch_pretrained_bert/__init__.py new file mode 100644 index 0000000..522f976 --- /dev/null +++ b/pytorch_pretrained_bert/__init__.py @@ -0,0 +1,26 @@ +__version__ = "0.6.2" +from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer +from .tokenization_openai import OpenAIGPTTokenizer +from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus) +from .tokenization_gpt2 import GPT2Tokenizer + +from .modeling import (BertConfig, BertModel, BertForPreTraining, + BertForMaskedLM, BertForNextSentencePrediction, + BertForSequenceClassification, BertForMultipleChoice, + BertForTokenClassification, BertForQuestionAnswering, + load_tf_weights_in_bert) +from .modeling_openai import (OpenAIGPTConfig, OpenAIGPTModel, + OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel, OpenAIGPTDoubleHeadLMEmotionRecognitionModel, + OpenAIGPTForEmotionDetection, + OpenAIGPTMultiHeadModel, + load_tf_weights_in_openai_gpt) +from .modeling_transfo_xl import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel, + load_tf_weights_in_transfo_xl) +from .modeling_gpt2 import (GPT2Config, GPT2Model, + GPT2LMHeadModel, GPT2DoubleHeadsModel, GPT2MultipleChoiceHead, + load_tf_weights_in_gpt2) + +from .optimization import BertAdam +from .optimization_openai import OpenAIAdam + +from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE, cached_path, WEIGHTS_NAME, CONFIG_NAME diff --git a/pytorch_pretrained_bert/__main__.py b/pytorch_pretrained_bert/__main__.py new file mode 100644 index 0000000..a2aae9e --- /dev/null +++ b/pytorch_pretrained_bert/__main__.py @@ -0,0 +1,83 @@ +# coding: utf8 +def main(): + import sys + if (len(sys.argv) != 4 and len(sys.argv) != 5) or sys.argv[1] not in [ + "convert_tf_checkpoint_to_pytorch", + "convert_openai_checkpoint", + "convert_transfo_xl_checkpoint", + "convert_gpt2_checkpoint", + ]: + print( + "Should be used as one of: \n" + ">> `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`, \n" + ">> `pytorch_pretrained_bert convert_openai_checkpoint OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`, \n" + ">> `pytorch_pretrained_bert convert_transfo_xl_checkpoint TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG]` or \n" + ">> `pytorch_pretrained_bert convert_gpt2_checkpoint TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG]`") + else: + if sys.argv[1] == "convert_tf_checkpoint_to_pytorch": + try: + from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch + except ImportError: + print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, " + "In that case, it requires TensorFlow to be installed. 
Please see " + "https://www.tensorflow.org/install/ for installation instructions.") + raise + + if len(sys.argv) != 5: + # pylint: disable=line-too-long + print("Should be used as `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`") + else: + PYTORCH_DUMP_OUTPUT = sys.argv.pop() + TF_CONFIG = sys.argv.pop() + TF_CHECKPOINT = sys.argv.pop() + convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT) + elif sys.argv[1] == "convert_openai_checkpoint": + from .convert_openai_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch + OPENAI_GPT_CHECKPOINT_FOLDER_PATH = sys.argv[2] + PYTORCH_DUMP_OUTPUT = sys.argv[3] + if len(sys.argv) == 5: + OPENAI_GPT_CONFIG = sys.argv[4] + else: + OPENAI_GPT_CONFIG = "" + convert_openai_checkpoint_to_pytorch(OPENAI_GPT_CHECKPOINT_FOLDER_PATH, + OPENAI_GPT_CONFIG, + PYTORCH_DUMP_OUTPUT) + elif sys.argv[1] == "convert_transfo_xl_checkpoint": + try: + from .convert_transfo_xl_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch + except ImportError: + print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, " + "In that case, it requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions.") + raise + + if 'ckpt' in sys.argv[2].lower(): + TF_CHECKPOINT = sys.argv[2] + TF_DATASET_FILE = "" + else: + TF_DATASET_FILE = sys.argv[2] + TF_CHECKPOINT = "" + PYTORCH_DUMP_OUTPUT = sys.argv[3] + if len(sys.argv) == 5: + TF_CONFIG = sys.argv[4] + else: + TF_CONFIG = "" + convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT, TF_DATASET_FILE) + else: + try: + from .convert_gpt2_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch + except ImportError: + print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, " + "In that case, it requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions.") + raise + + TF_CHECKPOINT = sys.argv[2] + PYTORCH_DUMP_OUTPUT = sys.argv[3] + if len(sys.argv) == 5: + TF_CONFIG = sys.argv[4] + else: + TF_CONFIG = "" + convert_gpt2_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT) +if __name__ == '__main__': + main() diff --git a/pytorch_pretrained_bert/convert_gpt2_checkpoint_to_pytorch.py b/pytorch_pretrained_bert/convert_gpt2_checkpoint_to_pytorch.py new file mode 100755 index 0000000..51d52a6 --- /dev/null +++ b/pytorch_pretrained_bert/convert_gpt2_checkpoint_to_pytorch.py @@ -0,0 +1,72 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Convert OpenAI GPT checkpoint.""" + +from __future__ import absolute_import, division, print_function + +import argparse +from io import open + +import torch + +from pytorch_pretrained_bert.modeling_gpt2 import (CONFIG_NAME, WEIGHTS_NAME, + GPT2Config, + GPT2Model, + load_tf_weights_in_gpt2) + + +def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path): + # Construct model + if gpt2_config_file == "": + config = GPT2Config() + else: + config = GPT2Config(gpt2_config_file) + model = GPT2Model(config) + + # Load weights from numpy + load_tf_weights_in_gpt2(model, gpt2_checkpoint_path) + + # Save pytorch-model + pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME + pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME + print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) + torch.save(model.state_dict(), pytorch_weights_dump_path) + print("Save configuration file to {}".format(pytorch_config_dump_path)) + with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: + f.write(config.to_json_string()) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + ## Required parameters + parser.add_argument("--gpt2_checkpoint_path", + default = None, + type = str, + required = True, + help = "Path the TensorFlow checkpoint path.") + parser.add_argument("--pytorch_dump_folder_path", + default = None, + type = str, + required = True, + help = "Path to the output PyTorch model.") + parser.add_argument("--gpt2_config_file", + default = "", + type = str, + help = "An optional config json file corresponding to the pre-trained OpenAI model. \n" + "This specifies the model architecture.") + args = parser.parse_args() + convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path, + args.gpt2_config_file, + args.pytorch_dump_folder_path) diff --git a/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py b/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py new file mode 100755 index 0000000..566008a --- /dev/null +++ b/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py @@ -0,0 +1,72 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Convert OpenAI GPT checkpoint.""" + +from __future__ import absolute_import, division, print_function + +import argparse +from io import open + +import torch + +from pytorch_pretrained_bert.modeling_openai import (CONFIG_NAME, WEIGHTS_NAME, + OpenAIGPTConfig, + OpenAIGPTModel, + load_tf_weights_in_openai_gpt) + + +def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path): + # Construct model + if openai_config_file == "": + config = OpenAIGPTConfig() + else: + config = OpenAIGPTConfig(openai_config_file) + model = OpenAIGPTModel(config) + + # Load weights from numpy + load_tf_weights_in_openai_gpt(model, openai_checkpoint_folder_path) + + # Save pytorch-model + pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME + pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME + print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) + torch.save(model.state_dict(), pytorch_weights_dump_path) + print("Save configuration file to {}".format(pytorch_config_dump_path)) + with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: + f.write(config.to_json_string()) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + ## Required parameters + parser.add_argument("--openai_checkpoint_folder_path", + default = None, + type = str, + required = True, + help = "Path the TensorFlow checkpoint path.") + parser.add_argument("--pytorch_dump_folder_path", + default = None, + type = str, + required = True, + help = "Path to the output PyTorch model.") + parser.add_argument("--openai_config_file", + default = "", + type = str, + help = "An optional config json file corresponding to the pre-trained OpenAI model. \n" + "This specifies the model architecture.") + args = parser.parse_args() + convert_openai_checkpoint_to_pytorch(args.openai_checkpoint_folder_path, + args.openai_config_file, + args.pytorch_dump_folder_path) diff --git a/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py b/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py new file mode 100755 index 0000000..13d9638 --- /dev/null +++ b/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py @@ -0,0 +1,66 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Convert BERT checkpoint.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import re +import argparse +import tensorflow as tf +import torch +import numpy as np + +from pytorch_pretrained_bert.modeling import BertConfig, BertForPreTraining, load_tf_weights_in_bert + +def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path): + # Initialise PyTorch model + config = BertConfig.from_json_file(bert_config_file) + print("Building PyTorch model from configuration: {}".format(str(config))) + model = BertForPreTraining(config) + + # Load weights from tf checkpoint + load_tf_weights_in_bert(model, tf_checkpoint_path) + + # Save pytorch-model + print("Save PyTorch model to {}".format(pytorch_dump_path)) + torch.save(model.state_dict(), pytorch_dump_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + ## Required parameters + parser.add_argument("--tf_checkpoint_path", + default = None, + type = str, + required = True, + help = "Path the TensorFlow checkpoint path.") + parser.add_argument("--bert_config_file", + default = None, + type = str, + required = True, + help = "The config json file corresponding to the pre-trained BERT model. \n" + "This specifies the model architecture.") + parser.add_argument("--pytorch_dump_path", + default = None, + type = str, + required = True, + help = "Path to the output PyTorch model.") + args = parser.parse_args() + convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, + args.bert_config_file, + args.pytorch_dump_path) diff --git a/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py b/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py new file mode 100755 index 0000000..8d6b965 --- /dev/null +++ b/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py @@ -0,0 +1,116 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert Transformer XL checkpoint and datasets.""" + +from __future__ import absolute_import, division, print_function + +import argparse +import os +import sys +from io import open + +import torch + +import pytorch_pretrained_bert.tokenization_transfo_xl as data_utils +from pytorch_pretrained_bert.modeling_transfo_xl import (CONFIG_NAME, + WEIGHTS_NAME, + TransfoXLConfig, + TransfoXLLMHeadModel, + load_tf_weights_in_transfo_xl) +from pytorch_pretrained_bert.tokenization_transfo_xl import (CORPUS_NAME, + VOCAB_NAME) + +if sys.version_info[0] == 2: + import cPickle as pickle +else: + import pickle + +# We do this to be able to load python 2 datasets pickles +# See e.g. 
https://stackoverflow.com/questions/2121874/python-pickling-after-changing-a-modules-directory/2121918#2121918 +data_utils.Vocab = data_utils.TransfoXLTokenizer +data_utils.Corpus = data_utils.TransfoXLCorpus +sys.modules['data_utils'] = data_utils +sys.modules['vocabulary'] = data_utils + +def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path, + transfo_xl_config_file, + pytorch_dump_folder_path, + transfo_xl_dataset_file): + if transfo_xl_dataset_file: + # Convert a pre-processed corpus (see original TensorFlow repo) + with open(transfo_xl_dataset_file, "rb") as fp: + corpus = pickle.load(fp, encoding="latin1") + # Save vocabulary and dataset cache as Dictionaries (should be better than pickles for the long-term) + pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' + VOCAB_NAME + print("Save vocabulary to {}".format(pytorch_vocab_dump_path)) + corpus_vocab_dict = corpus.vocab.__dict__ + torch.save(corpus_vocab_dict, pytorch_vocab_dump_path) + + corpus_dict_no_vocab = corpus.__dict__ + corpus_dict_no_vocab.pop('vocab', None) + pytorch_dataset_dump_path = pytorch_dump_folder_path + '/' + CORPUS_NAME + print("Save dataset to {}".format(pytorch_dataset_dump_path)) + torch.save(corpus_dict_no_vocab, pytorch_dataset_dump_path) + + if tf_checkpoint_path: + # Convert a pre-trained TensorFlow model + config_path = os.path.abspath(transfo_xl_config_file) + tf_path = os.path.abspath(tf_checkpoint_path) + + print("Converting Transformer XL checkpoint from {} with config at {}".format(tf_path, config_path)) + # Initialise PyTorch model + if transfo_xl_config_file == "": + config = TransfoXLConfig() + else: + config = TransfoXLConfig(transfo_xl_config_file) + print("Building PyTorch model from configuration: {}".format(str(config))) + model = TransfoXLLMHeadModel(config) + + model = load_tf_weights_in_transfo_xl(model, config, tf_path) + # Save pytorch-model + pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME) + pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME) + print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path))) + torch.save(model.state_dict(), pytorch_weights_dump_path) + print("Save configuration file to {}".format(os.path.abspath(pytorch_config_dump_path))) + with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: + f.write(config.to_json_string()) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--pytorch_dump_folder_path", + default = None, + type = str, + required = True, + help = "Path to the folder to store the PyTorch model or dataset/vocab.") + parser.add_argument("--tf_checkpoint_path", + default = "", + type = str, + help = "An optional path to a TensorFlow checkpoint path to be converted.") + parser.add_argument("--transfo_xl_config_file", + default = "", + type = str, + help = "An optional config json file corresponding to the pre-trained BERT model. 
\n" + "This specifies the model architecture.") + parser.add_argument("--transfo_xl_dataset_file", + default = "", + type = str, + help = "An optional dataset file to be converted in a vocabulary.") + args = parser.parse_args() + convert_transfo_xl_checkpoint_to_pytorch(args.tf_checkpoint_path, + args.transfo_xl_config_file, + args.pytorch_dump_folder_path, + args.transfo_xl_dataset_file) diff --git a/pytorch_pretrained_bert/file_utils.py b/pytorch_pretrained_bert/file_utils.py new file mode 100644 index 0000000..605c841 --- /dev/null +++ b/pytorch_pretrained_bert/file_utils.py @@ -0,0 +1,279 @@ +""" +Utilities for working with the local dataset cache. +This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp +Copyright by the AllenNLP authors. +""" +from __future__ import (absolute_import, division, print_function, unicode_literals) + +import sys +import json +import logging +import os +import shutil +import tempfile +import fnmatch +from functools import wraps +from hashlib import sha256 +import sys +from io import open + +import boto3 +import requests +from botocore.exceptions import ClientError +from tqdm import tqdm + +try: + from torch.hub import _get_torch_home + torch_cache_home = _get_torch_home() +except ImportError: + torch_cache_home = os.path.expanduser( + os.getenv('TORCH_HOME', os.path.join( + os.getenv('XDG_CACHE_HOME', '~/.cache'), 'torch'))) +default_cache_path = os.path.join(torch_cache_home, 'pytorch_pretrained_bert') + +try: + from urllib.parse import urlparse +except ImportError: + from urlparse import urlparse + +try: + from pathlib import Path + PYTORCH_PRETRAINED_BERT_CACHE = Path( + os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', default_cache_path)) +except (AttributeError, ImportError): + PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', + default_cache_path) + +CONFIG_NAME = "config.json" +WEIGHTS_NAME = "pytorch_model.bin" + +logger = logging.getLogger(__name__) # pylint: disable=invalid-name + + +def url_to_filename(url, etag=None): + """ + Convert `url` into a hashed filename in a repeatable way. + If `etag` is specified, append its hash to the url's, delimited + by a period. + """ + url_bytes = url.encode('utf-8') + url_hash = sha256(url_bytes) + filename = url_hash.hexdigest() + + if etag: + etag_bytes = etag.encode('utf-8') + etag_hash = sha256(etag_bytes) + filename += '.' + etag_hash.hexdigest() + + return filename + + +def filename_to_url(filename, cache_dir=None): + """ + Return the url and etag (which may be ``None``) stored for `filename`. + Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist. + """ + if cache_dir is None: + cache_dir = PYTORCH_PRETRAINED_BERT_CACHE + if sys.version_info[0] == 3 and isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + + cache_path = os.path.join(cache_dir, filename) + if not os.path.exists(cache_path): + raise EnvironmentError("file {} not found".format(cache_path)) + + meta_path = cache_path + '.json' + if not os.path.exists(meta_path): + raise EnvironmentError("file {} not found".format(meta_path)) + + with open(meta_path, encoding="utf-8") as meta_file: + metadata = json.load(meta_file) + url = metadata['url'] + etag = metadata['etag'] + + return url, etag + + +def cached_path(url_or_filename, cache_dir=None): + """ + Given something that might be a URL (or might be a local path), + determine which. If it's a URL, download the file and cache it, and + return the path to the cached file. 
If it's already a local path, + make sure the file exists and then return the path. + """ + if cache_dir is None: + cache_dir = PYTORCH_PRETRAINED_BERT_CACHE + if sys.version_info[0] == 3 and isinstance(url_or_filename, Path): + url_or_filename = str(url_or_filename) + if sys.version_info[0] == 3 and isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + + parsed = urlparse(url_or_filename) + + if parsed.scheme in ('http', 'https', 's3'): + # URL, so get it from the cache (downloading if necessary) + return get_from_cache(url_or_filename, cache_dir) + elif os.path.exists(url_or_filename): + # File, and it exists. + return url_or_filename + elif parsed.scheme == '': + # File, but it doesn't exist. + raise EnvironmentError("file {} not found".format(url_or_filename)) + else: + # Something unknown + raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename)) + + +def split_s3_path(url): + """Split a full s3 path into the bucket name and path.""" + parsed = urlparse(url) + if not parsed.netloc or not parsed.path: + raise ValueError("bad s3 path {}".format(url)) + bucket_name = parsed.netloc + s3_path = parsed.path + # Remove '/' at beginning of path. + if s3_path.startswith("/"): + s3_path = s3_path[1:] + return bucket_name, s3_path + + +def s3_request(func): + """ + Wrapper function for s3 requests in order to create more helpful error + messages. + """ + + @wraps(func) + def wrapper(url, *args, **kwargs): + try: + return func(url, *args, **kwargs) + except ClientError as exc: + if int(exc.response["Error"]["Code"]) == 404: + raise EnvironmentError("file {} not found".format(url)) + else: + raise + + return wrapper + + +@s3_request +def s3_etag(url): + """Check ETag on S3 object.""" + s3_resource = boto3.resource("s3") + bucket_name, s3_path = split_s3_path(url) + s3_object = s3_resource.Object(bucket_name, s3_path) + return s3_object.e_tag + + +@s3_request +def s3_get(url, temp_file): + """Pull a file directly from S3.""" + s3_resource = boto3.resource("s3") + bucket_name, s3_path = split_s3_path(url) + s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file) + + +def http_get(url, temp_file): + req = requests.get(url, stream=True) + content_length = req.headers.get('Content-Length') + total = int(content_length) if content_length is not None else None + progress = tqdm(unit="B", total=total) + for chunk in req.iter_content(chunk_size=1024): + if chunk: # filter out keep-alive new chunks + progress.update(len(chunk)) + temp_file.write(chunk) + progress.close() + + +def get_from_cache(url, cache_dir=None): + """ + Given a URL, look for the corresponding dataset in the local cache. + If it's not there, download it. Then return the path to the cached file. + """ + if cache_dir is None: + cache_dir = PYTORCH_PRETRAINED_BERT_CACHE + if sys.version_info[0] == 3 and isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + + if not os.path.exists(cache_dir): + os.makedirs(cache_dir) + + # Get eTag to add to filename, if it exists. 
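+    # Note: the ETag (when available) is hashed into the cache filename by
+    # url_to_filename(), so a changed remote file gets a fresh cache entry
+    # instead of silently reusing a stale one.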
+ if url.startswith("s3://"): + etag = s3_etag(url) + else: + try: + response = requests.head(url, allow_redirects=True) + if response.status_code != 200: + etag = None + else: + etag = response.headers.get("ETag") + except EnvironmentError: + etag = None + + if sys.version_info[0] == 2 and etag is not None: + etag = etag.decode('utf-8') + filename = url_to_filename(url, etag) + + # get cache path to put the file + cache_path = os.path.join(cache_dir, filename) + + # If we don't have a connection (etag is None) and can't identify the file + # try to get the last downloaded one + if not os.path.exists(cache_path) and etag is None: + matching_files = fnmatch.filter(os.listdir(cache_dir), filename + '.*') + matching_files = list(filter(lambda s: not s.endswith('.json'), matching_files)) + if matching_files: + cache_path = os.path.join(cache_dir, matching_files[-1]) + + if not os.path.exists(cache_path): + # Download to temporary file, then copy to cache dir once finished. + # Otherwise you get corrupt cache entries if the download gets interrupted. + with tempfile.NamedTemporaryFile() as temp_file: + logger.info("%s not found in cache, downloading to %s", url, temp_file.name) + + # GET file object + if url.startswith("s3://"): + s3_get(url, temp_file) + else: + http_get(url, temp_file) + + # we are copying the file before closing it, so flush to avoid truncation + temp_file.flush() + # shutil.copyfileobj() starts at the current position, so go to the start + temp_file.seek(0) + + logger.info("copying %s to cache at %s", temp_file.name, cache_path) + with open(cache_path, 'wb') as cache_file: + shutil.copyfileobj(temp_file, cache_file) + + logger.info("creating metadata file for %s", cache_path) + meta = {'url': url, 'etag': etag} + meta_path = cache_path + '.json' + with open(meta_path, 'w') as meta_file: + output_string = json.dumps(meta) + if sys.version_info[0] == 2 and isinstance(output_string, str): + output_string = unicode(output_string, 'utf-8') # The beauty of python 2 + meta_file.write(output_string) + + logger.info("removing temp file %s", temp_file.name) + + return cache_path + + +def read_set_from_file(filename): + ''' + Extract a de-duped collection (set) of text from a file. + Expected file format is one item per line. + ''' + collection = set() + with open(filename, 'r', encoding='utf-8') as file_: + for line in file_: + collection.add(line.rstrip()) + return collection + + +def get_file_extension(path, dot=True, lower=True): + ext = os.path.splitext(path)[1] + ext = ext if dot else ext[1:] + return ext.lower() if lower else ext diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py new file mode 100644 index 0000000..bdec14c --- /dev/null +++ b/pytorch_pretrained_bert/modeling.py @@ -0,0 +1,1623 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""PyTorch BERT model.""" + +from __future__ import absolute_import, division, print_function, unicode_literals + +import copy +import json +import logging +import math +import os +import sys +from io import open + +import torch +from torch import nn +from torch.nn import CrossEntropyLoss + +from .file_utils import cached_path, WEIGHTS_NAME, CONFIG_NAME +#from pytorch_pretrained_bert.modeling_openai import OpenAIGPTModel, OpenAIGPTLMHead +logger = logging.getLogger(__name__) + +PRETRAINED_MODEL_ARCHIVE_MAP = { + 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin", + 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-pytorch_model.bin", + 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin", + 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-pytorch_model.bin", + 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-pytorch_model.bin", + 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-pytorch_model.bin", + 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-pytorch_model.bin", + 'bert-base-german-cased': "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-pytorch_model.bin", + 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-pytorch_model.bin", + 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-pytorch_model.bin", + 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-pytorch_model.bin", + 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-pytorch_model.bin", + 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin", +} +PRETRAINED_CONFIG_ARCHIVE_MAP = { + 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json", + 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json", + 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json", + 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json", + 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json", + 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json", + 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json", + 'bert-base-german-cased': "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-config.json", + 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json", + 'bert-large-cased-whole-word-masking': 
"https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json", + 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json", + 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json", + 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json", +} +BERT_CONFIG_NAME = 'bert_config.json' +TF_WEIGHTS_NAME = 'model.ckpt' + + +def prune_linear_layer(layer, index, dim=0): + """ Prune a linear layer (a model parameters) to keep only entries in index. + Return the pruned layer as a new layer with requires_grad=True. + Used to remove heads. + """ + index = index.to(layer.weight.device) + W = layer.weight.index_select(dim, index).clone().detach() + if layer.bias is not None: + if dim == 1: + b = layer.bias.clone().detach() + else: + b = layer.bias[index].clone().detach() + new_size = list(layer.weight.size()) + new_size[dim] = len(index) + new_layer = nn.Linear(new_size[1], new_size[0], bias=layer.bias is not None).to(layer.weight.device) + new_layer.weight.requires_grad = False + new_layer.weight.copy_(W.contiguous()) + new_layer.weight.requires_grad = True + if layer.bias is not None: + new_layer.bias.requires_grad = False + new_layer.bias.copy_(b.contiguous()) + new_layer.bias.requires_grad = True + return new_layer + + +def load_tf_weights_in_bert(model, tf_checkpoint_path): + """ Load tf checkpoints in a pytorch model + """ + try: + import re + import numpy as np + import tensorflow as tf + except ImportError: + print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. 
Please see " + "https://www.tensorflow.org/install/ for installation instructions.") + raise + tf_path = os.path.abspath(tf_checkpoint_path) + print("Converting TensorFlow checkpoint from {}".format(tf_path)) + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + print("Loading TF weight {} with shape {}".format(name, shape)) + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array) + + for name, array in zip(names, arrays): + name = name.split('/') + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any(n in ["adam_v", "adam_m", "global_step"] for n in name): + print("Skipping {}".format("/".join(name))) + continue + pointer = model + for m_name in name: + if re.fullmatch(r'[A-Za-z]+_\d+', m_name): + l = re.split(r'_(\d+)', m_name) + else: + l = [m_name] + if l[0] == 'kernel' or l[0] == 'gamma': + pointer = getattr(pointer, 'weight') + elif l[0] == 'output_bias' or l[0] == 'beta': + pointer = getattr(pointer, 'bias') + elif l[0] == 'output_weights': + pointer = getattr(pointer, 'weight') + elif l[0] == 'squad': + pointer = getattr(pointer, 'classifier') + else: + try: + pointer = getattr(pointer, l[0]) + except AttributeError: + print("Skipping {}".format("/".join(name))) + continue + if len(l) >= 2: + num = int(l[1]) + pointer = pointer[num] + if m_name[-11:] == '_embeddings': + pointer = getattr(pointer, 'weight') + elif m_name == 'kernel': + array = np.transpose(array) + try: + assert pointer.shape == array.shape + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + print("Initialize PyTorch weight {}".format(name)) + pointer.data = torch.from_numpy(array) + return model + + +def gelu(x): + """Implementation of the gelu activation function. + For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): + 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + Also see https://arxiv.org/abs/1606.08415 + """ + return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) + + +def swish(x): + return x * torch.sigmoid(x) + + +ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish} + + +class BertConfig(object): + """Configuration class to store the configuration of a `BertModel`. + """ + + def __init__(self, + vocab_size_or_config_json_file, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12): + """Constructs BertConfig. + + Args: + vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`. + hidden_size: Size of the encoder layers and the pooler layer. + num_hidden_layers: Number of hidden layers in the Transformer encoder. + num_attention_heads: Number of attention heads for each attention layer in + the Transformer encoder. + intermediate_size: The size of the "intermediate" (i.e., feed-forward) + layer in the Transformer encoder. + hidden_act: The non-linear activation function (function or string) in the + encoder and pooler. If string, "gelu", "relu" and "swish" are supported. + hidden_dropout_prob: The dropout probabilitiy for all fully connected + layers in the embeddings, encoder, and pooler. 
+ attention_probs_dropout_prob: The dropout ratio for the attention + probabilities. + max_position_embeddings: The maximum sequence length that this model might + ever be used with. Typically set this to something large just in case + (e.g., 512 or 1024 or 2048). + type_vocab_size: The vocabulary size of the `token_type_ids` passed into + `BertModel`. + initializer_range: The sttdev of the truncated_normal_initializer for + initializing all weight matrices. + layer_norm_eps: The epsilon used by LayerNorm. + """ + if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 + and isinstance(vocab_size_or_config_json_file, unicode)): + with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif isinstance(vocab_size_or_config_json_file, int): + self.vocab_size = vocab_size_or_config_json_file + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + else: + raise ValueError("First argument must be either a vocabulary size (int)" + "or the path to a pretrained model config file (str)") + + @classmethod + def from_dict(cls, json_object): + """Constructs a `BertConfig` from a Python dictionary of parameters.""" + config = BertConfig(vocab_size_or_config_json_file=-1) + for key, value in json_object.items(): + config.__dict__[key] = value + return config + + @classmethod + def from_json_file(cls, json_file): + """Constructs a `BertConfig` from a json file of parameters.""" + with open(json_file, "r", encoding='utf-8') as reader: + text = reader.read() + return cls.from_dict(json.loads(text)) + + def __repr__(self): + return str(self.to_json_string()) + + def to_dict(self): + """Serializes this instance to a Python dictionary.""" + output = copy.deepcopy(self.__dict__) + return output + + def to_json_string(self): + """Serializes this instance to a JSON string.""" + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" + + def to_json_file(self, json_file_path): + """ Save this instance to a json file.""" + with open(json_file_path, "w", encoding='utf-8') as writer: + writer.write(self.to_json_string()) + + +try: + from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm +except ImportError: + logger.info("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .") + + + class BertLayerNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-12): + """Construct a layernorm module in the TF style (epsilon inside the square root). + """ + super(BertLayerNorm, self).__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.bias = nn.Parameter(torch.zeros(hidden_size)) + self.variance_epsilon = eps + + def forward(self, x): + u = x.mean(-1, keepdim=True) + s = (x - u).pow(2).mean(-1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.variance_epsilon) + return self.weight * x + self.bias + + +class BertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings. 
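+    The word, position and token-type embeddings are summed element-wise and then passed
+    through LayerNorm and dropout; position indices are generated internally as 0..seq_length-1.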
+ """ + + def __init__(self, config): + super(BertEmbeddings, self).__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, input_ids, token_type_ids=None): + seq_length = input_ids.size(1) + position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) + position_ids = position_ids.unsqueeze(0).expand_as(input_ids) + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + + words_embeddings = self.word_embeddings(input_ids) + position_embeddings = self.position_embeddings(position_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = words_embeddings + position_embeddings + token_type_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertSelfAttention(nn.Module): + def __init__(self, config, output_attentions=False, keep_multihead_output=False): + super(BertSelfAttention, self).__init__() + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.hidden_size, config.num_attention_heads)) + self.output_attentions = output_attentions + self.keep_multihead_output = keep_multihead_output + self.multihead_output = None + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward(self, hidden_states, attention_mask, head_mask=None): + mixed_query_layer = self.query(hidden_states) + mixed_key_layer = self.key(hidden_states) + mixed_value_layer = self.value(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + key_layer = self.transpose_for_scores(mixed_key_layer) + value_layer = self.transpose_for_scores(mixed_value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
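+        # At this point attention_probs = softmax(Q.K^T / sqrt(d_head) + mask); the
+        # dropout below zeroes entire attention weights rather than individual features.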
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + if self.keep_multihead_output: + self.multihead_output = context_layer + self.multihead_output.retain_grad() + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + if self.output_attentions: + return attention_probs, context_layer + return context_layer + + +class BertSelfOutput(nn.Module): + def __init__(self, config): + super(BertSelfOutput, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertAttention(nn.Module): + def __init__(self, config, output_attentions=False, keep_multihead_output=False): + super(BertAttention, self).__init__() + self.output_attentions = output_attentions + self.self = BertSelfAttention(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) + self.output = BertSelfOutput(config) + + def prune_heads(self, heads): + if len(heads) == 0: + return + mask = torch.ones(self.self.num_attention_heads, self.self.attention_head_size) + for head in heads: + mask[head] = 0 + mask = mask.view(-1).contiguous().eq(1) + index = torch.arange(len(mask))[mask].long() + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + # Update hyper params + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + + def forward(self, input_tensor, attention_mask, head_mask=None): + self_output = self.self(input_tensor, attention_mask, head_mask) + if self.output_attentions: + attentions, self_output = self_output + attention_output = self.output(self_output, input_tensor) + if self.output_attentions: + return attentions, attention_output + return attention_output + + +class BertIntermediate(nn.Module): + def __init__(self, config): + super(BertIntermediate, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Module): + def __init__(self, config): + super(BertOutput, self).__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, 
hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertLayer(nn.Module): + def __init__(self, config, output_attentions=False, keep_multihead_output=False): + super(BertLayer, self).__init__() + self.output_attentions = output_attentions + self.attention = BertAttention(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward(self, hidden_states, attention_mask, head_mask=None): + attention_output = self.attention(hidden_states, attention_mask, head_mask) + if self.output_attentions: + attentions, attention_output = attention_output + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + if self.output_attentions: + return attentions, layer_output + return layer_output + + +class BertEncoder(nn.Module): + def __init__(self, config, output_attentions=False, keep_multihead_output=False): + super(BertEncoder, self).__init__() + self.output_attentions = output_attentions + layer = BertLayer(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) + self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)]) + + def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True, head_mask=None): + all_encoder_layers = [] + all_attentions = [] + for i, layer_module in enumerate(self.layer): + hidden_states = layer_module(hidden_states, attention_mask, head_mask[i]) + if self.output_attentions: + attentions, hidden_states = hidden_states + all_attentions.append(attentions) + if output_all_encoded_layers: + all_encoder_layers.append(hidden_states) + if not output_all_encoded_layers: + all_encoder_layers.append(hidden_states) + if self.output_attentions: + return all_attentions, all_encoder_layers + return all_encoder_layers + + +class BertPooler(nn.Module): + def __init__(self, config): + super(BertPooler, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. 
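+        # (i.e. the [CLS] token), then projecting it through a dense layer with a tanh activation.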
+ first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertPredictionHeadTransform(nn.Module): + def __init__(self, config): + super(BertPredictionHeadTransform, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class BertLMPredictionHead(nn.Module): + def __init__(self, config, bert_model_embedding_weights): + super(BertLMPredictionHead, self).__init__() + self.transform = BertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear(bert_model_embedding_weights.size(1), + bert_model_embedding_weights.size(0), + bias=False) + self.decoder.weight = bert_model_embedding_weights + self.bias = nn.Parameter(torch.zeros(bert_model_embedding_weights.size(0))) + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + self.bias + return hidden_states + + +class BertOnlyMLMHead(nn.Module): + def __init__(self, config, bert_model_embedding_weights): + super(BertOnlyMLMHead, self).__init__() + self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class BertOnlyNSPHead(nn.Module): + def __init__(self, config): + super(BertOnlyNSPHead, self).__init__() + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + return seq_relationship_score + + +class BertPreTrainingHeads(nn.Module): + def __init__(self, config, bert_model_embedding_weights): + super(BertPreTrainingHeads, self).__init__() + self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +class BertPreTrainedModel(nn.Module): + """ An abstract class to handle weights initialization and + a simple interface for dowloading and loading pretrained models. + """ + + def __init__(self, config, *inputs, **kwargs): + super(BertPreTrainedModel, self).__init__() + if not isinstance(config, BertConfig): + raise ValueError( + "Parameter config in `{}(config)` should be an instance of class `BertConfig`. " + "To create a model from a Google pretrained model use " + "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( + self.__class__.__name__, self.__class__.__name__ + )) + self.config = config + + def init_bert_weights(self, module): + """ Initialize the weights. 
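+        Linear and Embedding weights are drawn from a normal distribution with standard
+        deviation config.initializer_range; LayerNorm weights are set to 1.0 and all biases to 0.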
+ """ + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + elif isinstance(module, BertLayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): + """ + Instantiate a BertPreTrainedModel from a pre-trained model file or a pytorch state dict. + Download and cache the pre-trained model file if needed. + + Params: + pretrained_model_name_or_path: either: + - a str with the name of a pre-trained model to load selected in the list of: + . `bert-base-uncased` + . `bert-large-uncased` + . `bert-base-cased` + . `bert-large-cased` + . `bert-base-multilingual-uncased` + . `bert-base-multilingual-cased` + . `bert-base-chinese` + . `bert-base-german-cased` + . `bert-large-uncased-whole-word-masking` + . `bert-large-cased-whole-word-masking` + - a path or url to a pretrained model archive containing: + . `bert_config.json` a configuration file for the model + . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance + - a path or url to a pretrained model archive containing: + . `bert_config.json` a configuration file for the model + . `model.chkpt` a TensorFlow checkpoint + from_tf: should we load the weights from a locally saved TensorFlow checkpoint + cache_dir: an optional path to a folder in which the pre-trained models will be cached. + state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models + *inputs, **kwargs: additional input for the specific Bert class + (ex: num_labels for BertForSequenceClassification) + """ + state_dict = kwargs.get('state_dict', None) + kwargs.pop('state_dict', None) + cache_dir = kwargs.get('cache_dir', None) + kwargs.pop('cache_dir', None) + from_tf = kwargs.get('from_tf', False) + kwargs.pop('from_tf', None) + + if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP: + archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path] + config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path] + else: + if from_tf: + # Directly load from a TensorFlow checkpoint + archive_file = os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME) + config_file = os.path.join(pretrained_model_name_or_path, BERT_CONFIG_NAME) + else: + archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) + config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME) + # redirect to the cache, if necessary + try: + resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir) + except EnvironmentError: + if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP: + logger.error( + "Couldn't reach server at '{}' to download pretrained weights.".format( + archive_file)) + else: + logger.error( + "Model name '{}' was not found in model name list ({}). 
" + "We assumed '{}' was a path or url but couldn't find any file " + "associated to this path or url.".format( + pretrained_model_name_or_path, + ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), + archive_file)) + return None + try: + resolved_config_file = cached_path(config_file, cache_dir=cache_dir) + except EnvironmentError: + if pretrained_model_name_or_path in PRETRAINED_CONFIG_ARCHIVE_MAP: + logger.error( + "Couldn't reach server at '{}' to download pretrained model configuration file.".format( + config_file)) + else: + logger.error( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url but couldn't find any file " + "associated to this path or url.".format( + pretrained_model_name_or_path, + ', '.join(PRETRAINED_CONFIG_ARCHIVE_MAP.keys()), + config_file)) + return None + if resolved_archive_file == archive_file and resolved_config_file == config_file: + logger.info("loading weights file {}".format(archive_file)) + logger.info("loading configuration file {}".format(config_file)) + else: + logger.info("loading weights file {} from cache at {}".format( + archive_file, resolved_archive_file)) + logger.info("loading configuration file {} from cache at {}".format( + config_file, resolved_config_file)) + # Load config + config = BertConfig.from_json_file(resolved_config_file) + logger.info("Model config {}".format(config)) + # Instantiate model. + model = cls(config, *inputs, **kwargs) + if state_dict is None and not from_tf: + state_dict = torch.load(resolved_archive_file, map_location='cpu') + if from_tf: + # Directly load from a TensorFlow checkpoint + return load_tf_weights_in_bert(model, weights_path) + # Load from a PyTorch state_dict + old_keys = [] + new_keys = [] + for key in state_dict.keys(): + new_key = None + if 'gamma' in key: + new_key = key.replace('gamma', 'weight') + if 'beta' in key: + new_key = key.replace('beta', 'bias') + if new_key: + old_keys.append(key) + new_keys.append(new_key) + for old_key, new_key in zip(old_keys, new_keys): + state_dict[new_key] = state_dict.pop(old_key) + + missing_keys = [] + unexpected_keys = [] + error_msgs = [] + # copy state_dict so _load_from_state_dict can modify it + metadata = getattr(state_dict, '_metadata', None) + state_dict = state_dict.copy() + if metadata is not None: + state_dict._metadata = metadata + + def load(module, prefix=''): + local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) + module._load_from_state_dict( + state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs) + for name, child in module._modules.items(): + if child is not None: + load(child, prefix + name + '.') + + start_prefix = '' + if not hasattr(model, 'bert') and any(s.startswith('bert.') for s in state_dict.keys()): + start_prefix = 'bert.' + load(model, prefix=start_prefix) + if len(missing_keys) > 0: + logger.info("Weights of {} not initialized from pretrained model: {}".format( + model.__class__.__name__, missing_keys)) + if len(unexpected_keys) > 0: + logger.info("Weights from pretrained model not used in {}: {}".format( + model.__class__.__name__, unexpected_keys)) + if len(error_msgs) > 0: + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + model.__class__.__name__, "\n\t".join(error_msgs))) + return model + + +class BertModel(BertPreTrainedModel): + """BERT model ("Bidirectional Embedding Representations from a Transformer"). 
+ + Params: + `config`: a BertConfig class instance with the configuration to build a new model + `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False + `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. + This can be used to compute head importance metrics. Default: False + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`. + `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. + It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. + + + Outputs: Tuple of (encoded_layers, pooled_output) + `encoded_layers`: controled by `output_all_encoded_layers` argument: + - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end + of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each + encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size], + - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding + to the last attention block of shape [batch_size, sequence_length, hidden_size], + `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a + classifier pretrained on top of the hidden state associated to the first character of the + input (`CLS`) to train on the Next-Sentence task (see BERT's paper). + + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + model = modeling.BertModel(config=config) + all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config, output_attentions=False, keep_multihead_output=False): + super(BertModel, self).__init__(config) + self.output_attentions = output_attentions + self.embeddings = BertEmbeddings(config) + self.encoder = BertEncoder(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) + self.pooler = BertPooler(config) + self.apply(self.init_bert_weights) + + def prune_heads(self, heads_to_prune): + """ Prunes heads of the model. 
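+        Pruning removes the selected heads from the query/key/value projections and from the
+        attention output projection of the given layers, so the change is permanent for this
+        model instance (e.g. a hypothetical {0: [1, 2]} drops heads 1 and 2 of the first layer).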
+ heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def get_multihead_outputs(self): + """ Gather all multi-head outputs. + Return: list (layers) of multihead module outputs with gradients + """ + return [layer.attention.self.multihead_output for layer in self.encoder.layer] + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True, + head_mask=None): + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + if head_mask is not None: + if head_mask.dim() == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + head_mask = head_mask.expand_as(self.config.num_hidden_layers, -1, -1, -1, -1) + elif head_mask.dim() == 2: + head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze( + -1) # We can specify head_mask for each layer + head_mask = head_mask.to( + dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + else: + head_mask = [None] * self.config.num_hidden_layers + + embedding_output = self.embeddings(input_ids, token_type_ids) + encoded_layers = self.encoder(embedding_output, + extended_attention_mask, + output_all_encoded_layers=output_all_encoded_layers, + head_mask=head_mask) + if self.output_attentions: + all_attentions, encoded_layers = encoded_layers + sequence_output = encoded_layers[-1] + pooled_output = self.pooler(sequence_output) + if not output_all_encoded_layers: + encoded_layers = encoded_layers[-1] + if self.output_attentions: + return all_attentions, encoded_layers, pooled_output + return encoded_layers, pooled_output + + +class BertForPreTraining(BertPreTrainedModel): + """BERT model with pre-training heads. + This module comprises the BERT model followed by the two pre-training heads: + - the masked language modeling head, and + - the next sentence classification head. + + Params: + `config`: a BertConfig class instance with the configuration to build a new model + `output_attentions`: If True, also output attentions weights computed by the model at each layer. 
Default: False + `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. + This can be used to compute head importance metrics. Default: False + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `masked_lm_labels`: optional masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] + with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss + is only computed for the labels set in [0, ..., vocab_size] + `next_sentence_label`: optional next sentence classification loss: torch.LongTensor of shape [batch_size] + with indices selected in [0, 1]. + 0 => next sentence is the continuation, 1 => next sentence is a random sentence. + `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. + It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. + + Outputs: + if `masked_lm_labels` and `next_sentence_label` are not `None`: + Outputs the total_loss which is the sum of the masked language modeling loss and the next + sentence classification loss. + if `masked_lm_labels` or `next_sentence_label` is `None`: + Outputs a tuple comprising + - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and + - the next sentence classification logits of shape [batch_size, 2]. 
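+    When both `masked_lm_labels` and `next_sentence_label` are supplied, the forward pass
+    returns the summed loss instead of the two logit tensors; a minimal sketch, reusing the
+    tensors from the example below with hypothetical label values:
+    ```python
+    masked_lm_labels = torch.LongTensor([[-1, -1, 17], [-1, 9, -1]])  # -1 positions are ignored
+    next_sentence_label = torch.LongTensor([0, 1])
+    total_loss = model(input_ids, token_type_ids, input_mask,
+                       masked_lm_labels, next_sentence_label)
+    ```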
+ + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + model = BertForPreTraining(config) + masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config, output_attentions=False, keep_multihead_output=False): + super(BertForPreTraining, self).__init__(config) + self.output_attentions = output_attentions + self.bert = BertModel(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) + self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, + next_sentence_label=None, head_mask=None): + outputs = self.bert(input_ids, token_type_ids, attention_mask, + output_all_encoded_layers=False, head_mask=head_mask) + if self.output_attentions: + all_attentions, sequence_output, pooled_output = outputs + else: + sequence_output, pooled_output = outputs + prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) + + if masked_lm_labels is not None and next_sentence_label is not None: + loss_fct = CrossEntropyLoss(ignore_index=-1) + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) + next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) + total_loss = masked_lm_loss + next_sentence_loss + return total_loss + elif self.output_attentions: + return all_attentions, prediction_scores, seq_relationship_score + return prediction_scores, seq_relationship_score + + +class BertForMaskedLM(BertPreTrainedModel): + """BERT model with the masked language modeling head. + This module comprises the BERT model followed by the masked language modeling head. + + Params: + `config`: a BertConfig class instance with the configuration to build a new model + `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False + `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. + This can be used to compute head importance metrics. Default: False + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. 
+ `masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] + with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss + is only computed for the labels set in [0, ..., vocab_size] + `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. + It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. + + Outputs: + if `masked_lm_labels` is not `None`: + Outputs the masked language modeling loss. + if `masked_lm_labels` is `None`: + Outputs the masked language modeling logits of shape [batch_size, sequence_length, vocab_size]. + + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + model = BertForMaskedLM(config) + masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config, output_attentions=False, keep_multihead_output=False): + super(BertForMaskedLM, self).__init__(config) + self.output_attentions = output_attentions + self.bert = BertModel(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) + self.cls = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, head_mask=None): + #todo added by rooh + input_shape = input_ids.size() # (B, C, F) + input_ids = input_ids.view(-1, input_ids.size(-1)) + token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + #todo + outputs = self.bert(input_ids, token_type_ids, attention_mask, + output_all_encoded_layers=False, + head_mask=head_mask) + if self.output_attentions: + all_attentions, sequence_output, _ = outputs + else: + sequence_output, _ = outputs + prediction_scores = self.cls(sequence_output) + + if masked_lm_labels is not None: + # #todo + # prediction_scores = prediction_scores[..., :-1, :].contiguous() + # masked_lm_labels = masked_lm_labels[..., 1:].contiguous() + # #todo + loss_fct = CrossEntropyLoss(ignore_index=-1) + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) + return masked_lm_loss + elif self.output_attentions: + return all_attentions, prediction_scores + return prediction_scores + + +class BertForNextSentencePrediction(BertPreTrainedModel): + """BERT model with next sentence prediction head. + This module comprises the BERT model followed by the next sentence classification head. + + Params: + `config`: a BertConfig class instance with the configuration to build a new model + `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False + `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. + This can be used to compute head importance metrics. 
Default: False + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size] + with indices selected in [0, 1]. + 0 => next sentence is the continuation, 1 => next sentence is a random sentence. + `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. + It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. + + Outputs: + if `next_sentence_label` is not `None`: + Outputs the total_loss which is the sum of the masked language modeling loss and the next + sentence classification loss. + if `next_sentence_label` is `None`: + Outputs the next sentence classification logits of shape [batch_size, 2]. + + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + model = BertForNextSentencePrediction(config) + seq_relationship_logits = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config, output_attentions=False, keep_multihead_output=False): + super(BertForNextSentencePrediction, self).__init__(config) + self.output_attentions = output_attentions + self.bert = BertModel(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) + self.cls = BertOnlyNSPHead(config) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None, head_mask=None): + outputs = self.bert(input_ids, token_type_ids, attention_mask, + output_all_encoded_layers=False, + head_mask=head_mask) + if self.output_attentions: + all_attentions, _, pooled_output = outputs + else: + _, pooled_output = outputs + seq_relationship_score = self.cls(pooled_output) + + if next_sentence_label is not None: + loss_fct = CrossEntropyLoss(ignore_index=-1) + next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) + return next_sentence_loss + elif self.output_attentions: + return all_attentions, seq_relationship_score + return seq_relationship_score + + +class BertForSequenceClassification(BertPreTrainedModel): + """BERT model for classification. + This module is composed of the BERT model with a linear layer on top of + the pooled output. 
+ + Params: + `config`: a BertConfig class instance with the configuration to build a new model + `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False + `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. + This can be used to compute head importance metrics. Default: False + `num_labels`: the number of classes for the classifier. Default = 2. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary. Items in the batch should begin with the special "CLS" token. (see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `labels`: labels for the classification output: torch.LongTensor of shape [batch_size] + with indices selected in [0, ..., num_labels]. + `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. + It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. + + Outputs: + if `labels` is not `None`: + Outputs the CrossEntropy classification loss of the output with the labels. + if `labels` is `None`: + Outputs the classification logits of shape [batch_size, num_labels]. 
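+    Fine-tuning note: passing `labels` returns the scalar cross-entropy loss instead of the
+    logits. A minimal sketch, reusing the tensors and model from the usage example below
+    (the label values are illustrative):
+    ```python
+    labels = torch.LongTensor([1, 0])   # one class index per example in the batch
+    loss = model(input_ids, token_type_ids, input_mask, labels=labels)
+    loss.backward()
+    ```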
+ + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + num_labels = 2 + + model = BertForSequenceClassification(config, num_labels) + logits = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config, num_labels=2, output_attentions=False, keep_multihead_output=False): + super(BertForSequenceClassification, self).__init__(config) + self.output_attentions = output_attentions + self.num_labels = num_labels + self.bert = BertModel(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, num_labels) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None): + outputs = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False, + head_mask=head_mask) + if self.output_attentions: + all_attentions, _, pooled_output = outputs + else: + _, pooled_output = outputs + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + return loss + elif self.output_attentions: + return all_attentions, logits + return logits + + +class BertForMultipleChoice(BertPreTrainedModel): + """BERT model for multiple choice tasks. + This module is composed of the BERT model with a linear layer on top of + the pooled output. + + Params: + `config`: a BertConfig class instance with the configuration to build a new model + `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False + `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. + This can be used to compute head importance metrics. Default: False + `num_choices`: the number of classes for the classifier. Default = 2. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length] + with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` + and type 1 corresponds to a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `labels`: labels for the classification output: torch.LongTensor of shape [batch_size] + with indices selected in [0, ..., num_choices]. + `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. 
+ It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. + + Outputs: + if `labels` is not `None`: + Outputs the CrossEntropy classification loss of the output with the labels. + if `labels` is `None`: + Outputs the classification logits of shape [batch_size, num_labels]. + + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]], [[12, 16, 42], [14, 28, 57]]]) + input_mask = torch.LongTensor([[[1, 1, 1], [1, 1, 0]],[[1,1,0], [1, 0, 0]]]) + token_type_ids = torch.LongTensor([[[0, 0, 1], [0, 1, 0]],[[0, 1, 1], [0, 0, 1]]]) + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + num_choices = 2 + + model = BertForMultipleChoice(config, num_choices) + logits = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config, num_choices=2, output_attentions=False, keep_multihead_output=False): + super(BertForMultipleChoice, self).__init__(config) + self.output_attentions = output_attentions + self.num_choices = num_choices + self.bert = BertModel(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, 1) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None): + flat_input_ids = input_ids.view(-1, input_ids.size(-1)) + flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + outputs = self.bert(flat_input_ids, flat_token_type_ids, flat_attention_mask, output_all_encoded_layers=False, + head_mask=head_mask) + if self.output_attentions: + all_attentions, _, pooled_output = outputs + else: + _, pooled_output = outputs + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, self.num_choices) + + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + return loss + elif self.output_attentions: + return all_attentions, reshaped_logits + return reshaped_logits + + +class BertMultipleChoice(BertPreTrainedModel): + + def __init__(self, config, num_choices=2, output_attentions=False, keep_multihead_output=False): + super(BertMultipleChoice, self).__init__(config) + self.output_attentions = output_attentions + self.num_choices = num_choices + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, 1) + self.apply(self.init_bert_weights) + + def forward(self, pooled_output, num_choices=2, labels=None, head_mask=None): + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + return loss + return reshaped_logits + + +class BertForTokenClassification(BertPreTrainedModel): + """BERT model for token-level classification. + This module is composed of the BERT model with a linear layer on top of + the full hidden state of the last layer. 
+ + Params: + `config`: a BertConfig class instance with the configuration to build a new model + `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False + `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. + This can be used to compute head importance metrics. Default: False + `num_labels`: the number of classes for the classifier. Default = 2. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `labels`: labels for the classification output: torch.LongTensor of shape [batch_size, sequence_length] + with indices selected in [0, ..., num_labels]. + `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. + It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. + + Outputs: + if `labels` is not `None`: + Outputs the CrossEntropy classification loss of the output with the labels. + if `labels` is `None`: + Outputs the classification logits of shape [batch_size, sequence_length, num_labels]. 
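+    Fine-tuning note: when both `labels` and `attention_mask` are given, the cross-entropy
+    loss is computed only over positions where the mask is 1, so padded tokens are ignored.
+    A minimal sketch, reusing the tensors and model from the usage example below (the
+    per-token label ids are illustrative):
+    ```python
+    labels = torch.LongTensor([[0, 1, 1], [1, 0, 0]])   # one label id per token
+    loss = model(input_ids, token_type_ids, input_mask, labels=labels)
+    loss.backward()
+    ```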
+ + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + num_labels = 2 + + model = BertForTokenClassification(config, num_labels) + logits = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config, num_labels=2, output_attentions=False, keep_multihead_output=False): + super(BertForTokenClassification, self).__init__(config) + self.output_attentions = output_attentions + self.num_labels = num_labels + self.bert = BertModel(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, num_labels) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None): + outputs = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False, + head_mask=head_mask) + if self.output_attentions: + all_attentions, sequence_output, _ = outputs + else: + sequence_output, _ = outputs + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + if labels is not None: + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels)[active_loss] + active_labels = labels.view(-1)[active_loss] + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + return loss + elif self.output_attentions: + return all_attentions, logits + return logits + + +class BertForQuestionAnswering(BertPreTrainedModel): + """BERT model for Question Answering (span extraction). + This module is composed of the BERT model with a linear layer on top of + the sequence output that computes start_logits and end_logits + + Params: + `config`: a BertConfig class instance with the configuration to build a new model + `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False + `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. + This can be used to compute head importance metrics. Default: False + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. 
+ `start_positions`: position of the first token for the labeled span: torch.LongTensor of shape [batch_size]. + Positions are clamped to the length of the sequence and position outside of the sequence are not taken + into account for computing the loss. + `end_positions`: position of the last token for the labeled span: torch.LongTensor of shape [batch_size]. + Positions are clamped to the length of the sequence and position outside of the sequence are not taken + into account for computing the loss. + `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. + It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. + + Outputs: + if `start_positions` and `end_positions` are not `None`: + Outputs the total_loss which is the sum of the CrossEntropy loss for the start and end token positions. + if `start_positions` or `end_positions` is `None`: + Outputs a tuple of start_logits, end_logits which are the logits respectively for the start and end + position tokens of shape [batch_size, sequence_length]. + + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + model = BertForQuestionAnswering(config) + start_logits, end_logits = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config, output_attentions=False, keep_multihead_output=False): + super(BertForQuestionAnswering, self).__init__(config) + self.output_attentions = output_attentions + self.bert = BertModel(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) + self.qa_outputs = nn.Linear(config.hidden_size, 2) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None, + end_positions=None, head_mask=None): + outputs = self.bert(input_ids, token_type_ids, attention_mask, + output_all_encoded_layers=False, + head_mask=head_mask) + if self.output_attentions: + all_attentions, sequence_output, _ = outputs + else: + sequence_output, _ = outputs + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions.clamp_(0, ignored_index) + end_positions.clamp_(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + return total_loss + elif self.output_attentions: + return all_attentions, start_logits, end_logits + return start_logits, end_logits + + +class BertLMHeadModel(BertPreTrainedModel): + def __init__(self, config, 
output_attentions=False): + super(BertLMHeadModel, self).__init__(config) + self.bert = BertModel(config, output_attentions=output_attentions) + self.lm_head = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, input_mask=None, lm_labels=None, token_type_ids=None, position_ids=None): + input_shape = input_ids.size() # (B, C, F) + flat_input_ids = input_ids.view(-1, input_ids.size(-1)) + flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + flat_attention_mask = input_mask.view(-1, input_mask.size(-1)) if input_mask is not None else None + hidden_states, pooled_output = self.bert(flat_input_ids, flat_token_type_ids, flat_attention_mask, + output_all_encoded_layers=False) + + lm_logits = self.lm_head(hidden_states) + + if lm_labels is not None: + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = lm_labels[..., 1:].contiguous() + loss_fct = CrossEntropyLoss(ignore_index=-1) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + return loss + + return lm_logits + + +class BertDoubleHeadsModel(BertPreTrainedModel): + def __init__(self, config, output_attentions=False): + super(BertDoubleHeadsModel, self).__init__(config) + self.bert = BertModel(config, output_attentions=output_attentions) + self.lm_head = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight) + self.multiple_choice_head = BertMultipleChoice(config) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, mc_token_ids, input_mask=None, lm_labels=None, mc_labels=None, token_type_ids=None, + position_ids=None): + input_shape = input_ids.size() # (B, C, F) + flat_input_ids = input_ids.view(-1, input_ids.size(-1)) + flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + flat_attention_mask = input_mask.view(-1, input_mask.size(-1)) if input_mask is not None else None + hidden_states, pooled_output = self.bert(flat_input_ids, flat_token_type_ids, flat_attention_mask, + output_all_encoded_layers=False) + + num_choices = input_shape[1] + output_shape = (input_shape) + (hidden_states.size(-1),) + hidden_states = hidden_states.view(*output_shape) + + lm_logits = self.lm_head(hidden_states) + mc_logits = self.multiple_choice_head(pooled_output, num_choices=num_choices) + losses = [] + if lm_labels is not None: + #bert is not a causal language model so the lm loss can't be defined. 
But I used it + # and for now it works pretty well + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = lm_labels[..., 1:].contiguous() + loss_fct = CrossEntropyLoss(ignore_index=-1) + losses.append(loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))) + if mc_labels is not None: + loss_fct = CrossEntropyLoss() + losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))) + if losses: + return losses + return lm_logits, mc_logits + + +# class BertOpenAIDoubleHeadsModel(BertPreTrainedModel): +# def __init__(self, config, output_attentions=False): +# super(BertOpenAIDoubleHeadsModel, self).__init__(config) +# self.bert = BertModel(config, output_attentions=output_attentions) +# self.transformer = OpenAIGPTModel(config, output_attentions=output_attentions) +# self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config) +# self.multiple_choice_head = BertMultipleChoice(config) +# self.apply(self.init_bert_weights) +# +# def forward(self, input_ids, mc_token_ids, input_mask=None, lm_labels=None, mc_labels=None, token_type_ids=None, +# position_ids=None): +# input_shape = input_ids.size() # (B, C, F) +# flat_input_ids = input_ids.view(-1, input_ids.size(-1)) +# flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None +# flat_attention_mask = input_mask.view(-1, input_mask.size(-1)) if input_mask is not None else None +# bert_hidden_states, pooled_output = self.bert(flat_input_ids, flat_token_type_ids, flat_attention_mask, +# output_all_encoded_layers=False) +# +# transformer_hidden_states = self.transformer(input_ids, position_ids, token_type_ids) +# +# num_choices = input_shape[1] +# # output_shape = (input_shape) + (hidden_states.size(-1),) +# # hidden_states = hidden_states.view(*output_shape) +# +# lm_logits = self.lm_head(transformer_hidden_states) +# mc_logits = self.multiple_choice_head(pooled_output, num_choices=num_choices) +# losses = [] +# if lm_labels is not None: +# shift_logits = lm_logits[..., :-1, :].contiguous() +# shift_labels = lm_labels[..., 1:].contiguous() +# loss_fct = CrossEntropyLoss(ignore_index=-1) +# losses.append(loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))) +# if mc_labels is not None: +# loss_fct = CrossEntropyLoss() +# losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))) +# if losses: +# return losses +# return lm_logits, mc_logits \ No newline at end of file diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py new file mode 100644 index 0000000..3d227a3 --- /dev/null +++ b/pytorch_pretrained_bert/modeling_gpt2.py @@ -0,0 +1,821 @@ +# coding=utf-8 +# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""PyTorch OpenAI GPT-2 model.""" + +from __future__ import absolute_import, division, print_function, unicode_literals + +import collections +import copy +import json +import logging +import math +import os +import shutil +import tarfile +import tempfile +import sys +from io import open + +import torch +import torch.nn as nn +from torch.nn import CrossEntropyLoss +from torch.nn.parameter import Parameter + +from .file_utils import cached_path, CONFIG_NAME, WEIGHTS_NAME +from .modeling import BertLayerNorm as LayerNorm + +logger = logging.getLogger(__name__) + +PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin", + "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin"} +PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json", + "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json"} + +def load_tf_weights_in_gpt2(model, gpt2_checkpoint_path): + """ Load tf checkpoints in a pytorch model + """ + try: + import re + import numpy as np + import tensorflow as tf + except ImportError: + print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions.") + raise + tf_path = os.path.abspath(gpt2_checkpoint_path) + print("Converting TensorFlow checkpoint from {}".format(tf_path)) + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + print("Loading TF weight {} with shape {}".format(name, shape)) + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array.squeeze()) + + for name, array in zip(names, arrays): + name = name[6:] # skip "model/" + name = name.split('/') + pointer = model + for m_name in name: + if re.fullmatch(r'[A-Za-z]+\d+', m_name): + l = re.split(r'(\d+)', m_name) + else: + l = [m_name] + if l[0] == 'w' or l[0] == 'g': + pointer = getattr(pointer, 'weight') + elif l[0] == 'b': + pointer = getattr(pointer, 'bias') + elif l[0] == 'wpe' or l[0] == 'wte': + pointer = getattr(pointer, l[0]) + pointer = getattr(pointer, 'weight') + else: + pointer = getattr(pointer, l[0]) + if len(l) >= 2: + num = int(l[1]) + pointer = pointer[num] + try: + assert pointer.shape == array.shape + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + print("Initialize PyTorch weight {}".format(name)) + pointer.data = torch.from_numpy(array) + return model + + +def gelu(x): + return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + + +class GPT2Config(object): + """Configuration class to store the configuration of a `GPT2Model`. + """ + + def __init__( + self, + vocab_size_or_config_json_file=50257, + n_special=0, + n_positions=1024, + n_ctx=1024, + n_embd=768, + n_layer=12, + n_head=12, + resid_pdrop=0.1, + embd_pdrop=0.1, + attn_pdrop=0.1, + layer_norm_epsilon=1e-5, + initializer_range=0.02, + predict_special_tokens=True + ): + """Constructs GPT2Config. + + Args: + vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file. + n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...) + n_positions: Number of positional embeddings. + n_ctx: Size of the causal mask (usually same as n_positions). 
+ n_embd: Dimensionality of the embeddings and hidden states. + n_layer: Number of hidden layers in the Transformer encoder. + n_head: Number of attention heads for each attention layer in + the Transformer encoder. + layer_norm_epsilon: epsilon to use in the layer norm layers + resid_pdrop: The dropout probabilitiy for all fully connected + layers in the embeddings, encoder, and pooler. + attn_pdrop: The dropout ratio for the attention + probabilities. + embd_pdrop: The dropout ratio for the embeddings. + initializer_range: The sttdev of the truncated_normal_initializer for + initializing all weight matrices. + predict_special_tokens: should we predict special tokens (when the model has a LM head) + """ + if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 + and isinstance(vocab_size_or_config_json_file, unicode)): + with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif isinstance(vocab_size_or_config_json_file, int): + self.vocab_size = vocab_size_or_config_json_file + self.n_special = n_special + self.n_ctx = n_ctx + self.n_positions = n_positions + self.n_embd = n_embd + self.n_layer = n_layer + self.n_head = n_head + self.resid_pdrop = resid_pdrop + self.embd_pdrop = embd_pdrop + self.attn_pdrop = attn_pdrop + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + self.predict_special_tokens = predict_special_tokens + else: + raise ValueError( + "First argument must be either a vocabulary size (int)" + "or the path to a pretrained model config file (str)" + ) + + @property + def total_tokens_embeddings(self): + return self.vocab_size + self.n_special + + @classmethod + def from_dict(cls, json_object): + """Constructs a `GPT2Config` from a Python dictionary of parameters.""" + config = GPT2Config(vocab_size_or_config_json_file=-1) + for key, value in json_object.items(): + config.__dict__[key] = value + return config + + @classmethod + def from_json_file(cls, json_file): + """Constructs a `GPT2Config` from a json file of parameters.""" + with open(json_file, "r", encoding="utf-8") as reader: + text = reader.read() + return cls.from_dict(json.loads(text)) + + def __repr__(self): + return str(self.to_json_string()) + + def to_dict(self): + """Serializes this instance to a Python dictionary.""" + output = copy.deepcopy(self.__dict__) + return output + + def to_json_string(self): + """Serializes this instance to a JSON string.""" + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" + + def to_json_file(self, json_file_path): + """ Save this instance to a json file.""" + with open(json_file_path, "w", encoding='utf-8') as writer: + writer.write(self.to_json_string()) + + +class Conv1D(nn.Module): + def __init__(self, nf, nx): + super(Conv1D, self).__init__() + self.nf = nf + w = torch.empty(nx, nf) + nn.init.normal_(w, std=0.02) + self.weight = Parameter(w) + self.bias = Parameter(torch.zeros(nf)) + + def forward(self, x): + size_out = x.size()[:-1] + (self.nf,) + x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight) + x = x.view(*size_out) + return x + + +class Attention(nn.Module): + def __init__(self, nx, n_ctx, config, scale=False, output_attentions=False): + super(Attention, self).__init__() + n_state = nx # in Attention: n_state=768 (nx=n_embd) + # [switch nx => n_state from Block to Attention to keep identical to TF implem] + assert n_state % config.n_head 
== 0 + self.register_buffer("bias", torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx)) + self.n_head = config.n_head + self.split_size = n_state + self.scale = scale + self.output_attentions = output_attentions + self.c_attn = Conv1D(n_state * 3, nx) + self.c_proj = Conv1D(n_state, nx) + self.attn_dropout = nn.Dropout(config.attn_pdrop) + self.resid_dropout = nn.Dropout(config.resid_pdrop) + + def _attn(self, q, k, v): + w = torch.matmul(q, k) + if self.scale: + w = w / math.sqrt(v.size(-1)) + nd, ns = w.size(-2), w.size(-1) + b = self.bias[:, :, ns-nd:ns, :ns] + w = w * b - 1e4 * (1 - b) + + w = nn.Softmax(dim=-1)(w) + w = self.attn_dropout(w) + if self.output_attentions: + return w, torch.matmul(w, v) + return torch.matmul(w, v) + + def merge_heads(self, x): + x = x.permute(0, 2, 1, 3).contiguous() + new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),) + return x.view(*new_x_shape) # in Tensorflow implem: fct merge_states + + def split_heads(self, x, k=False): + new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head) + x = x.view(*new_x_shape) # in Tensorflow implem: fct split_states + if k: + return x.permute(0, 2, 3, 1) # (batch, head, head_features, seq_length) + else: + return x.permute(0, 2, 1, 3) # (batch, head, seq_length, head_features) + + def forward(self, x, layer_past=None): + x = self.c_attn(x) + query, key, value = x.split(self.split_size, dim=2) + query = self.split_heads(query) + key = self.split_heads(key, k=True) + value = self.split_heads(value) + if layer_past is not None: + past_key, past_value = layer_past[0].transpose(-2, -1), layer_past[1] # transpose back cf below + key = torch.cat((past_key, key), dim=-1) + value = torch.cat((past_value, value), dim=-2) + present = torch.stack((key.transpose(-2, -1), value)) # transpose to have same shapes for stacking + a = self._attn(query, key, value) + if self.output_attentions: + attentions, a = a + a = self.merge_heads(a) + a = self.c_proj(a) + a = self.resid_dropout(a) + if self.output_attentions: + return attentions, a, present + return a, present + + +class MLP(nn.Module): + def __init__(self, n_state, config): # in MLP: n_state=3072 (4 * n_embd) + super(MLP, self).__init__() + nx = config.n_embd + self.c_fc = Conv1D(n_state, nx) + self.c_proj = Conv1D(nx, n_state) + self.act = gelu + self.dropout = nn.Dropout(config.resid_pdrop) + + def forward(self, x): + h = self.act(self.c_fc(x)) + h2 = self.c_proj(h) + return self.dropout(h2) + + +class Block(nn.Module): + def __init__(self, n_ctx, config, scale=False, output_attentions=False): + super(Block, self).__init__() + nx = config.n_embd + self.output_attentions = output_attentions + self.ln_1 = LayerNorm(nx, eps=config.layer_norm_epsilon) + self.attn = Attention(nx, n_ctx, config, scale, output_attentions) + self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon) + self.mlp = MLP(4 * nx, config) + + def forward(self, x, layer_past=None): + output_attn = self.attn(self.ln_1(x), layer_past=layer_past) + if self.output_attentions: + attentions, a, present = output_attn + else: + a, present = output_attn + x = x + a + m = self.mlp(self.ln_2(x)) + x = x + m + if self.output_attentions: + return attentions, x, present + return x, present + + +class GPT2LMHead(nn.Module): + """ Language Model Head for the transformer """ + + def __init__(self, model_embeddings_weights, config): + super(GPT2LMHead, self).__init__() + self.n_embd = config.n_embd + self.vocab_size = config.vocab_size + self.predict_special_tokens = config.predict_special_tokens + 
embed_shape = model_embeddings_weights.shape + self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False) + self.set_embeddings_weights(model_embeddings_weights) + + def set_embeddings_weights(self, model_embeddings_weights, predict_special_tokens=True): + self.predict_special_tokens = predict_special_tokens + self.decoder.weight = model_embeddings_weights # Tied weights + + def forward(self, hidden_state): + lm_logits = self.decoder(hidden_state) + if not self.predict_special_tokens: + lm_logits = lm_logits[..., :self.vocab_size] + return lm_logits + + +class GPT2MultipleChoiceHead(nn.Module): + """ Classifier Head for the transformer """ + + def __init__(self, config): + super(GPT2MultipleChoiceHead, self).__init__() + self.n_embd = config.n_embd + self.dropout = nn.Dropout2d(config.resid_pdrop) # To reproduce the noise_shape parameter of TF implementation + self.linear = nn.Linear(config.n_embd, 1) + + nn.init.normal_(self.linear.weight, std=0.02) + nn.init.normal_(self.linear.bias, 0) + + def forward(self, hidden_states, mc_token_ids): + # Classification logits + # hidden_state (bsz, num_choices, seq_length, hidden_size) + # mc_token_ids (bsz, num_choices) + mc_token_ids = mc_token_ids.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, -1, hidden_states.size(-1)) + # (bsz, num_choices, 1, hidden_size) + multiple_choice_h = hidden_states.gather(2, mc_token_ids).squeeze(2) + # (bsz, num_choices, hidden_size) + multiple_choice_h = self.dropout(multiple_choice_h.transpose(1, 2)).transpose(1, 2) + multiple_choice_logits = self.linear(multiple_choice_h).squeeze(-1) + # (bsz, num_choices) + return multiple_choice_logits + + +class GPT2PreTrainedModel(nn.Module): + """ An abstract class to handle weights initialization and + a simple interface for dowloading and loading pretrained models. + """ + + def __init__(self, config, *inputs, **kwargs): + super(GPT2PreTrainedModel, self).__init__() + if not isinstance(config, GPT2Config): + raise ValueError( + "Parameter config in `{}(config)` should be an instance of class `GPT2Config`. " + "To create a model from a pretrained model use " + "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( + self.__class__.__name__, self.__class__.__name__ + ) + ) + self.config = config + + def init_weights(self, module): + """ Initialize the weights. + """ + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + elif isinstance(module, LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): + """ + Instantiate a GPT2PreTrainedModel from a pre-trained model file or a pytorch state dict. + Download and cache the pre-trained model file if needed. + + Params: + pretrained_model_name_or_path: either: + - a str with the name of a pre-trained model to load selected in the list of: + . `gpt2` + - a path or url to a pretrained model archive containing: + . `gpt2_config.json` a configuration file for the model + . `pytorch_model.bin` a PyTorch dump of a GPT2Model instance + - a path or url to a pretrained model archive containing: + . `gpt2_config.json` a configuration file for the model + . 
a TensorFlow checkpoint with trained weights + from_tf: should we load the weights from a locally saved TensorFlow checkpoint + cache_dir: an optional path to a folder in which the pre-trained models will be cached. + state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models + *inputs, **kwargs: additional input for the specific GPT2 class + """ + state_dict = kwargs.get('state_dict', None) + kwargs.pop('state_dict', None) + cache_dir = kwargs.get('cache_dir', None) + kwargs.pop('cache_dir', None) + from_tf = kwargs.get('from_tf', False) + kwargs.pop('from_tf', None) + num_special_tokens = kwargs.get('num_special_tokens', None) + kwargs.pop('num_special_tokens', None) + + if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP: + archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path] + config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path] + else: + archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) + config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME) + # redirect to the cache, if necessary + try: + resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir) + resolved_config_file = cached_path(config_file, cache_dir=cache_dir) + except EnvironmentError: + logger.error( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url but couldn't find files {} and {} " + "at this path or url.".format( + pretrained_model_name_or_path, ", ".join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), pretrained_model_name_or_path, + archive_file, config_file + ) + ) + return None + if resolved_archive_file == archive_file and resolved_config_file == config_file: + logger.info("loading weights file {}".format(archive_file)) + logger.info("loading configuration file {}".format(config_file)) + else: + logger.info("loading weights file {} from cache at {}".format( + archive_file, resolved_archive_file)) + logger.info("loading configuration file {} from cache at {}".format( + config_file, resolved_config_file)) + # Load config + config = GPT2Config.from_json_file(resolved_config_file) + logger.info("Model config {}".format(config)) + # Instantiate model. 
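+        # What follows: build the model from the config, then either convert a TensorFlow
+        # checkpoint directly or load the PyTorch state dict, renaming legacy parameter
+        # suffixes (".g"/".w" -> ".weight", ".b" -> ".bias") before copying the tensors
+        # into the module tree. If the checkpoint keys are not prefixed with "transformer."
+        # they are loaded into the bare transformer, and special-token embeddings are
+        # (re)allocated at the end so the input and output embeddings stay tied.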
+ model = cls(config, *inputs, **kwargs) + if state_dict is None and not from_tf: + state_dict = torch.load(resolved_archive_file, map_location='cpu') + if from_tf: + # Directly load from a TensorFlow checkpoint (stored as NumPy array) + return load_tf_weights_in_gpt2(model, resolved_archive_file) + + old_keys = [] + new_keys = [] + for key in state_dict.keys(): + new_key = None + if key.endswith(".g"): + new_key = key[:-2] + ".weight" + elif key.endswith(".b"): + new_key = key[:-2] + ".bias" + elif key.endswith(".w"): + new_key = key[:-2] + ".weight" + if new_key: + old_keys.append(key) + new_keys.append(new_key) + for old_key, new_key in zip(old_keys, new_keys): + state_dict[new_key] = state_dict.pop(old_key) + + missing_keys = [] + unexpected_keys = [] + error_msgs = [] + # copy state_dict so _load_from_state_dict can modify it + metadata = getattr(state_dict, "_metadata", None) + state_dict = state_dict.copy() + if metadata is not None: + state_dict._metadata = metadata + + def load(module, prefix=""): + local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) + module._load_from_state_dict( + state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs + ) + for name, child in module._modules.items(): + if child is not None: + load(child, prefix + name + ".") + + start_model = model + if hasattr(model, "transformer") and all(not s.startswith('transformer.') for s in state_dict.keys()): + start_model = model.transformer + load(start_model, prefix="") + + if len(missing_keys) > 0: + logger.info( + "Weights of {} not initialized from pretrained model: {}".format(model.__class__.__name__, missing_keys) + ) + if len(unexpected_keys) > 0: + logger.info( + "Weights from pretrained model not used in {}: {}".format(model.__class__.__name__, unexpected_keys) + ) + if len(error_msgs) > 0: + raise RuntimeError( + "Error(s) in loading state_dict for {}:\n\t{}".format(model.__class__.__name__, "\n\t".join(error_msgs)) + ) + + # Add additional embeddings for special tokens if needed + # This step also make sure we are still sharing the output and input embeddings after loading weights + model.set_num_special_tokens(num_special_tokens if num_special_tokens is not None else config.n_special) + return model + + +class GPT2Model(GPT2PreTrainedModel): + """OpenAI GPT-2 model ("Language Models are Unsupervised Multitask Learners"). + + GPT-2 use a single embedding matrix to store the word and special embeddings. + Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]... + Special tokens need to be trained during the fine-tuning if you use them. + The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function. + + The embeddings are ordered as follow in the token embeddings matrice: + [0, ---------------------- + ... -> word embeddings + config.vocab_size - 1, ______________________ + config.vocab_size, + ... -> special embeddings + config.vocab_size + config.n_special - 1] ______________________ + + where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is: + total_tokens_embeddings = config.vocab_size + config.n_special + You should use the associate indices to index the embeddings. + + Params: + config: a GPT2Config class instance with the configuration to build a new model + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length] + were d_1 ... 
d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[ + `position_ids`: an optional torch.LongTensor with the same shape as input_ids + with the position indices (selected in the range [0, config.n_positions - 1[. + `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids + You can use it to add a third type of embedding to each input token in the sequence + (the previous two being the word and position embeddings). + The input, position and token_type embeddings are summed inside the Transformer before the first + self-attention block. + `past`: an optional list of torch.LongTensor that contains pre-computed hidden-states + (key and values in the attention blocks) to speed up sequential decoding + (this is the presents output of the model, cf. below). + + Outputs a tuple consisting of: + `hidden_states`: the encoded-hidden-states at the top of the model + as a torch.FloatTensor of size [batch_size, sequence_length, hidden_size] + (or more generally [d_1, ..., d_n, hidden_size] were d_1 ... d_n are the dimension of input_ids) + `presents`: a list of pre-computed hidden-states (key and values in each attention blocks) as + torch.FloatTensors. They can be reused to speed up sequential decoding. + + Example usage: + ```python + # Already been converted into BPE token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + + config = modeling_gpt2.GPT2Config() + + model = modeling_gpt2.GPT2Model(config) + hidden_states, presents = model(input_ids) + ``` + """ + + def __init__(self, config, output_attentions=False): + super(GPT2Model, self).__init__(config) + self.output_attentions = output_attentions + self.wte = nn.Embedding(config.total_tokens_embeddings, config.n_embd) + self.wpe = nn.Embedding(config.n_positions, config.n_embd) + self.drop = nn.Dropout(config.embd_pdrop) + block = Block(config.n_ctx, config, scale=True, output_attentions=output_attentions) + self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(config.n_layer)]) + self.ln_f = LayerNorm(config.n_embd, eps=config.layer_norm_epsilon) + + self.apply(self.init_weights) + + def set_num_special_tokens(self, num_special_tokens): + " Update input embeddings with new embedding matrice if needed " + if self.config.n_special == num_special_tokens: + return + # Update config + self.config.n_special = num_special_tokens + # Build new embeddings and initialize all new embeddings (in particular the special tokens) + old_embed = self.wte + self.wte = nn.Embedding(self.config.total_tokens_embeddings, self.config.n_embd) + self.wte.to(old_embed.weight.device) + self.init_weights(self.wte) + # Copy word embeddings from the previous weights + self.wte.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :] + + def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None): + if past is None: + past_length = 0 + past = [None] * len(self.h) + else: + past_length = past[0][0].size(-2) + if position_ids is None: + position_ids = torch.arange(past_length, input_ids.size(-1) + past_length, dtype=torch.long, device=input_ids.device) + position_ids = position_ids.unsqueeze(0).expand_as(input_ids) + + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_ids.size(-1)) + position_ids = position_ids.view(-1, position_ids.size(-1)) + + inputs_embeds = self.wte(input_ids) + position_embeds = self.wpe(position_ids) + if token_type_ids is not None: + token_type_ids = token_type_ids.view(-1, 
token_type_ids.size(-1)) + token_type_embeds = self.wte(token_type_ids) + else: + token_type_embeds = 0 + hidden_states = inputs_embeds + position_embeds + token_type_embeds + hidden_states = self.drop(hidden_states) + + presents = [] + all_attentions = [] + for block, layer_past in zip(self.h, past): + if self.output_attentions: + attentions, hidden_states, present = block(hidden_states, layer_past) + all_attentions.append(attentions) + else: + hidden_states, present = block(hidden_states, layer_past) + presents.append(present) + hidden_states = self.ln_f(hidden_states) + output_shape = input_shape + (hidden_states.size(-1),) + if self.output_attentions: + return all_attentions, hidden_states.view(*output_shape), presents + return hidden_states.view(*output_shape), presents + + +class GPT2LMHeadModel(GPT2PreTrainedModel): + """OpenAI GPT-2 model with a Language Modeling head ("Language Models are Unsupervised Multitask Learners"). + + Params: + config: a GPT2Config class instance with the configuration to build a new model + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length] + were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[ + `position_ids`: an optional torch.LongTensor with the same shape as input_ids + with the position indices (selected in the range [0, config.n_positions - 1[. + `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids + You can use it to add a third type of embedding to each input token in the sequence + (the previous two being the word and position embeddings). + The input, position and token_type embeddings are summed inside the Transformer before the first + self-attention block. + `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] + with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss + is only computed for the labels set in [0, ..., vocab_size] + `past`: an optional list of torch.LongTensor that contains pre-computed hidden-states + (key and values in the attention blocks) to speed up sequential decoding + (this is the presents output of the model, cf. below). + + Outputs: + if `lm_labels` is not `None`: + Outputs the language modeling loss. + else a tuple: + `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, sequence_length, config.vocab_size] + (or more generally [d_1, ..., d_n, config.vocab_size] were d_1 ... d_n are the dimension of input_ids) + `presents`: a list of pre-computed hidden-states (key and values in each attention blocks) as + torch.FloatTensors. They can be reused to speed up sequential decoding. 
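+    Decoding note: the `presents` returned by a forward pass can be fed back as `past` on the
+    next step, so only the newly generated token has to be processed. A minimal greedy decoding
+    sketch, reusing `model` and `input_ids` from the usage example below (the helper variable
+    names are illustrative):
+    ```python
+    lm_logits, presents = model(input_ids)                    # run the full prompt once
+    next_token = lm_logits[:, -1, :].argmax(dim=-1, keepdim=True)
+    lm_logits, presents = model(next_token, past=presents)    # only the new token is processed
+    ```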
+ + Example usage: + ```python + # Already been converted into BPE token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + + config = modeling_gpt2.GPT2Config() + + model = modeling_gpt2.GPT2LMHeadModel(config) + lm_logits, presents = model(input_ids) + ``` + """ + + def __init__(self, config, output_attentions=False): + super(GPT2LMHeadModel, self).__init__(config) + self.transformer = GPT2Model(config, output_attentions=output_attentions) + self.lm_head = GPT2LMHead(self.transformer.wte.weight, config) + self.apply(self.init_weights) + + def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True): + """ Update input and output embeddings with new embedding matrice + Make sure we are sharing the embeddings + """ + self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens + self.transformer.set_num_special_tokens(num_special_tokens) + self.lm_head.set_embeddings_weights(self.transformer.wte.weight, predict_special_tokens=predict_special_tokens) + + def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None): + transformer_output = self.transformer(input_ids, position_ids, token_type_ids, past) + if self.transformer.output_attentions: + all_attentions, hidden_states, presents = transformer_output + else: + hidden_states, presents = transformer_output + lm_logits = self.lm_head(hidden_states) + if lm_labels is not None: + # Shift so that tokens < n predict n + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = lm_labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss(ignore_index=-1) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), + shift_labels.view(-1)) + return loss + if self.transformer.output_attentions: + return all_attentions, lm_logits, presents + return lm_logits, presents + + +class GPT2DoubleHeadsModel(GPT2PreTrainedModel): + """OpenAI GPT-2 model with a Language Modeling and a Multiple Choice head ("Language Models are Unsupervised Multitask Learners"). + + Params: + config: a GPT2Config class instance with the configuration to build a new model + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] with the BPE token + indices selected in the range [0, config.vocab_size[ + `mc_token_ids`: a torch.LongTensor of shape [batch_size, num_choices] with the index of the token from + which we should take the hidden state to feed the multiple choice classifier (usually last token of the sequence) + `position_ids`: an optional torch.LongTensor with the same shape as input_ids + with the position indices (selected in the range [0, config.n_positions - 1[. + `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids + You can use it to add a third type of embedding to each input token in the sequence + (the previous two being the word and position embeddings). + The input, position and token_type embeddings are summed inside the Transformer before the first + self-attention block. + `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, num_choices, sequence_length] + with indices selected in [-1, 0, ..., config.vocab_size]. All labels set to -1 are ignored (masked), the loss + is only computed for the labels set in [0, ..., config.vocab_size] + `multiple_choice_labels`: optional multiple choice labels: torch.LongTensor of shape [batch_size] + with indices selected in [0, ..., num_choices]. 
+ `past`: an optional list of torch.LongTensor that contains pre-computed hidden-states + (key and values in the attention blocks) to speed up sequential decoding + (this is the presents output of the model, cf. below). + + Outputs: + if `lm_labels` and `multiple_choice_labels` are not `None`: + Outputs a tuple of losses with the language modeling loss and the multiple choice loss. + else: a tuple with + `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, num_choices, sequence_length, config.vocab_size] + `multiple_choice_logits`: the multiple choice logits as a torch.FloatTensor of size [batch_size, num_choices] + `presents`: a list of pre-computed hidden-states (key and values in each attention blocks) as + torch.FloatTensors. They can be reused to speed up sequential decoding. + + Example usage: + ```python + # Already been converted into BPE token ids + input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]]]) # (bsz, number of choice, seq length) + mc_token_ids = torch.LongTensor([[2], [1]]) # (bsz, number of choice) + + config = modeling_gpt2.GPT2Config() + + model = modeling_gpt2.GPT2DoubleHeadsModel(config) + lm_logits, multiple_choice_logits, presents = model(input_ids, mc_token_ids) + ``` + """ + + def __init__(self, config, output_attentions=False): + super(GPT2DoubleHeadsModel, self).__init__(config) + self.transformer = GPT2Model(config, output_attentions=output_attentions) + self.lm_head = GPT2LMHead(self.transformer.wte.weight, config) + self.multiple_choice_head = GPT2MultipleChoiceHead(config) + self.apply(self.init_weights) + + def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True): + """ Update input and output embeddings with new embedding matrice + Make sure we are sharing the embeddings + """ + self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens + self.transformer.set_num_special_tokens(num_special_tokens) + self.lm_head.set_embeddings_weights(self.transformer.wte.weight, predict_special_tokens=predict_special_tokens) + + def forward(self, input_ids, mc_token_ids, lm_labels=None, mc_labels=None, token_type_ids=None, position_ids=None, past=None): + transformer_output = self.transformer(input_ids, position_ids, token_type_ids, past) + if self.transformer.output_attentions: + all_attentions, hidden_states, presents = transformer_output + else: + hidden_states, presents = transformer_output + lm_logits = self.lm_head(hidden_states) + mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids) + losses = [] + if lm_labels is not None: + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = lm_labels[..., 1:].contiguous() + loss_fct = CrossEntropyLoss(ignore_index=-1) + losses.append(loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))) + if mc_labels is not None: + loss_fct = CrossEntropyLoss() + losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))) + if losses: + return losses + if self.transformer.output_attentions: + return all_attentions, lm_logits, mc_logits, presents + return lm_logits, mc_logits, presents diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py new file mode 100644 index 0000000..9aa5f77 --- /dev/null +++ b/pytorch_pretrained_bert/modeling_openai.py @@ -0,0 +1,1089 @@ +# coding=utf-8 +# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch OpenAI GPT model.""" + +from __future__ import absolute_import, division, print_function, unicode_literals + +import collections +import copy +import json +import logging +import math +import os +import shutil +import tarfile +import tempfile +import sys +from io import open + +import torch +import torch.nn as nn +from torch.nn import CrossEntropyLoss +from torch.nn.parameter import Parameter + +from .file_utils import cached_path, CONFIG_NAME, WEIGHTS_NAME +from .modeling import BertLayerNorm as LayerNorm + +logger = logging.getLogger(__name__) + +PRETRAINED_MODEL_ARCHIVE_MAP = { + "openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-pytorch_model.bin"} +PRETRAINED_CONFIG_ARCHIVE_MAP = { + "openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-config.json"} + + +def load_tf_weights_in_openai_gpt(model, openai_checkpoint_folder_path): + """ Load tf pre-trained weights in a pytorch model (from NumPy arrays here) + """ + import re + import numpy as np + print("Loading weights...") + names = json.load(open(openai_checkpoint_folder_path + '/parameters_names.json', "r", encoding='utf-8')) + shapes = json.load(open(openai_checkpoint_folder_path + '/params_shapes.json', "r", encoding='utf-8')) + offsets = np.cumsum([np.prod(shape) for shape in shapes]) + init_params = [np.load(openai_checkpoint_folder_path + '/params_{}.npy'.format(n)) for n in range(10)] + init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1] + init_params = [param.reshape(shape) for param, shape in zip(init_params, shapes)] + + # This was used when we had a single embedding matrix for positions and tokens + # init_params[0] = np.concatenate([init_params[1], init_params[0]], 0) + # del init_params[1] + init_params = [arr.squeeze() for arr in init_params] + + try: + assert model.tokens_embed.weight.shape == init_params[1].shape + assert model.positions_embed.weight.shape == init_params[0].shape + except AssertionError as e: + e.args += (model.tokens_embed.weight.shape, init_params[1].shape) + e.args += (model.positions_embed.weight.shape, init_params[0].shape) + raise + + model.tokens_embed.weight.data = torch.from_numpy(init_params[1]) + model.positions_embed.weight.data = torch.from_numpy(init_params[0]) + names.pop(0) + # Pop position and token embedding arrays + init_params.pop(0) + init_params.pop(0) + + for name, array in zip(names, init_params): # names[1:n_transfer], init_params[1:n_transfer]): + name = name[6:] # skip "model/" + assert name[-2:] == ":0" + name = name[:-2] + name = name.split('/') + pointer = model + for m_name in name: + if re.fullmatch(r'[A-Za-z]+\d+', m_name): + l = re.split(r'(\d+)', m_name) + else: + l = [m_name] + if l[0] == 'g': + pointer = getattr(pointer, 'weight') + elif l[0] == 'b': + pointer = getattr(pointer, 'bias') + elif l[0] == 'w': + pointer = getattr(pointer, 'weight') + else: + pointer = getattr(pointer, l[0]) + if len(l) >= 2: + num = int(l[1]) + pointer = 
pointer[num] + try: + assert pointer.shape == array.shape + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + print("Initialize PyTorch weight {}".format(name)) + pointer.data = torch.from_numpy(array) + return model + + + def gelu(x): + return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + + + def swish(x): + return x * torch.sigmoid(x) + + + ACT_FNS = {"relu": nn.ReLU, "swish": swish, "gelu": gelu} + + + class OpenAIGPTConfig(object): + """Configuration class to store the configuration of a `OpenAIGPTModel`. + """ + + def __init__( + self, + vocab_size_or_config_json_file=40478, + n_special=0, + n_positions=512, + n_ctx=512, + n_embd=768, + n_layer=12, + n_head=12, + afn="gelu", + resid_pdrop=0.1, + embd_pdrop=0.1, + attn_pdrop=0.1, + layer_norm_epsilon=1e-5, + initializer_range=0.02, + predict_special_tokens=True + ): + """Constructs OpenAIGPTConfig. + + Args: + vocab_size_or_config_json_file: Vocabulary size of `input_ids` in `OpenAIGPTModel` or a configuration json file. + n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...) + n_positions: Number of positional embeddings. + n_ctx: Size of the causal mask (usually same as n_positions). + n_embd: Dimensionality of the embeddings and hidden states. + n_layer: Number of hidden layers in the Transformer encoder. + n_head: Number of attention heads for each attention layer in + the Transformer encoder. + afn: The non-linear activation function (function or string) in the + encoder and pooler. If string, "gelu", "relu" and "swish" are supported. + resid_pdrop: The dropout probability for all fully connected + layers in the embeddings, encoder, and pooler. + attn_pdrop: The dropout ratio for the attention + probabilities. + embd_pdrop: The dropout ratio for the embeddings. + layer_norm_epsilon: epsilon to use in the layer norm layers + initializer_range: The stddev of the truncated_normal_initializer for + initializing all weight matrices.
+ predict_special_tokens: should we predict special tokens (when the model has a LM head) + """ + if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 + and isinstance(vocab_size_or_config_json_file, unicode)): + with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif isinstance(vocab_size_or_config_json_file, int): + self.vocab_size = vocab_size_or_config_json_file + self.n_special = n_special + self.n_ctx = n_ctx + self.n_positions = n_positions + self.n_embd = n_embd + self.n_layer = n_layer + self.n_head = n_head + self.afn = afn + self.resid_pdrop = resid_pdrop + self.embd_pdrop = embd_pdrop + self.attn_pdrop = attn_pdrop + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + self.predict_special_tokens = predict_special_tokens + else: + raise ValueError( + "First argument must be either a vocabulary size (int)" + "or the path to a pretrained model config file (str)" + ) + + @property + def total_tokens_embeddings(self): + return self.vocab_size + self.n_special + + @classmethod + def from_dict(cls, json_object): + """Constructs a `OpenAIGPTConfig` from a Python dictionary of parameters.""" + config = OpenAIGPTConfig(vocab_size_or_config_json_file=-1) + for key, value in json_object.items(): + config.__dict__[key] = value + return config + + @classmethod + def from_json_file(cls, json_file): + """Constructs a `OpenAIGPTConfig` from a json file of parameters.""" + with open(json_file, "r", encoding="utf-8") as reader: + text = reader.read() + return cls.from_dict(json.loads(text)) + + def __repr__(self): + return str(self.to_json_string()) + + def to_dict(self): + """Serializes this instance to a Python dictionary.""" + output = copy.deepcopy(self.__dict__) + return output + + def to_json_string(self): + """Serializes this instance to a JSON string.""" + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" + + def to_json_file(self, json_file_path): + """ Save this instance to a json file.""" + with open(json_file_path, "w", encoding='utf-8') as writer: + writer.write(self.to_json_string()) + + +class Conv1D(nn.Module): + def __init__(self, nf, rf, nx): + super(Conv1D, self).__init__() + self.rf = rf + self.nf = nf + if rf == 1: # faster 1x1 conv + w = torch.empty(nx, nf) + nn.init.normal_(w, std=0.02) + self.weight = Parameter(w) + self.bias = Parameter(torch.zeros(nf)) + else: # was used to train LM + raise NotImplementedError + + def forward(self, x): + if self.rf == 1: + size_out = x.size()[:-1] + (self.nf,) + x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight) + x = x.view(*size_out) + else: + raise NotImplementedError + return x + + +class Attention(nn.Module): + def __init__(self, nx, n_ctx, config, scale=False, output_attentions=False): + super(Attention, self).__init__() + n_state = nx # in Attention: n_state=768 (nx=n_embd) + # [switch nx => n_state from Block to Attention to keep identical to TF implem] + assert n_state % config.n_head == 0 + self.register_buffer("bias", torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx)) + self.n_head = config.n_head + self.split_size = n_state + self.scale = scale + self.output_attentions = output_attentions + self.c_attn = Conv1D(n_state * 3, 1, nx) # (out_channels, size_conv, in_channels) + self.c_proj = Conv1D(n_state, 1, nx) + self.attn_dropout = nn.Dropout(config.attn_pdrop) + self.resid_dropout = 
nn.Dropout(config.resid_pdrop) + + def _attn(self, q, k, v): + w = torch.matmul(q, k) + if self.scale: + w = w / math.sqrt(v.size(-1)) + # w = w * self.bias + -1e9 * (1 - self.bias) # TF implem method: mask_attn_weights + # XD: self.b may be larger than w, so we need to crop it + b = self.bias[:, :, : w.size(-2), : w.size(-1)] + w = w * b + -1e9 * (1 - b) + + w = nn.Softmax(dim=-1)(w) + w = self.attn_dropout(w) + if self.output_attentions: + return w, torch.matmul(w, v) + return torch.matmul(w, v) + + def merge_heads(self, x): + x = x.permute(0, 2, 1, 3).contiguous() + new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),) + return x.view(*new_x_shape) # in Tensorflow implem: fct merge_states + + def split_heads(self, x, k=False): + new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head) + x = x.view(*new_x_shape) # in Tensorflow implem: fct split_states + if k: + return x.permute(0, 2, 3, 1) + else: + return x.permute(0, 2, 1, 3) + + def forward(self, x): + x = self.c_attn(x) + query, key, value = x.split(self.split_size, dim=2) + query = self.split_heads(query) + key = self.split_heads(key, k=True) + value = self.split_heads(value) + a = self._attn(query, key, value) + if self.output_attentions: + attentions, a = a + a = self.merge_heads(a) + a = self.c_proj(a) + a = self.resid_dropout(a) + if self.output_attentions: + return attentions, a + return a + + +class MLP(nn.Module): + def __init__(self, n_state, config): # in MLP: n_state=3072 (4 * n_embd) + super(MLP, self).__init__() + nx = config.n_embd + self.c_fc = Conv1D(n_state, 1, nx) + self.c_proj = Conv1D(nx, 1, n_state) + self.act = ACT_FNS[config.afn] + self.dropout = nn.Dropout(config.resid_pdrop) + + def forward(self, x): + h = self.act(self.c_fc(x)) + h2 = self.c_proj(h) + return self.dropout(h2) + + +class Block(nn.Module): + def __init__(self, n_ctx, config, scale=False, output_attentions=False): + super(Block, self).__init__() + nx = config.n_embd + self.output_attentions = output_attentions + self.attn = Attention(nx, n_ctx, config, scale, output_attentions) + self.ln_1 = LayerNorm(nx, eps=config.layer_norm_epsilon) + self.mlp = MLP(4 * nx, config) + self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon) + + def forward(self, x): + a = self.attn(x) + if self.output_attentions: + attentions, a = a + n = self.ln_1(x + a) + m = self.mlp(n) + h = self.ln_2(n + m) + if self.output_attentions: + return attentions, h + return h + + +class OpenAIGPTLMHead(nn.Module): + """ Language Model Head for the transformer """ + + def __init__(self, model_embeddings_weights, config): + super(OpenAIGPTLMHead, self).__init__() + self.n_embd = config.n_embd + self.vocab_size = config.vocab_size + self.predict_special_tokens = config.predict_special_tokens + embed_shape = model_embeddings_weights.shape + self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False) + self.set_embeddings_weights(model_embeddings_weights) + + def set_embeddings_weights(self, model_embeddings_weights, predict_special_tokens=True): + self.predict_special_tokens = predict_special_tokens + embed_shape = model_embeddings_weights.shape + self.decoder.weight = model_embeddings_weights # Tied weights + + def forward(self, hidden_state): + lm_logits = self.decoder(hidden_state) + if not self.predict_special_tokens: + lm_logits = lm_logits[..., :self.vocab_size] + return lm_logits + + +class OpenAIGPTMultipleChoiceHead(nn.Module): + """ Classifier Head for the transformer """ + + def __init__(self, config): + super(OpenAIGPTMultipleChoiceHead, 
self).__init__() + self.n_embd = config.n_embd + self.dropout = nn.Dropout2d(config.resid_pdrop) # To reproduce the noise_shape parameter of TF implementation + self.linear = nn.Linear(config.n_embd, 1) + + nn.init.normal_(self.linear.weight, std=0.02) + nn.init.normal_(self.linear.bias, 0) + + def forward(self, hidden_states, mc_token_ids): + # Classification logits + # hidden_state (bsz, num_choices, seq_length, hidden_size) + # mc_token_ids (bsz, num_choices) + mc_token_ids = mc_token_ids.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, -1, hidden_states.size(-1)) + # mc_token_ids (bsz, num_choices, 1, hidden_size) + multiple_choice_h = hidden_states.gather(2, mc_token_ids).squeeze(2) + # multiple_choice_h (bsz, num_choices, hidden_size) + multiple_choice_h = self.dropout(multiple_choice_h.transpose(1, 2)).transpose(1, 2) + multiple_choice_logits = self.linear(multiple_choice_h).squeeze(-1) + # (bsz, num_choices) + return multiple_choice_logits + + +class OpenAIGPTPreTrainedModel(nn.Module): + """ An abstract class to handle weights initialization and + a simple interface for dowloading and loading pretrained models. + """ + + def __init__(self, config, *inputs, **kwargs): + super(OpenAIGPTPreTrainedModel, self).__init__() + if not isinstance(config, OpenAIGPTConfig): + raise ValueError( + "Parameter config in `{}(config)` should be an instance of class `OpenAIGPTConfig`. " + "To create a model from a pretrained model use " + "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( + self.__class__.__name__, self.__class__.__name__ + ) + ) + self.config = config + + def init_weights(self, module): + """ Initialize the weights. + """ + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + elif isinstance(module, LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, num_special_tokens=None, *inputs, **kwargs): + """ + Instantiate a OpenAIGPTPreTrainedModel from a pre-trained model file or a pytorch state dict. + Download and cache the pre-trained model file if needed. + + Params: + pretrained_model_name_or_path: either: + - a str with the name of a pre-trained model to load selected in the list of: + . `openai-gpt` + - a path or url to a pretrained model archive containing: + . `openai_gpt_config.json` a configuration file for the model + . `pytorch_model.bin` a PyTorch dump of a OpenAIGPTModel instance + - a path or url to a pretrained model archive containing: + . `openai-gpt-config.json` a configuration file for the model + . a series of NumPy files containing OpenAI TensorFlow trained weights + from_tf: should we load the weights from a locally saved TensorFlow checkpoint + cache_dir: an optional path to a folder in which the pre-trained models will be cached. 
+ state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models + *inputs, **kwargs: additional inputs for the specific OpenAI-GPT class + """ + state_dict = kwargs.get('state_dict', None) + kwargs.pop('state_dict', None) + cache_dir = kwargs.get('cache_dir', None) + kwargs.pop('cache_dir', None) + from_tf = kwargs.get('from_tf', False) + kwargs.pop('from_tf', None) + + if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP: + archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path] + config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path] + else: + archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) + config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME) + # redirect to the cache, if necessary + try: + resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir) + resolved_config_file = cached_path(config_file, cache_dir=cache_dir) + except EnvironmentError: + logger.error( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url but couldn't find files {} and {} " + "at this path or url.".format( + pretrained_model_name_or_path, ", ".join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), + pretrained_model_name_or_path, + archive_file, config_file + ) + ) + return None + if resolved_archive_file == archive_file and resolved_config_file == config_file: + logger.info("loading weights file {}".format(archive_file)) + logger.info("loading configuration file {}".format(config_file)) + else: + logger.info("loading weights file {} from cache at {}".format( + archive_file, resolved_archive_file)) + logger.info("loading configuration file {} from cache at {}".format( + config_file, resolved_config_file)) + # Load config + config = OpenAIGPTConfig.from_json_file(resolved_config_file) + logger.info("Model config {}".format(config)) + # Instantiate model.
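+ # For illustration, a typical call that ends up here might be (the 'openai-gpt' shortcut comes from + # PRETRAINED_MODEL_ARCHIVE_MAP above; the special-token count is just an example value): + # model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt', num_special_tokens=5)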
+ model = cls(config, *inputs, **kwargs) + if state_dict is None and not from_tf: + state_dict = torch.load(resolved_archive_file, map_location='cpu') + if from_tf: + # Directly load from a TensorFlow checkpoint (stored as NumPy array) + return load_tf_weights_in_openai_gpt(model, resolved_archive_file) + + old_keys = [] + new_keys = [] + for key in state_dict.keys(): + new_key = None + if key.endswith(".g"): + new_key = key[:-2] + ".weight" + elif key.endswith(".b"): + new_key = key[:-2] + ".bias" + elif key.endswith(".w"): + new_key = key[:-2] + ".weight" + if new_key: + old_keys.append(key) + new_keys.append(new_key) + for old_key, new_key in zip(old_keys, new_keys): + state_dict[new_key] = state_dict.pop(old_key) + + missing_keys = [] + unexpected_keys = [] + error_msgs = [] + # copy state_dict so _load_from_state_dict can modify it + metadata = getattr(state_dict, "_metadata", None) + state_dict = state_dict.copy() + if metadata is not None: + state_dict._metadata = metadata + + def load(module, prefix=""): + local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) + module._load_from_state_dict( + state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs + ) + for name, child in module._modules.items(): + if child is not None: + load(child, prefix + name + ".") + + start_model = model + if hasattr(model, "transformer") and all(not s.startswith('transformer.') for s in state_dict.keys()): + start_model = model.transformer + load(start_model, prefix="") + + if len(missing_keys) > 0: + logger.info( + "Weights of {} not initialized from pretrained model: {}".format(model.__class__.__name__, missing_keys) + ) + if len(unexpected_keys) > 0: + logger.info( + "Weights from pretrained model not used in {}: {}".format(model.__class__.__name__, unexpected_keys) + ) + if len(error_msgs) > 0: + raise RuntimeError( + "Error(s) in loading state_dict for {}:\n\t{}".format(model.__class__.__name__, "\n\t".join(error_msgs)) + ) + + # Add additional embeddings for special tokens if needed + # This step also makes sure we are still sharing the output and input embeddings after loading weights + model.set_num_special_tokens(num_special_tokens if num_special_tokens is not None else config.n_special) + return model + + + class OpenAIGPTModel(OpenAIGPTPreTrainedModel): + """OpenAI GPT model ("Improving Language Understanding by Generative Pre-Training"). + + OpenAI GPT uses a single embedding matrix to store the word and special embeddings. + Special token embeddings are embeddings for additional tokens that are not pre-trained: [SEP], [CLS]... + Special tokens need to be trained during the fine-tuning if you use them. + The number of special embeddings can be controlled using the `set_num_special_tokens(num_special_tokens)` function. + + The embeddings are ordered as follows in the token embedding matrix: + [0, ---------------------- + ... -> word embeddings + config.vocab_size - 1, ______________________ + config.vocab_size, + ... -> special embeddings + config.vocab_size + config.n_special - 1] ______________________ + + where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is: + total_tokens_embeddings = config.vocab_size + config.n_special + You should use the associated indices to index the embeddings.
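+ For example, with the default vocab_size of 40478 and a hypothetical n_special = 2, the word + embeddings occupy indices 0 to 40477, the two special tokens occupy indices 40478 and 40479, and + total_tokens_embeddings is 40480.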
+ + Params: + config: a OpenAIGPTConfig class instance with the configuration to build a new model + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length] + were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, total_tokens_embeddings[ + `position_ids`: an optional torch.LongTensor with the same shape as input_ids + with the position indices (selected in the range [0, config.n_positions - 1[. + `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids + You can use it to add a third type of embedding to each input token in the sequence + (the previous two being the word and position embeddings). + The input, position and token_type embeddings are summed inside the Transformer before the first + self-attention block. + + Outputs: + `hidden_states`: the encoded-hidden-states at the top of the model + as a torch.FloatTensor of size [batch_size, sequence_length, hidden_size] + (or more generally [d_1, ..., d_n, hidden_size] were d_1 ... d_n are the dimension of input_ids) + + Example usage: + ```python + # Already been converted into BPE token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + + config = modeling_openai.OpenAIGPTConfig() + + model = modeling_openai.OpenAIGPTModel(config) + hidden_states = model(input_ids) + ``` + """ + + def __init__(self, config, output_attentions=False): + super(OpenAIGPTModel, self).__init__(config) + self.output_attentions = output_attentions + self.tokens_embed = nn.Embedding(config.total_tokens_embeddings, config.n_embd) + self.positions_embed = nn.Embedding(config.n_positions, config.n_embd) + self.drop = nn.Dropout(config.embd_pdrop) + block = Block(config.n_ctx, config, scale=True, output_attentions=output_attentions) + self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(config.n_layer)]) + + self.apply(self.init_weights) + + def set_num_special_tokens(self, num_special_tokens): + " Update input embeddings with new embedding matrice if needed " + if self.config.n_special == num_special_tokens: + return + # Update config + self.config.n_special = num_special_tokens + # Build new embeddings and initialize all new embeddings (in particular the special tokens) + old_embed = self.tokens_embed + self.tokens_embed = nn.Embedding(self.config.total_tokens_embeddings, self.config.n_embd) + self.tokens_embed.to(old_embed.weight.device) + self.init_weights(self.tokens_embed) + # Copy word embeddings from the previous weights + self.tokens_embed.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :] + + def forward(self, input_ids, position_ids=None, token_type_ids=None, token_emotion_ids=None, token_action_ids=None): + if position_ids is None: + # This was used when we had a single embedding matrice from position and token embeddings + # start = self.config.vocab_size + self.config.n_special + # end = start + input_ids.size(-1) + # position_ids = torch.arange(start, end, dtype=torch.long, device=input_ids.device) + position_ids = torch.arange(input_ids.size(-1), dtype=torch.long, device=input_ids.device) + position_ids = position_ids.unsqueeze(0).expand_as(input_ids) + + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_ids.size(-1)) + position_ids = position_ids.view(-1, position_ids.size(-1)) + + inputs_embeds = self.tokens_embed(input_ids) + position_embeds = self.positions_embed(position_ids) + if token_type_ids is not None: + 
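# token_type_ids, token_emotion_ids and token_action_ids are all looked up in the same tokens_embed + # matrix as the word ids and simply summed into the hidden states below +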
token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) + token_type_embeds = self.tokens_embed(token_type_ids) + else: + token_type_embeds = 0 + + if token_emotion_ids is not None: + token_emotion_ids = token_emotion_ids.view(-1, token_emotion_ids.size(-1)) + token_emotion_embeds = self.tokens_embed(token_emotion_ids) + else: + token_emotion_embeds = 0 + + if token_action_ids is not None: + token_action_ids = token_action_ids.view(-1, token_action_ids.size(-1)) + token_action_embeds = self.tokens_embed(token_action_ids) + else: + token_action_embeds = 0 + + hidden_states = inputs_embeds + position_embeds + token_type_embeds + token_emotion_embeds + token_action_embeds + hidden_states = self.drop(hidden_states) + + all_attentions = [] + for block in self.h: + if self.output_attentions: + attentions, hidden_states = block(hidden_states) + all_attentions.append(attentions) + else: + hidden_states = block(hidden_states) + output_shape = input_shape + (hidden_states.size(-1),) + if self.output_attentions: + return all_attentions, hidden_states.view(*output_shape) + return hidden_states.view(*output_shape) + + +class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): + """OpenAI GPT model with a Language Modeling head ("Improving Language Understanding by Generative Pre-Training"). + + OpenAI GPT use a single embedding matrix to store the word and special embeddings. + Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]... + Special tokens need to be trained during the fine-tuning if you use them. + The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function. + + The embeddings are ordered as follow in the token embeddings matrice: + [0, ---------------------- + ... -> word embeddings + config.vocab_size - 1, ______________________ + config.vocab_size, + ... -> special embeddings + config.vocab_size + config.n_special - 1] ______________________ + + where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is: + total_tokens_embeddings = config.vocab_size + config.n_special + You should use the associate indices to index the embeddings. + + Params: + config: a OpenAIGPTConfig class instance with the configuration to build a new model + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length] + were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, total_tokens_embeddings[ + `position_ids`: an optional torch.LongTensor with the same shape as input_ids + with the position indices (selected in the range [0, config.n_positions - 1[. + `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids + You can use it to add a third type of embedding to each input token in the sequence + (the previous two being the word and position embeddings). + The input, position and token_type embeddings are summed inside the Transformer before the first + self-attention block. + `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] + with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss + is only computed for the labels set in [0, ..., vocab_size] + + Outputs: + if `lm_labels` is not `None`: + Outputs the language modeling loss. 
+ else: + `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, sequence_length, total_tokens_embeddings] + (or more generally [d_1, ..., d_n, total_tokens_embeddings] were d_1 ... d_n are the dimension of input_ids) + + Example usage: + ```python + # Already been converted into BPE token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + + config = modeling_openai.OpenAIGPTConfig() + + model = modeling_openai.OpenAIGPTLMHeadModel(config) + lm_logits = model(input_ids) + ``` + """ + + def __init__(self, config, output_attentions=False): + super(OpenAIGPTLMHeadModel, self).__init__(config) + self.transformer = OpenAIGPTModel(config, output_attentions=output_attentions) + self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config) + self.apply(self.init_weights) + + def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True): + """ Update input and output embeddings with new embedding matrice + Make sure we are sharing the embeddings + """ + self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens + self.transformer.set_num_special_tokens(num_special_tokens) + self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight, + predict_special_tokens=predict_special_tokens) + + def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None): + hidden_states = self.transformer(input_ids, position_ids, token_type_ids) + if self.transformer.output_attentions: + all_attentions, hidden_states = hidden_states + lm_logits = self.lm_head(hidden_states) + if lm_labels is not None: + # Shift so that tokens < n predict n + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = lm_labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss(ignore_index=-1) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), + shift_labels.view(-1)) + return loss + if self.transformer.output_attentions: + return all_attentions, lm_logits + return lm_logits + + +class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): + """OpenAI GPT model with a Language Modeling and a Multiple Choice head ("Improving Language Understanding by Generative Pre-Training"). + + OpenAI GPT use a single embedding matrix to store the word and special embeddings. + Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]... + Special tokens need to be trained during the fine-tuning if you use them. + The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function. + + The embeddings are ordered as follow in the token embeddings matrice: + [0, ---------------------- + ... -> word embeddings + config.vocab_size - 1, ______________________ + config.vocab_size, + ... -> special embeddings + config.vocab_size + config.n_special - 1] ______________________ + + where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is: + total_tokens_embeddings = config.vocab_size + config.n_special + You should use the associate indices to index the embeddings. 
+ + Params: + config: a OpenAIGPTConfig class instance with the configuration to build a new model + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] with the BPE token + indices selected in the range [0, total_tokens_embeddings[ + `mc_token_ids`: a torch.LongTensor of shape [batch_size, num_choices] with the index of the token from + which we should take the hidden state to feed the multiple choice classifier (usually last token of the sequence) + `position_ids`: an optional torch.LongTensor with the same shape as input_ids + with the position indices (selected in the range [0, config.n_positions - 1[. + `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids + You can use it to add a third type of embedding to each input token in the sequence + (the previous two being the word and position embeddings). + The input, position and token_type embeddings are summed inside the Transformer before the first + self-attention block. + `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, num_choices, sequence_length] + with indices selected in [-1, 0, ..., total_tokens_embeddings]. All labels set to -1 are ignored (masked), the loss + is only computed for the labels set in [0, ..., total_tokens_embeddings] + `multiple_choice_labels`: optional multiple choice labels: torch.LongTensor of shape [batch_size] + with indices selected in [0, ..., num_choices]. + + Outputs: + if `lm_labels` and `multiple_choice_labels` are not `None`: + Outputs a tuple of losses with the language modeling loss and the multiple choice loss. + else: a tuple with + `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, num_choices, sequence_length, total_tokens_embeddings] + `multiple_choice_logits`: the multiple choice logits as a torch.FloatTensor of size [batch_size, num_choices] + + Example usage: + ```python + # Already been converted into BPE token ids + input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]]]) # (bsz, number of choice, seq length) + mc_token_ids = torch.LongTensor([[2], [1]]) # (bsz, number of choice) + + config = modeling_openai.OpenAIGPTConfig() + + model = modeling_openai.OpenAIGPTDoubleHeadsModel(config) + lm_logits, multiple_choice_logits = model(input_ids, mc_token_ids) + ``` + """ + + def __init__(self, config, output_attentions=False): + super(OpenAIGPTDoubleHeadsModel, self).__init__(config) + self.transformer = OpenAIGPTModel(config, output_attentions=output_attentions) + self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config) + self.multiple_choice_head = OpenAIGPTMultipleChoiceHead(config) + self.apply(self.init_weights) + + def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True): + """ Update input and output embeddings with new embedding matrice + Make sure we are sharing the embeddings + """ + self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens + self.transformer.set_num_special_tokens(num_special_tokens) + self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight, + predict_special_tokens=predict_special_tokens) + + def forward(self, input_ids, mc_token_ids, lm_labels=None, mc_labels=None, + token_type_ids=None, token_emotion_ids=None, token_action_ids=None, position_ids=None): + hidden_states = self.transformer(input_ids, position_ids, token_type_ids, token_emotion_ids, token_action_ids) + if self.transformer.output_attentions: + 
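# with output_attentions=True the transformer returns an (all_attentions, hidden_states) pair instead of the hidden states alone +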
all_attentions, hidden_states = hidden_states + lm_logits = self.lm_head(hidden_states) + mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids) + losses = [] + if lm_labels is not None: # when lm_labels is all -1 it means it's not the correct candidate which in turn means it's a negative example and we ignore it because ignore_index=-1 + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = lm_labels[..., 1:].contiguous() + loss_fct = CrossEntropyLoss(ignore_index=-1) + losses.append(loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))) + if mc_labels is not None: + loss_fct = CrossEntropyLoss() + losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))) + if losses: + return losses + if self.transformer.output_attentions: + return all_attentions, lm_logits, mc_logits + return lm_logits, mc_logits + + +############################################################################### + + +class OpenAIGPTEmotionChoiceHead(nn.Module): + """ Classifier Head for the transformer """ + + def __init__(self, config): + super(OpenAIGPTEmotionChoiceHead, self).__init__() + self.n_embd = config.n_embd + self.dropout = nn.Dropout2d(config.resid_pdrop) # To reproduce the noise_shape parameter of TF implementation + num_emotions = 7 + self.linear = nn.Linear(config.n_embd, num_emotions) + + nn.init.normal_(self.linear.weight, std=0.02) + nn.init.normal_(self.linear.bias, 0) + + def forward(self, hidden_states, mc_token_ids): + # Classification logits + # hidden_state (bsz, seq_length, hidden_size) + # mc_token_ids (bsz,) + mc_token_ids = mc_token_ids.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, hidden_states.size(-1)) + # mc_token_ids (bsz, 1, hidden_size) + multiple_choice_h = hidden_states.gather(1, mc_token_ids).squeeze(1) + # multiple_choice_h (bsz, hidden_size) + multiple_choice_h = self.dropout(multiple_choice_h) + multiple_choice_logits = self.linear(multiple_choice_h) + # (bsz, num_choices) + return multiple_choice_logits + + +class OpenAIGPTBatchedEmotionChoiceHead(nn.Module): + + def __init__(self, config): + super(OpenAIGPTBatchedEmotionChoiceHead, self).__init__() + self.n_embd = config.n_embd + self.dropout = nn.Dropout2d(config.resid_pdrop) # To reproduce the noise_shape parameter of TF implementation + num_emotions = 7 + self.linear = nn.Linear(config.n_embd, num_emotions) + + nn.init.normal_(self.linear.weight, std=0.02) + nn.init.normal_(self.linear.bias, 0) + + def forward(self, hidden_states, mc_token_ids): + # Classification logits + # hidden_state (bsz, num_choices, seq_length, hidden_size) + # mc_token_ids (bsz, num_choices) + mc_token_ids = mc_token_ids.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, -1, hidden_states.size(-1)) + # mc_token_ids (bsz, num_choices, 1, hidden_size) + multiple_choice_h = hidden_states.gather(2, mc_token_ids).squeeze(2) + # multiple_choice_h (bsz, num_choices, hidden_size) + multiple_choice_h = self.dropout(multiple_choice_h.transpose(1, 2)).transpose(1, 2) + multiple_choice_logits = self.linear(multiple_choice_h).squeeze(-1) + # (bsz, num_choices) + return multiple_choice_logits + + +class OpenAIGPTEmotionHead(nn.Module): + """ Classifier Head for the transformer """ + + def __init__(self, config): + super(OpenAIGPTEmotionHead, self).__init__() + self.n_embd = config.n_embd + self.dropout = nn.Dropout2d(config.resid_pdrop) # To reproduce the noise_shape parameter of TF implementation + num_classes = 2 # this probably need to be 1 + self.linear = nn.Linear(config.n_embd, num_classes) + 
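+ # two-way classification over the selected token's hidden state; OpenAIGPTForEmotionDetection below + # trains it with a standard CrossEntropyLoss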
+ nn.init.normal_(self.linear.weight, std=0.02) + nn.init.normal_(self.linear.bias, 0) + + def forward(self, hidden_states, mc_token_ids): + # Classification logits + # hidden_state (bsz, seq_length, hidden_size) + # mc_token_ids (bsz,) + mc_token_ids = mc_token_ids.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, hidden_states.size(-1)) + # mc_token_ids (bsz, 1, hidden_size) + multiple_choice_h = hidden_states.gather(1, mc_token_ids).squeeze(1) + # multiple_choice_h (bsz, hidden_size) + multiple_choice_h = self.dropout(multiple_choice_h) + multiple_choice_logits = self.linear(multiple_choice_h) + # (bsz, num_choices) + return multiple_choice_logits + + +class OpenAIGPTDoubleHeadLMEmotionRecognitionModel(OpenAIGPTPreTrainedModel): + def __init__(self, config, output_attentions=False): + super(OpenAIGPTDoubleHeadLMEmotionRecognitionModel, self).__init__(config) + self.transformer = OpenAIGPTModel(config, output_attentions=output_attentions) + self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config) + self.emotion_choice_head = OpenAIGPTEmotionChoiceHead(config) + self.apply(self.init_weights) + + def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True): + """ Update input and output embeddings with new embedding matrice + Make sure we are sharing the embeddings + """ + self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens + self.transformer.set_num_special_tokens(num_special_tokens) + self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight, + predict_special_tokens=predict_special_tokens) + + def forward(self, input_ids, mc_token_ids, lm_labels=None, mc_labels=None, token_type_ids=None, + token_emotion_ids=None, position_ids=None): + hidden_states = self.transformer(input_ids, position_ids, token_type_ids, token_emotion_ids) + if self.transformer.output_attentions: + all_attentions, hidden_states = hidden_states + lm_logits = self.lm_head(hidden_states) + mc_logits = self.emotion_choice_head(hidden_states, mc_token_ids) + losses = [] + if lm_labels is not None: # when lm_labels is all -1 it means it's not the correct candidate which in turn means it's a negative example and we ignore it because ignore_index=-1 + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = lm_labels[..., 1:].contiguous() + loss_fct = CrossEntropyLoss(ignore_index=-1) + losses.append(loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))) + if mc_labels is not None: + loss_fct = CrossEntropyLoss(ignore_index=-1) + # loss_fct = CrossEntropyLoss() + losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))) + if losses: + return losses + if self.transformer.output_attentions: + return all_attentions, lm_logits, mc_logits + return lm_logits, mc_logits + + +class OpenAIGPTForEmotionDetection(OpenAIGPTPreTrainedModel): + def __init__(self, config, output_attentions=False): + super(OpenAIGPTForEmotionDetection, self).__init__(config) + self.transformer = OpenAIGPTModel(config, output_attentions=output_attentions) + self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config) + self.emotion_classification_head = OpenAIGPTEmotionHead(config) + self.apply(self.init_weights) + + def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True): + """ Update input and output embeddings with new embedding matrice + Make sure we are sharing the embeddings + """ + self.config.predict_special_tokens = 
self.transformer.config.predict_special_tokens = predict_special_tokens + self.transformer.set_num_special_tokens(num_special_tokens) + self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight, + predict_special_tokens=predict_special_tokens) + + def forward(self, input_ids, mc_token_ids, lm_labels=None, + mc_labels=None, token_type_ids=None, position_ids=None, token_emotion_ids=None): + hidden_states = self.transformer(input_ids, position_ids, token_type_ids, token_emotion_ids=token_emotion_ids) + if self.transformer.output_attentions: + all_attentions, hidden_states = hidden_states + lm_logits = self.lm_head(hidden_states) + mc_logits = self.emotion_classification_head(hidden_states, mc_token_ids) + losses = [] + if lm_labels is not None: # when lm_labels is all -1 it means it's not the correct candidate which in turn means it's a negative example and we ignore it because ignore_index=-1 + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = lm_labels[..., 1:].contiguous() + loss_fct = CrossEntropyLoss(ignore_index=-1) + losses.append(loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))) + if mc_labels is not None: + # loss_fct = CrossEntropyLoss(ignore_index=-1) + loss_fct = CrossEntropyLoss() + losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))) + if losses: + return losses + if self.transformer.output_attentions: + return all_attentions, lm_logits, mc_logits + return lm_logits, mc_logits + + +class OpenAIGPTMultiHeadModel(OpenAIGPTPreTrainedModel): + def __init__(self, config, output_attentions=False): + super(OpenAIGPTMultiHeadModel, self).__init__(config) + self.transformer = OpenAIGPTModel(config, output_attentions=output_attentions) + self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config) + self.emotion_choice_head = OpenAIGPTBatchedEmotionChoiceHead(config) + self.sentence_choice_head = OpenAIGPTMultipleChoiceHead(config) + self.apply(self.init_weights) + + def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True): + """ Update input and output embeddings with new embedding matrice + Make sure we are sharing the embeddings + """ + self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens + self.transformer.set_num_special_tokens(num_special_tokens) + self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight, + predict_special_tokens=predict_special_tokens) + + def forward(self, input_ids, ec_token_ids, sc_token_ids, lm_labels=None, + ec_labels=None, sc_labels=None, token_type_ids=None, + token_emotion_ids=None, token_action_ids=None, + position_ids=None): + + hidden_states = self.transformer(input_ids, position_ids, token_type_ids, token_emotion_ids) + if self.transformer.output_attentions: + all_attentions, hidden_states = hidden_states + lm_logits = self.lm_head(hidden_states) + emotion_logits = self.emotion_choice_head(hidden_states, ec_token_ids) + sentence_logits = self.sentence_choice_head(hidden_states, sc_token_ids) + losses = [] + if lm_labels is not None: # when lm_labels is all -1 it means it's not the correct candidate which in turn means it's a negative example and we ignore it because ignore_index=-1 + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = lm_labels[..., 1:].contiguous() + loss_fct = CrossEntropyLoss(ignore_index=-1) + losses.append(loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))) + if ec_labels is not None: + 
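# emotion classification loss over the num_emotions classes of the batched emotion head; ec_labels entries set to -1 are skipped via ignore_index=-1 +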
loss_fct = CrossEntropyLoss(ignore_index=-1) + losses.append(loss_fct(emotion_logits.view(-1, emotion_logits.size(-1)), ec_labels.view(-1))) + if sc_labels is not None: + loss_fct = CrossEntropyLoss(ignore_index=-1) + losses.append(loss_fct(sentence_logits.view(-1, sentence_logits.size(-1)), sc_labels.view(-1))) + if losses: + return losses + if self.transformer.output_attentions: + return all_attentions, lm_logits, emotion_logits, sentence_logits + return lm_logits, emotion_logits, sentence_logits diff --git a/pytorch_pretrained_bert/modeling_transfo_xl.py b/pytorch_pretrained_bert/modeling_transfo_xl.py new file mode 100644 index 0000000..e70a29a --- /dev/null +++ b/pytorch_pretrained_bert/modeling_transfo_xl.py @@ -0,0 +1,1392 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Transformer XL model. + Adapted from https://github.com/kimiyoung/transformer-xl. + In particular https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/mem_transformer.py +""" + +from __future__ import absolute_import, division, print_function, unicode_literals + +import os +import copy +import json +import math +import logging +import tarfile +import tempfile +import shutil +import collections +import sys +from io import open + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import CrossEntropyLoss +from torch.nn.parameter import Parameter + +from .modeling import BertLayerNorm as LayerNorm +from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax, sample_logits +from .file_utils import cached_path, CONFIG_NAME, WEIGHTS_NAME + +logger = logging.getLogger(__name__) + +PRETRAINED_MODEL_ARCHIVE_MAP = { + 'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-pytorch_model.bin", +} +PRETRAINED_CONFIG_ARCHIVE_MAP = { + 'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-config.json", +} + +TF_WEIGHTS_NAME = 'model.ckpt' + +def build_tf_to_pytorch_map(model, config): + """ A map of modules from TF to PyTorch. + This time I use a map to keep the PyTorch model as identical to the original PyTorch model as possible. 
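+ For example, the loop over the Transformer blocks below maps the TF variable + "transformer/layer_0/rel_attn/qkv/kernel" to the first block's dec_attn.qkv_net.weight parameter.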
+ """ + tf_to_pt_map = {} + + if hasattr(model, 'transformer'): + # We are loading in a TransfoXLLMHeadModel => we will load also the Adaptive Softmax + tf_to_pt_map.update({ + "transformer/adaptive_softmax/cutoff_0/cluster_W": model.crit.cluster_weight, + "transformer/adaptive_softmax/cutoff_0/cluster_b": model.crit.cluster_bias}) + for i, (out_l, proj_l, tie_proj) in enumerate(zip( + model.crit.out_layers, + model.crit.out_projs, + config.tie_projs)): + layer_str = "transformer/adaptive_softmax/cutoff_%d/" % i + if config.tie_weight: + tf_to_pt_map.update({ + layer_str + 'b': out_l.bias}) + else: + raise NotImplementedError + # I don't think this is implemented in the TF code + tf_to_pt_map.update({ + layer_str + 'lookup_table': out_l.weight, + layer_str + 'b': out_l.bias}) + if not tie_proj: + tf_to_pt_map.update({ + layer_str + 'proj': proj_l + }) + # Now load the rest of the transformer + model = model.transformer + + # Embeddings + for i, (embed_l, proj_l) in enumerate(zip(model.word_emb.emb_layers, model.word_emb.emb_projs)): + layer_str = "transformer/adaptive_embed/cutoff_%d/" % i + tf_to_pt_map.update({ + layer_str + 'lookup_table': embed_l.weight, + layer_str + 'proj_W': proj_l + }) + + # Transformer blocks + for i, b in enumerate(model.layers): + layer_str = "transformer/layer_%d/" % i + tf_to_pt_map.update({ + layer_str + "rel_attn/LayerNorm/gamma": b.dec_attn.layer_norm.weight, + layer_str + "rel_attn/LayerNorm/beta": b.dec_attn.layer_norm.bias, + layer_str + "rel_attn/o/kernel": b.dec_attn.o_net.weight, + layer_str + "rel_attn/qkv/kernel": b.dec_attn.qkv_net.weight, + layer_str + "rel_attn/r/kernel": b.dec_attn.r_net.weight, + layer_str + "ff/LayerNorm/gamma": b.pos_ff.layer_norm.weight, + layer_str + "ff/LayerNorm/beta": b.pos_ff.layer_norm.bias, + layer_str + "ff/layer_1/kernel": b.pos_ff.CoreNet[0].weight, + layer_str + "ff/layer_1/bias": b.pos_ff.CoreNet[0].bias, + layer_str + "ff/layer_2/kernel": b.pos_ff.CoreNet[3].weight, + layer_str + "ff/layer_2/bias": b.pos_ff.CoreNet[3].bias, + }) + + # Relative positioning biases + if config.untie_r: + r_r_list = [] + r_w_list = [] + for b in model.layers: + r_r_list.append(b.dec_attn.r_r_bias) + r_w_list.append(b.dec_attn.r_w_bias) + else: + r_r_list = [model.r_r_bias] + r_w_list = [model.r_w_bias] + tf_to_pt_map.update({ + 'transformer/r_r_bias': r_r_list, + 'transformer/r_w_bias': r_w_list}) + return tf_to_pt_map + +def load_tf_weights_in_transfo_xl(model, config, tf_path): + """ Load tf checkpoints in a pytorch model + """ + try: + import numpy as np + import tensorflow as tf + except ImportError: + print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. 
Please see " + "https://www.tensorflow.org/install/ for installation instructions.") + raise + # Build TF to PyTorch weights loading map + tf_to_pt_map = build_tf_to_pytorch_map(model, config) + + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + tf_weights = {} + for name, shape in init_vars: + print("Loading TF weight {} with shape {}".format(name, shape)) + array = tf.train.load_variable(tf_path, name) + tf_weights[name] = array + + for name, pointer in tf_to_pt_map.items(): + assert name in tf_weights + array = tf_weights[name] + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if 'kernel' in name or 'proj' in name: + array = np.transpose(array) + if ('r_r_bias' in name or 'r_w_bias' in name) and len(pointer) > 1: + # Here we will split the TF weigths + assert len(pointer) == array.shape[0] + for i, p_i in enumerate(pointer): + arr_i = array[i, ...] + try: + assert p_i.shape == arr_i.shape + except AssertionError as e: + e.args += (p_i.shape, arr_i.shape) + raise + print("Initialize PyTorch weight {} for layer {}".format(name, i)) + p_i.data = torch.from_numpy(arr_i) + else: + try: + assert pointer.shape == array.shape + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + print("Initialize PyTorch weight {}".format(name)) + pointer.data = torch.from_numpy(array) + tf_weights.pop(name, None) + tf_weights.pop(name + '/Adam', None) + tf_weights.pop(name + '/Adam_1', None) + + print("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys()))) + return model + + +class TransfoXLConfig(object): + """Configuration class to store the configuration of a `TransfoXLModel`. + """ + def __init__(self, + vocab_size_or_config_json_file=267735, + cutoffs=[20000, 40000, 200000], + d_model=1024, + d_embed=1024, + n_head=16, + d_head=64, + d_inner=4096, + div_val=4, + pre_lnorm=False, + n_layer=18, + tgt_len=128, + ext_len=0, + mem_len=1600, + clamp_len=1000, + same_length=True, + proj_share_all_but_first=True, + attn_type=0, + sample_softmax=-1, + adaptive=True, + tie_weight=True, + dropout=0.1, + dropatt=0.0, + untie_r=True, + init="normal", + init_range=0.01, + proj_init_std=0.01, + init_std=0.02): + """Constructs TransfoXLConfig. + + Args: + vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file. + cutoffs: cutoffs for the adaptive softmax + d_model: Dimensionality of the model's hidden states. + d_embed: Dimensionality of the embeddings + d_head: Dimensionality of the model's heads. + div_val: divident value for adapative input and softmax + pre_lnorm: apply LayerNorm to the input instead of the output + d_inner: Inner dimension in FF + n_layer: Number of hidden layers in the Transformer encoder. + n_head: Number of attention heads for each attention layer in + the Transformer encoder. + tgt_len: number of tokens to predict + ext_len: length of the extended context + mem_len: length of the retained previous heads + same_length: use the same attn length for all tokens + proj_share_all_but_first: True to share all but first projs, False not to share. + attn_type: attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al. 
+ clamp_len: use the same pos embeddings after clamp_len + sample_softmax: number of samples in sampled softmax + adaptive: use adaptive softmax + tie_weight: tie the word embedding and softmax weights + dropout: The dropout probabilitiy for all fully connected + layers in the embeddings, encoder, and pooler. + dropatt: The dropout ratio for the attention probabilities. + untie_r: untie relative position biases + embd_pdrop: The dropout ratio for the embeddings. + init: parameter initializer to use + init_range: parameters initialized by U(-init_range, init_range). + proj_init_std: parameters initialized by N(0, init_std) + init_std: parameters initialized by N(0, init_std) + """ + if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 + and isinstance(vocab_size_or_config_json_file, unicode)): + with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif isinstance(vocab_size_or_config_json_file, int): + self.n_token = vocab_size_or_config_json_file + self.cutoffs = [] + self.cutoffs.extend(cutoffs) + self.tie_weight = tie_weight + if proj_share_all_but_first: + self.tie_projs = [False] + [True] * len(self.cutoffs) + else: + self.tie_projs = [False] + [False] * len(self.cutoffs) + self.d_model = d_model + self.d_embed = d_embed + self.d_head = d_head + self.d_inner = d_inner + self.div_val = div_val + self.pre_lnorm = pre_lnorm + self.n_layer = n_layer + self.n_head = n_head + self.tgt_len = tgt_len + self.ext_len = ext_len + self.mem_len = mem_len + self.same_length = same_length + self.attn_type = attn_type + self.clamp_len = clamp_len + self.sample_softmax = sample_softmax + self.adaptive = adaptive + self.dropout = dropout + self.dropatt = dropatt + self.untie_r = untie_r + self.init = init + self.init_range = init_range + self.proj_init_std = proj_init_std + self.init_std = init_std + else: + raise ValueError("First argument must be either a vocabulary size (int)" + "or the path to a pretrained model config file (str)") + + @classmethod + def from_dict(cls, json_object): + """Constructs a `TransfoXLConfig` from a Python dictionary of parameters.""" + config = TransfoXLConfig(vocab_size_or_config_json_file=-1) + for key, value in json_object.items(): + config.__dict__[key] = value + return config + + @classmethod + def from_json_file(cls, json_file): + """Constructs a `TransfoXLConfig` from a json file of parameters.""" + with open(json_file, "r", encoding='utf-8') as reader: + text = reader.read() + return cls.from_dict(json.loads(text)) + + def __repr__(self): + return str(self.to_json_string()) + + def to_dict(self): + """Serializes this instance to a Python dictionary.""" + output = copy.deepcopy(self.__dict__) + return output + + def to_json_string(self): + """Serializes this instance to a JSON string.""" + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" + + def to_json_file(self, json_file_path): + """ Save this instance to a json file.""" + with open(json_file_path, "w", encoding='utf-8') as writer: + writer.write(self.to_json_string()) + + +class PositionalEmbedding(nn.Module): + def __init__(self, demb): + super(PositionalEmbedding, self).__init__() + + self.demb = demb + + inv_freq = 1 / (10000 ** (torch.arange(0.0, demb, 2.0) / demb)) + self.register_buffer('inv_freq', inv_freq) + + def forward(self, pos_seq, bsz=None): + sinusoid_inp = torch.ger(pos_seq, self.inv_freq) + pos_emb = 
torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1) + + if bsz is not None: + return pos_emb[:,None,:].expand(-1, bsz, -1) + else: + return pos_emb[:,None,:] + + +class PositionwiseFF(nn.Module): + def __init__(self, d_model, d_inner, dropout, pre_lnorm=False): + super(PositionwiseFF, self).__init__() + + self.d_model = d_model + self.d_inner = d_inner + self.dropout = dropout + + self.CoreNet = nn.Sequential( + nn.Linear(d_model, d_inner), nn.ReLU(inplace=True), + nn.Dropout(dropout), + nn.Linear(d_inner, d_model), + nn.Dropout(dropout), + ) + + self.layer_norm = LayerNorm(d_model) + + self.pre_lnorm = pre_lnorm + + def forward(self, inp): + if self.pre_lnorm: + ##### layer normalization + positionwise feed-forward + core_out = self.CoreNet(self.layer_norm(inp)) + + ##### residual connection + output = core_out + inp + else: + ##### positionwise feed-forward + core_out = self.CoreNet(inp) + + ##### residual connection + layer normalization + output = self.layer_norm(inp + core_out) + + return output + +class MultiHeadAttn(nn.Module): + def __init__(self, n_head, d_model, d_head, dropout, dropatt=0, + pre_lnorm=False, r_r_bias=None, r_w_bias=None): + super(MultiHeadAttn, self).__init__() + + self.n_head = n_head + self.d_model = d_model + self.d_head = d_head + self.dropout = dropout + + self.q_net = nn.Linear(d_model, n_head * d_head, bias=False) + self.kv_net = nn.Linear(d_model, 2 * n_head * d_head, bias=False) + + self.drop = nn.Dropout(dropout) + self.dropatt = nn.Dropout(dropatt) + self.o_net = nn.Linear(n_head * d_head, d_model, bias=False) + + self.layer_norm = LayerNorm(d_model) + + self.scale = 1 / (d_head ** 0.5) + + self.pre_lnorm = pre_lnorm + + if r_r_bias is None or r_w_bias is None: # Biases are not shared + self.r_r_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head)) + self.r_w_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head)) + else: + self.r_r_bias = r_r_bias + self.r_w_bias = r_w_bias + + def forward(self, h, attn_mask=None, mems=None): + ##### multihead attention + # [hlen x bsz x n_head x d_head] + + if mems is not None: + c = torch.cat([mems, h], 0) + else: + c = h + + if self.pre_lnorm: + ##### layer normalization + c = self.layer_norm(c) + + head_q = self.q_net(h) + head_k, head_v = torch.chunk(self.kv_net(c), 2, -1) + + head_q = head_q.view(h.size(0), h.size(1), self.n_head, self.d_head) + head_k = head_k.view(c.size(0), c.size(1), self.n_head, self.d_head) + head_v = head_v.view(c.size(0), c.size(1), self.n_head, self.d_head) + + # [qlen x klen x bsz x n_head] + attn_score = torch.einsum('ibnd,jbnd->ijbn', (head_q, head_k)) + attn_score.mul_(self.scale) + if attn_mask is not None and attn_mask.any().item(): + if attn_mask.dim() == 2: + attn_score.masked_fill_(attn_mask[None,:,:,None], -float('inf')) + elif attn_mask.dim() == 3: + attn_score.masked_fill_(attn_mask[:,:,:,None], -float('inf')) + + # [qlen x klen x bsz x n_head] + attn_prob = F.softmax(attn_score, dim=1) + attn_prob = self.dropatt(attn_prob) + + # [qlen x klen x bsz x n_head] + [klen x bsz x n_head x d_head] -> [qlen x bsz x n_head x d_head] + attn_vec = torch.einsum('ijbn,jbnd->ibnd', (attn_prob, head_v)) + attn_vec = attn_vec.contiguous().view( + attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head) + + ##### linear projection + attn_out = self.o_net(attn_vec) + attn_out = self.drop(attn_out) + + if self.pre_lnorm: + ##### residual connection + output = h + attn_out + else: + ##### residual connection + layer normalization + output = self.layer_norm(h + attn_out) 
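+        # Note: output keeps the shape of the input h, i.e. [hlen x bsz x d_model];
+        # the memory `mems` only extends the keys/values, not the returned sequence.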
+ + return output + +class RelMultiHeadAttn(nn.Module): + def __init__(self, n_head, d_model, d_head, dropout, dropatt=0, + tgt_len=None, ext_len=None, mem_len=None, pre_lnorm=False, + r_r_bias=None, r_w_bias=None): + super(RelMultiHeadAttn, self).__init__() + + self.n_head = n_head + self.d_model = d_model + self.d_head = d_head + self.dropout = dropout + + self.qkv_net = nn.Linear(d_model, 3 * n_head * d_head, bias=False) + + self.drop = nn.Dropout(dropout) + self.dropatt = nn.Dropout(dropatt) + self.o_net = nn.Linear(n_head * d_head, d_model, bias=False) + + self.layer_norm = LayerNorm(d_model) + + self.scale = 1 / (d_head ** 0.5) + + self.pre_lnorm = pre_lnorm + + if r_r_bias is None or r_w_bias is None: # Biases are not shared + self.r_r_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head)) + self.r_w_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head)) + else: + self.r_r_bias = r_r_bias + self.r_w_bias = r_w_bias + + def _parallelogram_mask(self, h, w, left=False): + mask = torch.ones((h, w)).byte() + m = min(h, w) + mask[:m,:m] = torch.triu(mask[:m,:m]) + mask[-m:,-m:] = torch.tril(mask[-m:,-m:]) + + if left: + return mask + else: + return mask.flip(0) + + def _shift(self, x, qlen, klen, mask, left=False): + if qlen > 1: + zero_pad = torch.zeros((x.size(0), qlen-1, x.size(2), x.size(3)), + device=x.device, dtype=x.dtype) + else: + zero_pad = torch.zeros(0, device=x.device, dtype=x.dtype) + + if left: + mask = mask.flip(1) + x_padded = torch.cat([zero_pad, x], dim=1).expand(qlen, -1, -1, -1) + else: + x_padded = torch.cat([x, zero_pad], dim=1).expand(qlen, -1, -1, -1) + + x = x_padded.masked_select(mask[:,:,None,None]) \ + .view(qlen, klen, x.size(2), x.size(3)) + + return x + + def _rel_shift(self, x, zero_triu=False): + zero_pad_shape = (x.size(0), 1) + x.size()[2:] + zero_pad = torch.zeros(zero_pad_shape, device=x.device, dtype=x.dtype) + x_padded = torch.cat([zero_pad, x], dim=1) + + x_padded_shape = (x.size(1) + 1, x.size(0)) + x.size()[2:] + x_padded = x_padded.view(*x_padded_shape) + + x = x_padded[1:].view_as(x) + + if zero_triu: + ones = torch.ones((x.size(0), x.size(1))) + x = x * torch.tril(ones, x.size(1) - x.size(0))[:,:,None,None] + + return x + + def forward(self, w, r, attn_mask=None, mems=None): + raise NotImplementedError + +class RelPartialLearnableMultiHeadAttn(RelMultiHeadAttn): + def __init__(self, *args, **kwargs): + super(RelPartialLearnableMultiHeadAttn, self).__init__(*args, **kwargs) + + self.r_net = nn.Linear(self.d_model, self.n_head * self.d_head, bias=False) + + def forward(self, w, r, attn_mask=None, mems=None): + qlen, rlen, bsz = w.size(0), r.size(0), w.size(1) + + if mems is not None: + cat = torch.cat([mems, w], 0) + if self.pre_lnorm: + w_heads = self.qkv_net(self.layer_norm(cat)) + else: + w_heads = self.qkv_net(cat) + r_head_k = self.r_net(r) + + w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1) + w_head_q = w_head_q[-qlen:] + else: + if self.pre_lnorm: + w_heads = self.qkv_net(self.layer_norm(w)) + else: + w_heads = self.qkv_net(w) + r_head_k = self.r_net(r) + + w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1) + + klen = w_head_k.size(0) + + w_head_q = w_head_q.view(qlen, bsz, self.n_head, self.d_head) # qlen x bsz x n_head x d_head + w_head_k = w_head_k.view(klen, bsz, self.n_head, self.d_head) # qlen x bsz x n_head x d_head + w_head_v = w_head_v.view(klen, bsz, self.n_head, self.d_head) # qlen x bsz x n_head x d_head + + r_head_k = r_head_k.view(rlen, self.n_head, self.d_head) # qlen x n_head x d_head + 
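+        # The score below follows the Transformer-XL relative-attention decomposition:
+        # AC gathers the content-based terms (query plus r_w_bias matched against the keys),
+        # BD gathers the position-based terms (query plus r_r_bias matched against the
+        # relative position embeddings r); _rel_shift then realigns BD so that entry
+        # (i, j) corresponds to the relative distance between query i and key j.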
+ #### compute attention score + rw_head_q = w_head_q + self.r_w_bias # qlen x bsz x n_head x d_head + AC = torch.einsum('ibnd,jbnd->ijbn', (rw_head_q, w_head_k)) # qlen x klen x bsz x n_head + + rr_head_q = w_head_q + self.r_r_bias + BD = torch.einsum('ibnd,jnd->ijbn', (rr_head_q, r_head_k)) # qlen x klen x bsz x n_head + BD = self._rel_shift(BD) + + # [qlen x klen x bsz x n_head] + attn_score = AC + BD + attn_score.mul_(self.scale) + + #### compute attention probability + if attn_mask is not None and attn_mask.any().item(): + if attn_mask.dim() == 2: + attn_score = attn_score.float().masked_fill( + attn_mask[None,:,:,None], -1e30).type_as(attn_score) + elif attn_mask.dim() == 3: + attn_score = attn_score.float().masked_fill( + attn_mask[:,:,:,None], -1e30).type_as(attn_score) + + # [qlen x klen x bsz x n_head] + attn_prob = F.softmax(attn_score, dim=1) + attn_prob = self.dropatt(attn_prob) + + #### compute attention vector + attn_vec = torch.einsum('ijbn,jbnd->ibnd', (attn_prob, w_head_v)) + + # [qlen x bsz x n_head x d_head] + attn_vec = attn_vec.contiguous().view( + attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head) + + ##### linear projection + attn_out = self.o_net(attn_vec) + attn_out = self.drop(attn_out) + + if self.pre_lnorm: + ##### residual connection + output = w + attn_out + else: + ##### residual connection + layer normalization + output = self.layer_norm(w + attn_out) + + return output + +class RelLearnableMultiHeadAttn(RelMultiHeadAttn): + def __init__(self, *args, **kwargs): + super(RelLearnableMultiHeadAttn, self).__init__(*args, **kwargs) + + def forward(self, w, r_emb, r_w_bias, r_bias, attn_mask=None, mems=None): + # r_emb: [klen, n_head, d_head], used for term B + # r_w_bias: [n_head, d_head], used for term C + # r_bias: [klen, n_head], used for term D + + qlen, bsz = w.size(0), w.size(1) + + if mems is not None: + cat = torch.cat([mems, w], 0) + if self.pre_lnorm: + w_heads = self.qkv_net(self.layer_norm(cat)) + else: + w_heads = self.qkv_net(cat) + w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1) + + w_head_q = w_head_q[-qlen:] + else: + if self.pre_lnorm: + w_heads = self.qkv_net(self.layer_norm(w)) + else: + w_heads = self.qkv_net(w) + w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1) + + klen = w_head_k.size(0) + + w_head_q = w_head_q.view(qlen, bsz, self.n_head, self.d_head) + w_head_k = w_head_k.view(klen, bsz, self.n_head, self.d_head) + w_head_v = w_head_v.view(klen, bsz, self.n_head, self.d_head) + + if klen > r_emb.size(0): + r_emb_pad = r_emb[0:1].expand(klen-r_emb.size(0), -1, -1) + r_emb = torch.cat([r_emb_pad, r_emb], 0) + r_bias_pad = r_bias[0:1].expand(klen-r_bias.size(0), -1) + r_bias = torch.cat([r_bias_pad, r_bias], 0) + else: + r_emb = r_emb[-klen:] + r_bias = r_bias[-klen:] + + #### compute attention score + rw_head_q = w_head_q + r_w_bias[None] # qlen x bsz x n_head x d_head + + AC = torch.einsum('ibnd,jbnd->ijbn', (rw_head_q, w_head_k)) # qlen x klen x bsz x n_head + B_ = torch.einsum('ibnd,jnd->ijbn', (w_head_q, r_emb)) # qlen x klen x bsz x n_head + D_ = r_bias[None, :, None] # 1 x klen x 1 x n_head + BD = self._rel_shift(B_ + D_) + + # [qlen x klen x bsz x n_head] + attn_score = AC + BD + attn_score.mul_(self.scale) + + #### compute attention probability + if attn_mask is not None and attn_mask.any().item(): + if attn_mask.dim() == 2: + attn_score.masked_fill_(attn_mask[None,:,:,None], -float('inf')) + elif attn_mask.dim() == 3: + attn_score.masked_fill_(attn_mask[:,:,:,None], -float('inf')) + + # 
[qlen x klen x bsz x n_head] + attn_prob = F.softmax(attn_score, dim=1) + attn_prob = self.dropatt(attn_prob) + + #### compute attention vector + attn_vec = torch.einsum('ijbn,jbnd->ibnd', (attn_prob, w_head_v)) + + # [qlen x bsz x n_head x d_head] + attn_vec = attn_vec.contiguous().view( + attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head) + + ##### linear projection + attn_out = self.o_net(attn_vec) + attn_out = self.drop(attn_out) + + if self.pre_lnorm: + ##### residual connection + output = w + attn_out + else: + ##### residual connection + layer normalization + output = self.layer_norm(w + attn_out) + + return output + +class DecoderLayer(nn.Module): + def __init__(self, n_head, d_model, d_head, d_inner, dropout, **kwargs): + super(DecoderLayer, self).__init__() + + self.dec_attn = MultiHeadAttn(n_head, d_model, d_head, dropout, **kwargs) + self.pos_ff = PositionwiseFF(d_model, d_inner, dropout, + pre_lnorm=kwargs.get('pre_lnorm')) + + def forward(self, dec_inp, dec_attn_mask=None, mems=None): + + output = self.dec_attn(dec_inp, attn_mask=dec_attn_mask, + mems=mems) + output = self.pos_ff(output) + + return output + +class RelLearnableDecoderLayer(nn.Module): + def __init__(self, n_head, d_model, d_head, d_inner, dropout, + **kwargs): + super(RelLearnableDecoderLayer, self).__init__() + + self.dec_attn = RelLearnableMultiHeadAttn(n_head, d_model, d_head, dropout, + **kwargs) + self.pos_ff = PositionwiseFF(d_model, d_inner, dropout, + pre_lnorm=kwargs.get('pre_lnorm')) + + def forward(self, dec_inp, r_emb, r_w_bias, r_bias, dec_attn_mask=None, mems=None): + + output = self.dec_attn(dec_inp, r_emb, r_w_bias, r_bias, + attn_mask=dec_attn_mask, + mems=mems) + output = self.pos_ff(output) + + return output + +class RelPartialLearnableDecoderLayer(nn.Module): + def __init__(self, n_head, d_model, d_head, d_inner, dropout, + **kwargs): + super(RelPartialLearnableDecoderLayer, self).__init__() + + self.dec_attn = RelPartialLearnableMultiHeadAttn(n_head, d_model, + d_head, dropout, **kwargs) + self.pos_ff = PositionwiseFF(d_model, d_inner, dropout, + pre_lnorm=kwargs.get('pre_lnorm')) + + def forward(self, dec_inp, r, dec_attn_mask=None, mems=None): + + output = self.dec_attn(dec_inp, r, + attn_mask=dec_attn_mask, + mems=mems) + output = self.pos_ff(output) + + return output + + +class AdaptiveEmbedding(nn.Module): + def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, + sample_softmax=False): + super(AdaptiveEmbedding, self).__init__() + + self.n_token = n_token + self.d_embed = d_embed + + self.cutoffs = cutoffs + [n_token] + self.div_val = div_val + self.d_proj = d_proj + + self.emb_scale = d_proj ** 0.5 + + self.cutoff_ends = [0] + self.cutoffs + + self.emb_layers = nn.ModuleList() + self.emb_projs = nn.ParameterList() + if div_val == 1: + self.emb_layers.append( + nn.Embedding(n_token, d_embed, sparse=sample_softmax>0) + ) + if d_proj != d_embed: + self.emb_projs.append(nn.Parameter(torch.Tensor(d_proj, d_embed))) + else: + for i in range(len(self.cutoffs)): + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1] + d_emb_i = d_embed // (div_val ** i) + self.emb_layers.append(nn.Embedding(r_idx-l_idx, d_emb_i)) + self.emb_projs.append(nn.Parameter(torch.Tensor(d_proj, d_emb_i))) + + def forward(self, inp): + if self.div_val == 1: + embed = self.emb_layers[0](inp) + if self.d_proj != self.d_embed: + embed = F.linear(embed, self.emb_projs[0]) + else: + param = next(self.parameters()) + inp_flat = inp.view(-1) + emb_flat = torch.zeros([inp_flat.size(0), 
self.d_proj], + dtype=param.dtype, device=param.device) + for i in range(len(self.cutoffs)): + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] + + mask_i = (inp_flat >= l_idx) & (inp_flat < r_idx) + indices_i = mask_i.nonzero().squeeze() + + if indices_i.numel() == 0: + continue + + inp_i = inp_flat.index_select(0, indices_i) - l_idx + emb_i = self.emb_layers[i](inp_i) + emb_i = F.linear(emb_i, self.emb_projs[i]) + + emb_flat.index_copy_(0, indices_i, emb_i) + + embed_shape = inp.size() + (self.d_proj,) + embed = emb_flat.view(embed_shape) + + embed.mul_(self.emb_scale) + + return embed + + +class TransfoXLPreTrainedModel(nn.Module): + """ An abstract class to handle weights initialization and + a simple interface for dowloading and loading pretrained models. + """ + def __init__(self, config, *inputs, **kwargs): + super(TransfoXLPreTrainedModel, self).__init__() + if not isinstance(config, TransfoXLConfig): + raise ValueError( + "Parameter config in `{}(config)` should be an instance of class `TransfoXLConfig`. " + "To create a model from a pretrained model use " + "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( + self.__class__.__name__, self.__class__.__name__ + )) + self.config = config + + def init_weight(self, weight): + if self.config.init == 'uniform': + nn.init.uniform_(weight, -self.config.init_range, self.config.init_range) + elif self.config.init == 'normal': + nn.init.normal_(weight, 0.0, self.config.init_std) + + def init_bias(self, bias): + nn.init.constant_(bias, 0.0) + + def init_weights(self, m): + """ Initialize the weights. + """ + classname = m.__class__.__name__ + if classname.find('Linear') != -1: + if hasattr(m, 'weight') and m.weight is not None: + self.init_weight(m.weight) + if hasattr(m, 'bias') and m.bias is not None: + self.init_bias(m.bias) + elif classname.find('AdaptiveEmbedding') != -1: + if hasattr(m, 'emb_projs'): + for i in range(len(m.emb_projs)): + if m.emb_projs[i] is not None: + nn.init.normal_(m.emb_projs[i], 0.0, self.config.proj_init_std) + elif classname.find('Embedding') != -1: + if hasattr(m, 'weight'): + self.init_weight(m.weight) + elif classname.find('ProjectedAdaptiveLogSoftmax') != -1: + if hasattr(m, 'cluster_weight') and m.cluster_weight is not None: + self.init_weight(m.cluster_weight) + if hasattr(m, 'cluster_bias') and m.cluster_bias is not None: + self.init_bias(m.cluster_bias) + if hasattr(m, 'out_projs'): + for i in range(len(m.out_projs)): + if m.out_projs[i] is not None: + nn.init.normal_(m.out_projs[i], 0.0, self.config.proj_init_std) + elif classname.find('LayerNorm') != -1: + if hasattr(m, 'weight'): + nn.init.normal_(m.weight, 1.0, self.config.init_std) + if hasattr(m, 'bias') and m.bias is not None: + self.init_bias(m.bias) + elif classname.find('TransformerLM') != -1: + if hasattr(m, 'r_emb'): + self.init_weight(m.r_emb) + if hasattr(m, 'r_w_bias'): + self.init_weight(m.r_w_bias) + if hasattr(m, 'r_r_bias'): + self.init_weight(m.r_r_bias) + if hasattr(m, 'r_bias'): + self.init_bias(m.r_bias) + + def set_num_special_tokens(self, num_special_tokens): + pass + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): + """ + Instantiate a TransfoXLPreTrainedModel from a pre-trained model file or a pytorch state dict. + Download and cache the pre-trained model file if needed. + + Params: + pretrained_model_name_or_path: either: + - a str with the name of a pre-trained model to load selected in the list of: + . 
`transfo-xl-wt103` + - a path or url to a pretrained model archive containing: + . `transfo_xl_config.json` a configuration file for the model + . `pytorch_model.bin` a PyTorch dump of a TransfoXLModel instance + - a path or url to a pretrained model archive containing: + . `transfo_xl_config.json` a configuration file for the model + . `model.chkpt` a TensorFlow checkpoint + from_tf: should we load the weights from a locally saved TensorFlow checkpoint + cache_dir: an optional path to a folder in which the pre-trained models will be cached. + state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of pre-trained models + *inputs, **kwargs: additional input for the specific TransformerXL class + """ + state_dict = kwargs.get('state_dict', None) + kwargs.pop('state_dict', None) + cache_dir = kwargs.get('cache_dir', None) + kwargs.pop('cache_dir', None) + from_tf = kwargs.get('from_tf', False) + kwargs.pop('from_tf', None) + + if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP: + archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path] + config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path] + else: + archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) + config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME) + # redirect to the cache, if necessary + try: + resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir) + resolved_config_file = cached_path(config_file, cache_dir=cache_dir) + except EnvironmentError: + logger.error( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url but couldn't find files {} and {} " + "at this path or url.".format( + pretrained_model_name_or_path, + ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), + pretrained_model_name_or_path, + archive_file, config_file)) + return None + if resolved_archive_file == archive_file and resolved_config_file == config_file: + logger.info("loading weights file {}".format(archive_file)) + logger.info("loading configuration file {}".format(config_file)) + else: + logger.info("loading weights file {} from cache at {}".format( + archive_file, resolved_archive_file)) + logger.info("loading configuration file {} from cache at {}".format( + config_file, resolved_config_file)) + # Load config + config = TransfoXLConfig.from_json_file(resolved_config_file) + logger.info("Model config {}".format(config)) + # Instantiate model. + model = cls(config, *inputs, **kwargs) + if state_dict is None and not from_tf: + state_dict = torch.load(resolved_archive_file, map_location='cpu') + if from_tf: + # Directly load from a TensorFlow checkpoint + return load_tf_weights_in_transfo_xl(model, config, pretrained_model_name_or_path) + + missing_keys = [] + unexpected_keys = [] + error_msgs = [] + # copy state_dict so _load_from_state_dict can modify it + metadata = getattr(state_dict, '_metadata', None) + state_dict = state_dict.copy() + if metadata is not None: + state_dict._metadata = metadata + + def load(module, prefix=''): + local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) + module._load_from_state_dict( + state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs) + for name, child in module._modules.items(): + if child is not None: + load(child, prefix + name + '.') + + start_prefix = '' + if not hasattr(model, 'transformer') and any(s.startswith('transformer.') for s in state_dict.keys()): + start_prefix = 'transformer.' 
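+        # The checkpoint may have been saved from a model that wraps the transformer under a
+        # 'transformer.' attribute (e.g. the LM head model); when loading into a bare
+        # TransfoXLModel, look the weights up under that prefix instead.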
+ load(model, prefix=start_prefix) + + if len(missing_keys) > 0: + logger.info("Weights of {} not initialized from pretrained model: {}".format( + model.__class__.__name__, missing_keys)) + if len(unexpected_keys) > 0: + logger.info("Weights from pretrained model not used in {}: {}".format( + model.__class__.__name__, unexpected_keys)) + if len(error_msgs) > 0: + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + model.__class__.__name__, "\n\t".join(error_msgs))) + # Make sure we are still sharing the input and output embeddings + if hasattr(model, 'tie_weights'): + model.tie_weights() + return model + + +class TransfoXLModel(TransfoXLPreTrainedModel): + """Transformer XL model ("Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context"). + + Transformer XL use a relative positioning (with sinusiodal patterns) and adaptive softmax inputs which means that: + - you don't need to specify positioning embeddings indices + - the tokens in the vocabulary have to be sorted to decreasing frequency. + + Params: + config: a TransfoXLConfig class instance with the configuration to build a new model + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the token indices selected in the range [0, self.config.n_token[ + `mems`: optional memomry of hidden states from previous forward passes + as a list (num layers) of hidden states at the entry of each layer + each hidden states has shape [self.config.mem_len, bsz, self.config.d_model] + Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `target` + Outputs: + A tuple of (last_hidden_state, new_mems) + `last_hidden_state`: the encoded-hidden-states at the top of the model + as a torch.FloatTensor of size [batch_size, sequence_length, self.config.d_model] + `new_mems`: list (num layers) of updated mem states at the entry of each layer + each mem state is a torch.FloatTensor of size [self.config.mem_len, batch_size, self.config.d_model] + Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `target` + + Example usage: + ```python + # Already been converted into BPE token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_ids_next = torch.LongTensor([[53, 21, 1], [64, 23, 100]]) + + config = TransfoXLConfig() + + model = TransfoXLModel(config) + last_hidden_state, new_mems = model(input_ids) + + # Another time on input_ids_next using the memory: + last_hidden_state, new_mems = model(input_ids_next, new_mems) + ``` + """ + def __init__(self, config): + super(TransfoXLModel, self).__init__(config) + self.n_token = config.n_token + + self.d_embed = config.d_embed + self.d_model = config.d_model + self.n_head = config.n_head + self.d_head = config.d_head + + self.word_emb = AdaptiveEmbedding(config.n_token, config.d_embed, config.d_model, config.cutoffs, + div_val=config.div_val) + + self.drop = nn.Dropout(config.dropout) + + self.n_layer = config.n_layer + + self.tgt_len = config.tgt_len + self.mem_len = config.mem_len + self.ext_len = config.ext_len + self.max_klen = config.tgt_len + config.ext_len + config.mem_len + + self.attn_type = config.attn_type + + if not config.untie_r: + self.r_w_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head)) + self.r_r_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head)) + + self.layers = nn.ModuleList() + if config.attn_type == 0: # the default attention + for i in range(config.n_layer): + self.layers.append( + 
RelPartialLearnableDecoderLayer( + config.n_head, config.d_model, config.d_head, config.d_inner, config.dropout, + tgt_len=config.tgt_len, ext_len=config.ext_len, mem_len=config.mem_len, + dropatt=config.dropatt, pre_lnorm=config.pre_lnorm, + r_w_bias=None if config.untie_r else self.r_w_bias, + r_r_bias=None if config.untie_r else self.r_r_bias) + ) + elif config.attn_type == 1: # learnable embeddings + for i in range(config.n_layer): + self.layers.append( + RelLearnableDecoderLayer( + config.n_head, config.d_model, config.d_head, config.d_inner, config.dropout, + tgt_len=config.tgt_len, ext_len=config.ext_len, mem_len=config.mem_len, + dropatt=config.dropatt, pre_lnorm=config.pre_lnorm, + r_w_bias=None if config.untie_r else self.r_w_bias, + r_r_bias=None if config.untie_r else self.r_r_bias) + ) + elif config.attn_type in [2, 3]: # absolute embeddings + for i in range(config.n_layer): + self.layers.append( + DecoderLayer( + config.n_head, config.d_model, config.d_head, config.d_inner, config.dropout, + dropatt=config.dropatt, pre_lnorm=config.pre_lnorm, + r_w_bias=None if config.untie_r else self.r_w_bias, + r_r_bias=None if config.untie_r else self.r_r_bias) + ) + + self.same_length = config.same_length + self.clamp_len = config.clamp_len + + if self.attn_type == 0: # default attention + self.pos_emb = PositionalEmbedding(self.d_model) + elif self.attn_type == 1: # learnable + self.r_emb = nn.Parameter(torch.Tensor( + self.n_layer, self.max_klen, self.n_head, self.d_head)) + self.r_bias = nn.Parameter(torch.Tensor( + self.n_layer, self.max_klen, self.n_head)) + elif self.attn_type == 2: # absolute standard + self.pos_emb = PositionalEmbedding(self.d_model) + elif self.attn_type == 3: # absolute deeper SA + self.r_emb = nn.Parameter(torch.Tensor( + self.n_layer, self.max_klen, self.n_head, self.d_head)) + self.apply(self.init_weights) + + def backward_compatible(self): + self.sample_softmax = -1 + + + def reset_length(self, tgt_len, ext_len, mem_len): + self.tgt_len = tgt_len + self.mem_len = mem_len + self.ext_len = ext_len + + def init_mems(self, data): + if self.mem_len > 0: + mems = [] + param = next(self.parameters()) + for i in range(self.n_layer): + empty = torch.zeros(self.mem_len, data.size(1), self.config.d_model, + dtype=param.dtype, device=param.device) + mems.append(empty) + + return mems + else: + return None + + def _update_mems(self, hids, mems, qlen, mlen): + # does not deal with None + if mems is None: return None + + # mems is not None + assert len(hids) == len(mems), 'len(hids) != len(mems)' + + # There are `mlen + qlen` steps that can be cached into mems + # For the next step, the last `ext_len` of the `qlen` tokens + # will be used as the extended context. Hence, we only cache + # the tokens from `mlen + qlen - self.ext_len - self.mem_len` + # to `mlen + qlen - self.ext_len`. 
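+        # Example (assuming ext_len=0 and mem_len=1600): with 1600 cached steps plus a new
+        # 128-token segment, end_idx = 1728 and beg_idx = 128, so the oldest 128 cached
+        # states are dropped and the most recent 1600 are kept.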
+ with torch.no_grad(): + new_mems = [] + end_idx = mlen + max(0, qlen - 0 - self.ext_len) + beg_idx = max(0, end_idx - self.mem_len) + for i in range(len(hids)): + + cat = torch.cat([mems[i], hids[i]], dim=0) + new_mems.append(cat[beg_idx:end_idx].detach()) + + return new_mems + + def _forward(self, dec_inp, mems=None): + qlen, bsz = dec_inp.size() + + word_emb = self.word_emb(dec_inp) + + mlen = mems[0].size(0) if mems is not None else 0 + klen = mlen + qlen + if self.same_length: + all_ones = word_emb.new_ones(qlen, klen) + mask_len = klen - self.mem_len + if mask_len > 0: + mask_shift_len = qlen - mask_len + else: + mask_shift_len = qlen + dec_attn_mask = (torch.triu(all_ones, 1+mlen) + + torch.tril(all_ones, -mask_shift_len)).byte()[:, :, None] # -1 + else: + dec_attn_mask = torch.triu( + word_emb.new_ones(qlen, klen), diagonal=1+mlen).byte()[:,:,None] + + hids = [] + if self.attn_type == 0: # default + pos_seq = torch.arange(klen-1, -1, -1.0, device=word_emb.device, + dtype=word_emb.dtype) + if self.clamp_len > 0: + pos_seq.clamp_(max=self.clamp_len) + pos_emb = self.pos_emb(pos_seq) + + core_out = self.drop(word_emb) + pos_emb = self.drop(pos_emb) + + for i, layer in enumerate(self.layers): + hids.append(core_out) + mems_i = None if mems is None else mems[i] + core_out = layer(core_out, pos_emb, dec_attn_mask=dec_attn_mask, mems=mems_i) + elif self.attn_type == 1: # learnable + core_out = self.drop(word_emb) + for i, layer in enumerate(self.layers): + hids.append(core_out) + if self.clamp_len > 0: + r_emb = self.r_emb[i][-self.clamp_len :] + r_bias = self.r_bias[i][-self.clamp_len :] + else: + r_emb, r_bias = self.r_emb[i], self.r_bias[i] + + mems_i = None if mems is None else mems[i] + core_out = layer(core_out, r_emb, self.r_w_bias[i], + r_bias, dec_attn_mask=dec_attn_mask, mems=mems_i) + elif self.attn_type == 2: # absolute + pos_seq = torch.arange(klen - 1, -1, -1.0, device=word_emb.device, + dtype=word_emb.dtype) + if self.clamp_len > 0: + pos_seq.clamp_(max=self.clamp_len) + pos_emb = self.pos_emb(pos_seq) + + core_out = self.drop(word_emb + pos_emb[-qlen:]) + + for i, layer in enumerate(self.layers): + hids.append(core_out) + mems_i = None if mems is None else mems[i] + if mems_i is not None and i == 0: + mems_i += pos_emb[:mlen] + core_out = layer(core_out, dec_attn_mask=dec_attn_mask, + mems=mems_i) + elif self.attn_type == 3: + core_out = self.drop(word_emb) + + for i, layer in enumerate(self.layers): + hids.append(core_out) + mems_i = None if mems is None else mems[i] + if mems_i is not None and mlen > 0: + cur_emb = self.r_emb[i][:-qlen] + cur_size = cur_emb.size(0) + if cur_size < mlen: + cur_emb_pad = cur_emb[0:1].expand(mlen-cur_size, -1, -1) + cur_emb = torch.cat([cur_emb_pad, cur_emb], 0) + else: + cur_emb = cur_emb[-mlen:] + mems_i += cur_emb.view(mlen, 1, -1) + core_out += self.r_emb[i][-qlen:].view(qlen, 1, -1) + + core_out = layer(core_out, dec_attn_mask=dec_attn_mask, + mems=mems_i) + + core_out = self.drop(core_out) + + new_mems = self._update_mems(hids, mems, mlen, qlen) + + return core_out, new_mems + + def forward(self, input_ids, mems=None): + """ Params: + input_ids :: [bsz, len] + mems :: optional mems from previous forwar passes (or init_mems) + list (num layers) of mem states at the entry of each layer + shape :: [self.config.mem_len, bsz, self.config.d_model] + Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `target` + Returns: + tuple (last_hidden, new_mems) where: + new_mems: list (num layers) of mem 
states at the entry of each layer + shape :: [self.config.mem_len, bsz, self.config.d_model] + last_hidden: output of the last layer: + shape :: [bsz, len, self.config.d_model] + """ + # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library + # so we transpose here from shape [bsz, len] to shape [len, bsz] + input_ids = input_ids.transpose(0, 1).contiguous() + + if mems is None: + mems = self.init_mems(input_ids) + last_hidden, new_mems = self._forward(input_ids, mems=mems) + + # We transpose back here to shape [bsz, len, hidden_dim] + last_hidden = last_hidden.transpose(0, 1).contiguous() + return (last_hidden, new_mems) + + +class TransfoXLLMHeadModel(TransfoXLPreTrainedModel): + """Transformer XL model ("Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context"). + + This model add an (adaptive) softmax head on top of the TransfoXLModel + + Transformer XL use a relative positioning (with sinusiodal patterns) and adaptive softmax inputs which means that: + - you don't need to specify positioning embeddings indices + - the tokens in the vocabulary have to be sorted to decreasing frequency. + + Call self.tie_weights() if you update/load the weights of the transformer to keep the weights tied. + + Params: + config: a TransfoXLConfig class instance with the configuration to build a new model + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the token indices selected in the range [0, self.config.n_token[ + `target`: an optional torch.LongTensor of shape [batch_size, sequence_length] + with the target token indices selected in the range [0, self.config.n_token[ + `mems`: an optional memory of hidden states from previous forward passes + as a list (num layers) of hidden states at the entry of each layer + each hidden states has shape [self.config.mem_len, bsz, self.config.d_model] + Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `target` + + Outputs: + A tuple of (last_hidden_state, new_mems) + `softmax_output`: output of the (adaptive) softmax: + if target is None: + Negative log likelihood of shape [batch_size, sequence_length] + else: + log probabilities of tokens, shape [batch_size, sequence_length, n_tokens] + `new_mems`: list (num layers) of updated mem states at the entry of each layer + each mem state is a torch.FloatTensor of size [self.config.mem_len, batch_size, self.config.d_model] + Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `target` + + Example usage: + ```python + # Already been converted into BPE token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_ids_next = torch.LongTensor([[53, 21, 1], [64, 23, 100]]) + + config = TransfoXLConfig() + + model = TransfoXLModel(config) + last_hidden_state, new_mems = model(input_ids) + + # Another time on input_ids_next using the memory: + last_hidden_state, new_mems = model(input_ids_next, mems=new_mems) + ``` + """ + def __init__(self, config): + super(TransfoXLLMHeadModel, self).__init__(config) + self.transformer = TransfoXLModel(config) + self.sample_softmax = config.sample_softmax + # use sampled softmax + if config.sample_softmax > 0: + self.out_layer = nn.Linear(config.d_model, config.n_token) + self.sampler = LogUniformSampler(config.n_token, config.sample_softmax) + # use adaptive softmax (including standard softmax) + else: + self.crit = ProjectedAdaptiveLogSoftmax(config.n_token, config.d_embed, 
config.d_model, + config.cutoffs, div_val=config.div_val) + self.apply(self.init_weights) + self.tie_weights() + + def tie_weights(self): + """ Run this to be sure output and input (adaptive) softmax weights are tied """ + # sampled softmax + if self.sample_softmax > 0: + if self.config.tie_weight: + self.out_layer.weight = self.transformer.word_emb.weight + # adaptive softmax (including standard softmax) + else: + if self.config.tie_weight: + for i in range(len(self.crit.out_layers)): + self.crit.out_layers[i].weight = self.transformer.word_emb.emb_layers[i].weight + if self.config.tie_projs: + for i, tie_proj in enumerate(self.config.tie_projs): + if tie_proj and self.config.div_val == 1 and self.config.d_model != self.config.d_embed: + self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[0] + elif tie_proj and self.config.div_val != 1: + self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[i] + + def reset_length(self, tgt_len, ext_len, mem_len): + self.transformer.reset_length(tgt_len, ext_len, mem_len) + + def init_mems(self, data): + return self.transformer.init_mems(data) + + def forward(self, input_ids, target=None, mems=None): + """ Params: + input_ids :: [bsz, len] + target :: [bsz, len] + Returns: + tuple(softmax_output, new_mems) where: + new_mems: list (num layers) of hidden states at the entry of each layer + shape :: [mem_len, bsz, self.config.d_model] :: Warning: shapes are transposed here w. regards to input_ids + softmax_output: output of the (adaptive) softmax: + if target is None: + Negative log likelihood of shape :: [bsz, len] + else: + log probabilities of tokens, shape :: [bsz, len, n_tokens] + """ + bsz = input_ids.size(0) + tgt_len = input_ids.size(1) + + last_hidden, new_mems = self.transformer(input_ids, mems) + + pred_hid = last_hidden[:, -tgt_len:] + if self.sample_softmax > 0 and self.training: + assert self.config.tie_weight + logit = sample_logits(self.transformer.word_emb, self.out_layer.bias, target, pred_hid, self.sampler) + softmax_output = -F.log_softmax(logit, -1)[:, :, 0] + else: + softmax_output = self.crit(pred_hid.view(-1, pred_hid.size(-1)), target) + if target is None: + softmax_output = softmax_output.view(bsz, tgt_len, -1) + else: + softmax_output = softmax_output.view(bsz, tgt_len) + + # We transpose back + return (softmax_output, new_mems) diff --git a/pytorch_pretrained_bert/modeling_transfo_xl_utilities.py b/pytorch_pretrained_bert/modeling_transfo_xl_utilities.py new file mode 100644 index 0000000..7fd67ad --- /dev/null +++ b/pytorch_pretrained_bert/modeling_transfo_xl_utilities.py @@ -0,0 +1,402 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Utilities for PyTorch Transformer XL model. + Directly adapted from https://github.com/kimiyoung/transformer-xl. 
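+    In particular, this module provides ProjectedAdaptiveLogSoftmax (the adaptive softmax
+    head used by TransfoXLLMHeadModel) and LogUniformSampler / sample_logits for the
+    optional sampled-softmax training path.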
+""" + +from collections import defaultdict + +import numpy as np + +import torch +import torch.nn as nn +import torch.nn.functional as F + +# CUDA_MAJOR = int(torch.version.cuda.split('.')[0]) +# CUDA_MINOR = int(torch.version.cuda.split('.')[1]) + +class ProjectedAdaptiveLogSoftmax(nn.Module): + def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, + keep_order=False): + super(ProjectedAdaptiveLogSoftmax, self).__init__() + + self.n_token = n_token + self.d_embed = d_embed + self.d_proj = d_proj + + self.cutoffs = cutoffs + [n_token] + self.cutoff_ends = [0] + self.cutoffs + self.div_val = div_val + + self.shortlist_size = self.cutoffs[0] + self.n_clusters = len(self.cutoffs) - 1 + self.head_size = self.shortlist_size + self.n_clusters + + if self.n_clusters > 0: + self.cluster_weight = nn.Parameter(torch.zeros(self.n_clusters, self.d_embed)) + self.cluster_bias = nn.Parameter(torch.zeros(self.n_clusters)) + + self.out_layers = nn.ModuleList() + self.out_projs = nn.ParameterList() + + if div_val == 1: + for i in range(len(self.cutoffs)): + if d_proj != d_embed: + self.out_projs.append( + nn.Parameter(torch.Tensor(d_proj, d_embed)) + ) + else: + self.out_projs.append(None) + + self.out_layers.append(nn.Linear(d_embed, n_token)) + else: + for i in range(len(self.cutoffs)): + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1] + d_emb_i = d_embed // (div_val ** i) + + self.out_projs.append( + nn.Parameter(torch.Tensor(d_proj, d_emb_i)) + ) + + self.out_layers.append(nn.Linear(d_emb_i, r_idx-l_idx)) + + self.keep_order = keep_order + + def _compute_logit(self, hidden, weight, bias, proj): + if proj is None: + logit = F.linear(hidden, weight, bias=bias) + else: + # if CUDA_MAJOR <= 9 and CUDA_MINOR <= 1: + proj_hid = F.linear(hidden, proj.t().contiguous()) + logit = F.linear(proj_hid, weight, bias=bias) + # else: + # logit = torch.einsum('bd,de,ev->bv', (hidden, proj, weight.t())) + # if bias is not None: + # logit = logit + bias + + return logit + + def forward(self, hidden, target=None, keep_order=False): + ''' + Params: + hidden :: [len*bsz x d_proj] + target :: [len*bsz] + Return: + if target is None: + out :: [len*bsz] Negative log likelihood + else: + out :: [len*bsz x n_tokens] log probabilities of tokens over the vocabulary + We could replace this implementation by the native PyTorch one + if their's had an option to set bias on all clusters in the native one. 
+ here: https://github.com/pytorch/pytorch/blob/dbe6a7a9ff1a364a8706bf5df58a1ca96d2fd9da/torch/nn/modules/adaptive.py#L138 + ''' + + if target is not None: + target = target.view(-1) + if hidden.size(0) != target.size(0): + raise RuntimeError('Input and target should have the same size ' + 'in the batch dimension.') + + if self.n_clusters == 0: + logit = self._compute_logit(hidden, self.out_layers[0].weight, + self.out_layers[0].bias, self.out_projs[0]) + if target is not None: + out = -F.log_softmax(logit, dim=-1) \ + .gather(1, target.unsqueeze(1)).squeeze(1) + else: + out = F.log_softmax(logit, dim=-1) + else: + # construct weights and biases + weights, biases = [], [] + for i in range(len(self.cutoffs)): + if self.div_val == 1: + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] + weight_i = self.out_layers[0].weight[l_idx:r_idx] + bias_i = self.out_layers[0].bias[l_idx:r_idx] + else: + weight_i = self.out_layers[i].weight + bias_i = self.out_layers[i].bias + + if i == 0: + weight_i = torch.cat( + [weight_i, self.cluster_weight], dim=0) + bias_i = torch.cat( + [bias_i, self.cluster_bias], dim=0) + + weights.append(weight_i) + biases.append(bias_i) + + head_weight, head_bias, head_proj = weights[0], biases[0], self.out_projs[0] + + head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj) + head_logprob = F.log_softmax(head_logit, dim=1) + + if target is None: + out = hidden.new_empty((head_logit.size(0), self.n_token)) + else: + out = torch.zeros_like(target, dtype=hidden.dtype, device=hidden.device) + + offset = 0 + cutoff_values = [0] + self.cutoffs + for i in range(len(cutoff_values) - 1): + l_idx, r_idx = cutoff_values[i], cutoff_values[i + 1] + + if target is not None: + mask_i = (target >= l_idx) & (target < r_idx) + indices_i = mask_i.nonzero().squeeze() + + if indices_i.numel() == 0: + continue + + target_i = target.index_select(0, indices_i) - l_idx + head_logprob_i = head_logprob.index_select(0, indices_i) + hidden_i = hidden.index_select(0, indices_i) + else: + hidden_i = hidden + + if i == 0: + if target is not None: + logprob_i = head_logprob_i.gather(1, target_i[:, None]).squeeze(1) + else: + out[:, :self.cutoffs[0]] = head_logprob[:, :self.cutoffs[0]] + else: + weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i] + + tail_logit_i = self._compute_logit(hidden_i, weight_i, bias_i, proj_i) + tail_logprob_i = F.log_softmax(tail_logit_i, dim=1) + cluster_prob_idx = self.cutoffs[0] + i - 1 # No probability for the head cluster + if target is not None: + logprob_i = head_logprob_i[:, cluster_prob_idx] \ + + tail_logprob_i.gather(1, target_i[:, None]).squeeze(1) + else: + logprob_i = head_logprob[:, cluster_prob_idx, None] + tail_logprob_i + out[:, l_idx:r_idx] = logprob_i + + if target is not None: + if (hasattr(self, 'keep_order') and self.keep_order) or keep_order: + out.index_copy_(0, indices_i, -logprob_i) + else: + out[offset:offset+logprob_i.size(0)].copy_(-logprob_i) + offset += logprob_i.size(0) + + return out + + + def log_prob(self, hidden): + r""" Computes log probabilities for all :math:`n\_classes` + From: https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/adaptive.py + Args: + hidden (Tensor): a minibatch of examples + Returns: + log-probabilities of for each class :math:`c` + in range :math:`0 <= c <= n\_classes`, where :math:`n\_classes` is a + parameter passed to ``AdaptiveLogSoftmaxWithLoss`` constructor. 
+ Shape: + - Input: :math:`(N, in\_features)` + - Output: :math:`(N, n\_classes)` + """ + if self.n_clusters == 0: + logit = self._compute_logit(hidden, self.out_layers[0].weight, + self.out_layers[0].bias, self.out_projs[0]) + return F.log_softmax(logit, dim=-1) + else: + # construct weights and biases + weights, biases = [], [] + for i in range(len(self.cutoffs)): + if self.div_val == 1: + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] + weight_i = self.out_layers[0].weight[l_idx:r_idx] + bias_i = self.out_layers[0].bias[l_idx:r_idx] + else: + weight_i = self.out_layers[i].weight + bias_i = self.out_layers[i].bias + + if i == 0: + weight_i = torch.cat( + [weight_i, self.cluster_weight], dim=0) + bias_i = torch.cat( + [bias_i, self.cluster_bias], dim=0) + + weights.append(weight_i) + biases.append(bias_i) + + head_weight, head_bias, head_proj = weights[0], biases[0], self.out_projs[0] + head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj) + + out = hidden.new_empty((head_logit.size(0), self.n_token)) + head_logprob = F.log_softmax(head_logit, dim=1) + + cutoff_values = [0] + self.cutoffs + for i in range(len(cutoff_values) - 1): + start_idx, stop_idx = cutoff_values[i], cutoff_values[i + 1] + + if i == 0: + out[:, :self.cutoffs[0]] = head_logprob[:, :self.cutoffs[0]] + else: + weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i] + + tail_logit_i = self._compute_logit(hidden, weight_i, bias_i, proj_i) + tail_logprob_i = F.log_softmax(tail_logit_i, dim=1) + + logprob_i = head_logprob[:, -i] + tail_logprob_i + out[:, start_idx, stop_idx] = logprob_i + + return out + + +class LogUniformSampler(object): + def __init__(self, range_max, n_sample): + """ + Reference : https://github.com/tensorflow/tensorflow/blob/r1.10/tensorflow/python/ops/candidate_sampling_ops.py + `P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)` + + expected count can be approximated by 1 - (1 - p)^n + and we use a numerically stable version -expm1(num_tries * log1p(-p)) + + Our implementation fixes num_tries at 2 * n_sample, and the actual #samples will vary from run to run + """ + with torch.no_grad(): + self.range_max = range_max + log_indices = torch.arange(1., range_max+2., 1.).log_() + self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1] + # print('P', self.dist.numpy().tolist()[-30:]) + + self.log_q = (- (-self.dist.double().log1p_() * 2 * n_sample).expm1_()).log_().float() + + self.n_sample = n_sample + + def sample(self, labels): + """ + labels: [b1, b2] + Return + true_log_probs: [b1, b2] + samp_log_probs: [n_sample] + neg_samples: [n_sample] + """ + + # neg_samples = torch.empty(0).long() + n_sample = self.n_sample + n_tries = 2 * n_sample + + with torch.no_grad(): + neg_samples = torch.multinomial(self.dist, n_tries, replacement=True).unique() + device = labels.device + neg_samples = neg_samples.to(device) + true_log_probs = self.log_q[labels].to(device) + samp_log_probs = self.log_q[neg_samples].to(device) + return true_log_probs, samp_log_probs, neg_samples + +def sample_logits(embedding, bias, labels, inputs, sampler): + """ + embedding: an nn.Embedding layer + bias: [n_vocab] + labels: [b1, b2] + inputs: [b1, b2, n_emb] + sampler: you may use a LogUniformSampler + Return + logits: [b1, b2, 1 + n_sample] + """ + true_log_probs, samp_log_probs, neg_samples = sampler.sample(labels) + n_sample = neg_samples.size(0) + b1, b2 = labels.size(0), labels.size(1) + all_ids = torch.cat([labels.view(-1), neg_samples]) + all_w = 
embedding(all_ids) + true_w = all_w[: -n_sample].view(b1, b2, -1) + sample_w = all_w[- n_sample:].view(n_sample, -1) + + all_b = bias[all_ids] + true_b = all_b[: -n_sample].view(b1, b2) + sample_b = all_b[- n_sample:] + + hit = (labels[:, :, None] == neg_samples).detach() + + true_logits = torch.einsum('ijk,ijk->ij', + [true_w, inputs]) + true_b - true_log_probs + sample_logits = torch.einsum('lk,ijk->ijl', + [sample_w, inputs]) + sample_b - samp_log_probs + sample_logits.masked_fill_(hit, -1e30) + logits = torch.cat([true_logits[:, :, None], sample_logits], -1) + + return logits + + +# class LogUniformSampler(object): +# def __init__(self, range_max, unique=False): +# """ +# Reference : https://github.com/tensorflow/tensorflow/blob/r1.10/tensorflow/python/ops/candidate_sampling_ops.py +# `P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)` +# """ +# self.range_max = range_max +# log_indices = torch.arange(1., range_max+2., 1.).log_() +# self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1] + +# self.unique = unique + +# if self.unique: +# self.exclude_mask = torch.ByteTensor(range_max).fill_(0) + +# def sample(self, n_sample, labels): +# pos_sample, new_labels = labels.unique(return_inverse=True) +# n_pos_sample = pos_sample.size(0) +# n_neg_sample = n_sample - n_pos_sample + +# if self.unique: +# self.exclude_mask.index_fill_(0, pos_sample, 1) +# sample_dist = self.dist.clone().masked_fill_(self.exclude_mask, 0) +# self.exclude_mask.index_fill_(0, pos_sample, 0) +# else: +# sample_dist = self.dist + +# neg_sample = torch.multinomial(sample_dist, n_neg_sample) + +# sample = torch.cat([pos_sample, neg_sample]) +# sample_prob = self.dist[sample] + +# return new_labels, sample, sample_prob + + +if __name__ == '__main__': + S, B = 3, 4 + n_vocab = 10000 + n_sample = 5 + H = 32 + + labels = torch.LongTensor(S, B).random_(0, n_vocab) + + # sampler = LogUniformSampler(n_vocab, unique=False) + # new_labels, sample, sample_prob = sampler.sample(n_sample, labels) + + sampler = LogUniformSampler(n_vocab, n_sample)#, unique=True) + # true_probs, samp_probs, neg_samples = sampler.sample(n_sample, labels) + + # print('true_probs', true_probs.numpy().tolist()) + # print('samp_probs', samp_probs.numpy().tolist()) + # print('neg_samples', neg_samples.numpy().tolist()) + + # print('sum', torch.sum(sampler.dist).item()) + + # assert torch.all(torch.sort(sample.unique())[0].eq(torch.sort(sample)[0])).item() + + embedding = nn.Embedding(n_vocab, H) + bias = torch.zeros(n_vocab) + inputs = torch.Tensor(S, B, H).normal_() + + logits, out_labels = sample_logits(embedding, bias, labels, inputs, sampler, n_sample) + print('logits', logits.detach().numpy().tolist()) + print('logits shape', logits.size()) + print('out_labels', out_labels.detach().numpy().tolist()) + print('out_labels shape', out_labels.size()) + diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py new file mode 100644 index 0000000..0385695 --- /dev/null +++ b/pytorch_pretrained_bert/optimization.py @@ -0,0 +1,302 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch optimization for BERT model.""" + +import math +import torch +from torch.optim import Optimizer +from torch.optim.optimizer import required +from torch.nn.utils import clip_grad_norm_ +import logging +import abc +import sys + +logger = logging.getLogger(__name__) + + +if sys.version_info >= (3, 4): + ABC = abc.ABC +else: + ABC = abc.ABCMeta('ABC', (), {}) + + +class _LRSchedule(ABC): + """ Parent of all LRSchedules here. """ + warn_t_total = False # is set to True for schedules where progressing beyond t_total steps doesn't make sense + def __init__(self, warmup=0.002, t_total=-1, **kw): + """ + :param warmup: what fraction of t_total steps will be used for linear warmup + :param t_total: how many training steps (updates) are planned + :param kw: + """ + super(_LRSchedule, self).__init__(**kw) + if t_total < 0: + logger.warning("t_total value of {} results in schedule not being applied".format(t_total)) + if not 0.0 <= warmup < 1.0 and not warmup == -1: + raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup)) + warmup = max(warmup, 0.) + self.warmup, self.t_total = float(warmup), float(t_total) + self.warned_for_t_total_at_progress = -1 + + def get_lr(self, step, nowarn=False): + """ + :param step: which of t_total steps we're on + :param nowarn: set to True to suppress warning regarding training beyond specified 't_total' steps + :return: learning rate multiplier for current update + """ + if self.t_total < 0: + return 1. + progress = float(step) / self.t_total + ret = self.get_lr_(progress) + # warning for exceeding t_total (only active with warmup_linear + if not nowarn and self.warn_t_total and progress > 1. and progress > self.warned_for_t_total_at_progress: + logger.warning( + "Training beyond specified 't_total'. Learning rate multiplier set to {}. Please set 't_total' of {} correctly." + .format(ret, self.__class__.__name__)) + self.warned_for_t_total_at_progress = progress + # end warning + return ret + + @abc.abstractmethod + def get_lr_(self, progress): + """ + :param progress: value between 0 and 1 (unless going beyond t_total steps) specifying training progress + :return: learning rate multiplier for current update + """ + return 1. + + +class ConstantLR(_LRSchedule): + def get_lr_(self, progress): + return 1. + + +class WarmupCosineSchedule(_LRSchedule): + """ + Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + Decreases learning rate from 1. to 0. over remaining `1 - warmup` steps following a cosine curve. + If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup. + """ + warn_t_total = True + def __init__(self, warmup=0.002, t_total=-1, cycles=.5, **kw): + """ + :param warmup: see LRSchedule + :param t_total: see LRSchedule + :param cycles: number of cycles. Default: 0.5, corresponding to cosine decay from 1. at progress==warmup and 0 at progress==1. 
+ :param kw: + """ + super(WarmupCosineSchedule, self).__init__(warmup=warmup, t_total=t_total, **kw) + self.cycles = cycles + + def get_lr_(self, progress): + if progress < self.warmup: + return progress / self.warmup + else: + progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup + return 0.5 * (1. + math.cos(math.pi * self.cycles * 2 * progress)) + + +class WarmupCosineWithHardRestartsSchedule(WarmupCosineSchedule): + """ + Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + If `cycles` (default=1.) is different from default, learning rate follows `cycles` times a cosine decaying + learning rate (with hard restarts). + """ + def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw): + super(WarmupCosineWithHardRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw) + assert(cycles >= 1.) + + def get_lr_(self, progress): + if progress < self.warmup: + return progress / self.warmup + else: + progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup + ret = 0.5 * (1. + math.cos(math.pi * ((self.cycles * progress) % 1))) + return ret + + +class WarmupCosineWithWarmupRestartsSchedule(WarmupCosineWithHardRestartsSchedule): + """ + All training progress is divided in `cycles` (default=1.) parts of equal length. + Every part follows a schedule with the first `warmup` fraction of the training steps linearly increasing from 0. to 1., + followed by a learning rate decreasing from 1. to 0. following a cosine curve. + """ + def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw): + assert(warmup * cycles < 1.) + warmup = warmup * cycles if warmup >= 0 else warmup + super(WarmupCosineWithWarmupRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw) + + def get_lr_(self, progress): + progress = progress * self.cycles % 1. + if progress < self.warmup: + return progress / self.warmup + else: + progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup + ret = 0.5 * (1. + math.cos(math.pi * progress)) + return ret + + +class WarmupConstantSchedule(_LRSchedule): + """ + Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + Keeps learning rate equal to 1. after warmup. + """ + def get_lr_(self, progress): + if progress < self.warmup: + return progress / self.warmup + return 1. + + +class WarmupLinearSchedule(_LRSchedule): + """ + Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + Linearly decreases learning rate from 1. to 0. over remaining `1 - warmup` steps. + """ + warn_t_total = True + def get_lr_(self, progress): + if progress < self.warmup: + return progress / self.warmup + return max((progress - 1.) / (self.warmup - 1.), 0.) + + +SCHEDULES = { + None: ConstantLR, + "none": ConstantLR, + "warmup_cosine": WarmupCosineSchedule, + "warmup_constant": WarmupConstantSchedule, + "warmup_linear": WarmupLinearSchedule +} + + +class BertAdam(Optimizer): + """Implements BERT version of Adam algorithm with weight decay fix. + Params: + lr: learning rate + warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1 + t_total: total number of training steps for the learning + rate schedule, -1 means constant learning rate of 1. (no warmup regardless of warmup setting). Default: -1 + schedule: schedule to use for the warmup (see above). 
+ Can be `'warmup_linear'`, `'warmup_constant'`, `'warmup_cosine'`, `'none'`, `None` or a `_LRSchedule` object (see below). + If `None` or `'none'`, learning rate is always kept constant. + Default : `'warmup_linear'` + b1: Adams b1. Default: 0.9 + b2: Adams b2. Default: 0.999 + e: Adams epsilon. Default: 1e-6 + weight_decay: Weight decay. Default: 0.01 + max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0 + """ + def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear', + b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01, max_grad_norm=1.0, **kwargs): + if lr is not required and lr < 0.0: + raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) + if not isinstance(schedule, _LRSchedule) and schedule not in SCHEDULES: + raise ValueError("Invalid schedule parameter: {}".format(schedule)) + if not 0.0 <= b1 < 1.0: + raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1)) + if not 0.0 <= b2 < 1.0: + raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2)) + if not e >= 0.0: + raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e)) + # initialize schedule object + if not isinstance(schedule, _LRSchedule): + schedule_type = SCHEDULES[schedule] + schedule = schedule_type(warmup=warmup, t_total=t_total) + else: + if warmup != -1 or t_total != -1: + logger.warning("warmup and t_total on the optimizer are ineffective when _LRSchedule object is provided as schedule. " + "Please specify custom warmup and t_total in _LRSchedule object.") + defaults = dict(lr=lr, schedule=schedule, + b1=b1, b2=b2, e=e, weight_decay=weight_decay, + max_grad_norm=max_grad_norm) + super(BertAdam, self).__init__(params, defaults) + + def get_lr(self): + lr = [] + for group in self.param_groups: + for p in group['params']: + state = self.state[p] + if len(state) == 0: + return [0] + lr_scheduled = group['lr'] + lr_scheduled *= group['schedule'].get_lr(state['step']) + lr.append(lr_scheduled) + return lr + + def step(self, closure=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + if grad.is_sparse: + raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['next_m'] = torch.zeros_like(p.data) + # Exponential moving average of squared gradient values + state['next_v'] = torch.zeros_like(p.data) + + next_m, next_v = state['next_m'], state['next_v'] + beta1, beta2 = group['b1'], group['b2'] + + # Add grad clipping + if group['max_grad_norm'] > 0: + clip_grad_norm_(p, group['max_grad_norm']) + + # Decay the first and second moment running average coefficient + # In-place operations to update the averages at the same time + next_m.mul_(beta1).add_(1 - beta1, grad) + next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad) + update = next_m / (next_v.sqrt() + group['e']) + + # Just adding the square of the weights to the loss function is *not* + # the correct way of using L2 regularization/weight decay with Adam, + # since that will interact with the m and v parameters in strange ways. 
+ # + # Instead we want to decay the weights in a manner that doesn't interact + # with the m/v parameters. This is equivalent to adding the square + # of the weights to the loss with plain (non-momentum) SGD. + if group['weight_decay'] > 0.0: + update += group['weight_decay'] * p.data + + lr_scheduled = group['lr'] + lr_scheduled *= group['schedule'].get_lr(state['step']) + + update_with_lr = lr_scheduled * update + p.data.add_(-update_with_lr) + + state['step'] += 1 + + # step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1 + # No bias correction + # bias_correction1 = 1 - beta1 ** state['step'] + # bias_correction2 = 1 - beta2 ** state['step'] + + return loss diff --git a/pytorch_pretrained_bert/optimization_openai.py b/pytorch_pretrained_bert/optimization_openai.py new file mode 100644 index 0000000..bff4ebe --- /dev/null +++ b/pytorch_pretrained_bert/optimization_openai.py @@ -0,0 +1,127 @@ +# coding=utf-8 +# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch optimization for OpenAI GPT model.""" + +import math +import torch +from torch.optim import Optimizer +from torch.optim.optimizer import required +from torch.nn.utils import clip_grad_norm_ +import logging +from .optimization import SCHEDULES, _LRSchedule, WarmupCosineWithWarmupRestartsSchedule, \ + WarmupCosineWithHardRestartsSchedule, WarmupCosineSchedule, WarmupLinearSchedule, WarmupConstantSchedule + +logger = logging.getLogger(__name__) + + +class OpenAIAdam(Optimizer): + """Implements Open AI version of Adam algorithm with weight decay fix. + """ + def __init__(self, params, lr=required, schedule='warmup_linear', warmup=-1, t_total=-1, + b1=0.9, b2=0.999, e=1e-8, weight_decay=0, + vector_l2=False, max_grad_norm=-1, **kwargs): + if lr is not required and lr < 0.0: + raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) + if not isinstance(schedule, _LRSchedule) and schedule not in SCHEDULES: + raise ValueError("Invalid schedule parameter: {}".format(schedule)) + if not 0.0 <= b1 < 1.0: + raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1)) + if not 0.0 <= b2 < 1.0: + raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2)) + if not e >= 0.0: + raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e)) + # initialize schedule object + if not isinstance(schedule, _LRSchedule): + schedule_type = SCHEDULES[schedule] + schedule = schedule_type(warmup=warmup, t_total=t_total) + else: + if warmup != -1 or t_total != -1: + logger.warning("warmup and t_total on the optimizer are ineffective when _LRSchedule object is provided as schedule. 
" + "Please specify custom warmup and t_total in _LRSchedule object.") + defaults = dict(lr=lr, schedule=schedule, + b1=b1, b2=b2, e=e, weight_decay=weight_decay, vector_l2=vector_l2, + max_grad_norm=max_grad_norm) + super(OpenAIAdam, self).__init__(params, defaults) + + def get_lr(self): + lr = [] + for group in self.param_groups: + for p in group['params']: + state = self.state[p] + if len(state) == 0: + return [0] + lr_scheduled = group['lr'] + lr_scheduled *= group['schedule'].get_lr(state['step']) + lr.append(lr_scheduled) + return lr + + def step(self, closure=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + if grad.is_sparse: + raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p.data) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p.data) + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + beta1, beta2 = group['b1'], group['b2'] + + state['step'] += 1 + + # Add grad clipping + if group['max_grad_norm'] > 0: + clip_grad_norm_(p, group['max_grad_norm']) + + # Decay the first and second moment running average coefficient + exp_avg.mul_(beta1).add_(1 - beta1, grad) + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + denom = exp_avg_sq.sqrt().add_(group['e']) + + bias_correction1 = 1 - beta1 ** state['step'] + bias_correction2 = 1 - beta2 ** state['step'] + + lr_scheduled = group['lr'] + lr_scheduled *= group['schedule'].get_lr(state['step']) + + step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1 + + p.data.addcdiv_(-step_size, exp_avg, denom) + + # Add weight decay at the end (fixed version) + if (len(p.size()) > 1 or group['vector_l2']) and group['weight_decay'] > 0: + p.data.add_(-lr_scheduled * group['weight_decay'], p.data) + + return loss diff --git a/pytorch_pretrained_bert/tokenization.py b/pytorch_pretrained_bert/tokenization.py new file mode 100644 index 0000000..4199870 --- /dev/null +++ b/pytorch_pretrained_bert/tokenization.py @@ -0,0 +1,434 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Tokenization classes.""" + +from __future__ import absolute_import, division, print_function, unicode_literals + +import collections +import logging +import os +import unicodedata +from io import open + +from .file_utils import cached_path + +logger = logging.getLogger(__name__) + +PRETRAINED_VOCAB_ARCHIVE_MAP = { + 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", + 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", + 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt", + 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt", + 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt", + 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", + 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt", + 'bert-base-german-cased': "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt", +} +PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = { + 'bert-base-uncased': 512, + 'bert-large-uncased': 512, + 'bert-base-cased': 512, + 'bert-large-cased': 512, + 'bert-base-multilingual-uncased': 512, + 'bert-base-multilingual-cased': 512, + 'bert-base-chinese': 512, + 'bert-base-german-cased': 512, +} +VOCAB_NAME = 'vocab.txt' + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + index = 0 + with open(vocab_file, "r", encoding="utf-8") as reader: + while True: + token = reader.readline() + if not token: + break + token = token.strip() + vocab[token] = index + index += 1 + return vocab + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class BertTokenizer(object): + """Runs end-to-end tokenization: punctuation splitting + wordpiece""" + + def __init__(self, vocab_file, do_lower_case=True, max_len=None, do_basic_tokenize=True, + never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", + "[MASK]", "[BOS]", "[EOS]", "[SPEAKER1]", "[SPEAKER2]")): + """Constructs a BertTokenizer. + + Args: + vocab_file: Path to a one-wordpiece-per-line vocabulary file + do_lower_case: Whether to lower case the input + Only has an effect when do_wordpiece_only=False + do_basic_tokenize: Whether to do basic tokenization before wordpiece. + max_len: An artificial maximum length to truncate tokenized sequences to; + Effective maximum length is always the minimum of this + value (if specified) and the underlying BERT model's + sequence length. + never_split: List of tokens which will never be split during tokenization. + Only has an effect when do_wordpiece_only=False + """ + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'. 
To load the vocabulary from a Google pretrained " + "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)) + self.vocab = load_vocab(vocab_file) + self.ids_to_tokens = collections.OrderedDict( + [(ids, tok) for tok, ids in self.vocab.items()]) + self.do_basic_tokenize = do_basic_tokenize + if do_basic_tokenize: + self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case, + never_split=never_split) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) + self.max_len = max_len if max_len is not None else int(1e12) + + def tokenize(self, text): + split_tokens = [] + if self.do_basic_tokenize: + for token in self.basic_tokenizer.tokenize(text): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + else: + split_tokens = self.wordpiece_tokenizer.tokenize(text) + return split_tokens + + def convert_tokens_to_ids(self, tokens): + """Converts a sequence of tokens into ids using the vocab.""" + ids = [] + for token in tokens: + ids.append(self.vocab[token]) + if len(ids) > self.max_len: + logger.warning( + "Token indices sequence length is longer than the specified maximum " + " sequence length for this BERT model ({} > {}). Running this" + " sequence through BERT will result in indexing errors".format(len(ids), self.max_len) + ) + return ids + + + def convert_ids_to_tokens(self, ids): + """Converts a sequence of ids in wordpiece tokens using the vocab.""" + tokens = [] + for i in ids: + tokens.append(self.ids_to_tokens[i]) + return tokens + + def save_vocabulary(self, vocab_path): + """Save the tokenizer vocabulary to a directory or file.""" + index = 0 + if os.path.isdir(vocab_path): + vocab_file = os.path.join(vocab_path, VOCAB_NAME) + with open(vocab_file, "w", encoding="utf-8") as writer: + for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning("Saving vocabulary to {}: vocabulary indices are not consecutive." + " Please check that the vocabulary is not corrupted!".format(vocab_file)) + index = token_index + writer.write(token + u'\n') + index += 1 + return vocab_file + + def encode(self, text): + return self.convert_tokens_to_ids(self.tokenize(text)) + + def decode(self, tokens, skip_special_tokens=False, clean_up_tokenization_spaces=True): + text = ' '.join(self.convert_ids_to_tokens(tokens)) + if clean_up_tokenization_spaces: + text = text.replace('', '') + text = text.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ',' + ).replace(" ' ", "'").replace( + " n't", "n't").replace(" 'm", "'m").replace(" do not", " don't" + ).replace(" 's", "'s").replace(" 've", "'ve").replace( + " 're", "'re") + return text + + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs): + """ + Instantiate a PreTrainedBertModel from a pre-trained model file. + Download and cache the pre-trained model file if needed. + """ + if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: + vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path] + if '-cased' in pretrained_model_name_or_path and kwargs.get('do_lower_case', True): + logger.warning("The pre-trained model you are loading is a cased model but you have not set " + "`do_lower_case` to False. 
We are setting `do_lower_case=False` for you but " + "you may want to check this behavior.") + kwargs['do_lower_case'] = False + elif '-cased' not in pretrained_model_name_or_path and not kwargs.get('do_lower_case', True): + logger.warning("The pre-trained model you are loading is an uncased model but you have set " + "`do_lower_case` to False. We are setting `do_lower_case=True` for you " + "but you may want to check this behavior.") + kwargs['do_lower_case'] = True + else: + vocab_file = pretrained_model_name_or_path + if os.path.isdir(vocab_file): + vocab_file = os.path.join(vocab_file, VOCAB_NAME) + # redirect to the cache, if necessary + try: + resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) + except EnvironmentError: + logger.error( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url but couldn't find any file " + "associated to this path or url.".format( + pretrained_model_name_or_path, + ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), + vocab_file)) + return None + if resolved_vocab_file == vocab_file: + logger.info("loading vocabulary file {}".format(vocab_file)) + else: + logger.info("loading vocabulary file {} from cache at {}".format( + vocab_file, resolved_vocab_file)) + if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP: + # if we're using a pretrained model, ensure the tokenizer wont index sequences longer + # than the number of positional embeddings + max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path] + kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) + # Instantiate tokenizer. + tokenizer = cls(resolved_vocab_file, *inputs, **kwargs) + return tokenizer + + +class BasicTokenizer(object): + """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" + + def __init__(self, + do_lower_case=True, + never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")): + """Constructs a BasicTokenizer. + + Args: + do_lower_case: Whether to lower case the input. + """ + self.do_lower_case = do_lower_case + self.never_split = never_split + + def tokenize(self, text): + """Tokenizes a piece of text.""" + text = self._clean_text(text) + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). 
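+        # For example, "ab华cd" becomes "ab 华 cd", so each CJK character ends up as
+        # its own token after the whitespace split below.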
+ text = self._tokenize_chinese_chars(text) + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if self.do_lower_case and token not in self.never_split: + token = token.lower() + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text): + """Splits punctuation on a piece of text.""" + if text in self.never_split: + return [text] + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ((cp >= 0x4E00 and cp <= 0x9FFF) or # + (cp >= 0x3400 and cp <= 0x4DBF) or # + (cp >= 0x20000 and cp <= 0x2A6DF) or # + (cp >= 0x2A700 and cp <= 0x2B73F) or # + (cp >= 0x2B740 and cp <= 0x2B81F) or # + (cp >= 0x2B820 and cp <= 0x2CEAF) or + (cp >= 0xF900 and cp <= 0xFAFF) or # + (cp >= 0x2F800 and cp <= 0x2FA1F)): # + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xfffd or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +class WordpieceTokenizer(object): + """Runs WordPiece tokenization.""" + + def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """Tokenizes a piece of text into its word pieces. + + This uses a greedy longest-match-first algorithm to perform tokenization + using the given vocabulary. + + For example: + input = "unaffable" + output = ["un", "##aff", "##able"] + + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through `BasicTokenizer`. + + Returns: + A list of wordpiece tokens. 
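+          Words longer than `max_input_chars_per_word`, or with no in-vocabulary
+          decomposition, are returned as the single `unk_token`.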
+ """ + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +def _is_whitespace(char): + """Checks whether `chars` is a whitespace character.""" + # \t, \n, and \r are technically contorl characters but we treat them + # as whitespace since they are generally considered as such. + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + + +def _is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat.startswith("C"): + return True + return False + + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. + if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or + (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False diff --git a/pytorch_pretrained_bert/tokenization_gpt2.py b/pytorch_pretrained_bert/tokenization_gpt2.py new file mode 100644 index 0000000..af75cac --- /dev/null +++ b/pytorch_pretrained_bert/tokenization_gpt2.py @@ -0,0 +1,311 @@ +# coding=utf-8 +# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for OpenAI GPT.""" +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import sys +import json +import logging +import os +import regex as re +from io import open + +try: + from functools import lru_cache +except ImportError: + # Just a dummy decorator to get the checks to run on python2 + # because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now. 
+ def lru_cache(): + return lambda func: func + +from .file_utils import cached_path + +logger = logging.getLogger(__name__) + +PRETRAINED_VOCAB_ARCHIVE_MAP = { + 'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json", + 'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-vocab.json", +} +PRETRAINED_MERGES_ARCHIVE_MAP = { + 'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt", + 'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt", +} +PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = { + 'gpt2': 1024, +} +VOCAB_NAME = 'vocab.json' +MERGES_NAME = 'merges.txt' +SPECIAL_TOKENS_NAME = 'special_tokens.txt' + +@lru_cache() +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + _chr = unichr if sys.version_info[0] == 2 else chr + bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8+n) + n += 1 + cs = [_chr(n) for n in cs] + return dict(zip(bs, cs)) + +def get_pairs(word): + """Return set of symbol pairs in a word. + + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + +class GPT2Tokenizer(object): + """ + GPT-2 BPE tokenizer. Peculiarities: + - Byte-level BPE + """ + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs): + """ + Instantiate a GPT2Tokenizer from a pre-trained model file. + Download and cache the pre-trained model file if needed. + """ + if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: + vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path] + merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path] + special_tokens_file = None + else: + vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME) + merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME) + special_tokens_file = os.path.join(pretrained_model_name_or_path, SPECIAL_TOKENS_NAME) + if not os.path.exists(special_tokens_file): + special_tokens_file = None + else: + logger.info("loading special tokens file {}".format(special_tokens_file)) + # redirect to the cache, if necessary + try: + resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) + resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir) + except EnvironmentError: + logger.error( + "Model name '{}' was not found in model name list ({}). 
" + "We assumed '{}' was a path or url but couldn't find files {} and {} " + "at this path or url.".format( + pretrained_model_name_or_path, + ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), + pretrained_model_name_or_path, + vocab_file, merges_file)) + return None + if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file: + logger.info("loading vocabulary file {}".format(vocab_file)) + logger.info("loading merges file {}".format(merges_file)) + else: + logger.info("loading vocabulary file {} from cache at {}".format( + vocab_file, resolved_vocab_file)) + logger.info("loading merges file {} from cache at {}".format( + merges_file, resolved_merges_file)) + if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP: + # if we're using a pretrained model, ensure the tokenizer wont index sequences longer + # than the number of positional embeddings + max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path] + kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) + # Instantiate tokenizer. + if special_tokens_file and 'special_tokens' not in kwargs: + special_tokens = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1] + else: + special_tokens = kwargs.pop('special_tokens', []) + tokenizer = cls(resolved_vocab_file, resolved_merges_file, special_tokens=special_tokens, *inputs, **kwargs) + return tokenizer + + def __init__(self, vocab_file, merges_file, errors='replace', special_tokens=None, max_len=None): + self.max_len = max_len if max_len is not None else int(1e12) + self.encoder = json.load(open(vocab_file)) + self.decoder = {v:k for k,v in self.encoder.items()} + self.errors = errors # how to handle errors in decoding + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v:k for k, v in self.byte_encoder.items()} + bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1] + bpe_merges = [tuple(merge.split()) for merge in bpe_data] + self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) + self.cache = {} + + # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions + self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") + + self.special_tokens = {} + self.special_tokens_decoder = {} + self.set_special_tokens(special_tokens) + + def __len__(self): + return len(self.encoder) + len(self.special_tokens) + + def set_special_tokens(self, special_tokens): + """ Add a list of additional tokens to the encoder. + The additional tokens are indexed starting from the last index of the + current vocabulary in the order of the `special_tokens` list. 
+ """ + if not special_tokens: + self.special_tokens = {} + self.special_tokens_decoder = {} + return + self.special_tokens = dict((tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens)) + self.special_tokens_decoder = {v:k for k, v in self.special_tokens.items()} + logger.info("Special tokens {}".format(self.special_tokens)) + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token) + pairs = get_pairs(word) + + if not pairs: + return token + + while True: + bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf'))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except: + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word)-1 and word[i+1] == second: + new_word.append(first+second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = ' '.join(word) + self.cache[token] = word + return word + + def tokenize(self, text): + """ Tokenize a string. """ + bpe_tokens = [] + for token in re.findall(self.pat, text): + if sys.version_info[0] == 2: + token = ''.join(self.byte_encoder[ord(b)] for b in token) + else: + token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) + bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' ')) + return bpe_tokens + + def convert_tokens_to_ids(self, tokens): + """ Converts a sequence of tokens into ids using the vocab. """ + ids = [] + if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)): + if tokens in self.special_tokens: + return self.special_tokens[tokens] + else: + return self.encoder.get(tokens, 0) + for token in tokens: + if token in self.special_tokens: + ids.append(self.special_tokens[token]) + else: + ids.append(self.encoder.get(token, 0)) + if len(ids) > self.max_len: + logger.warning( + "Token indices sequence length is longer than the specified maximum " + " sequence length for this OpenAI GPT model ({} > {}). 
Running this" + " sequence through the model will result in indexing errors".format(len(ids), self.max_len) + ) + return ids + + def convert_ids_to_tokens(self, ids, skip_special_tokens=False): + """Converts a sequence of ids in BPE tokens using the vocab.""" + tokens = [] + for i in ids: + if i in self.special_tokens_decoder: + if not skip_special_tokens: + tokens.append(self.special_tokens_decoder[i]) + else: + tokens.append(self.decoder[i]) + return tokens + + def encode(self, text): + return self.convert_tokens_to_ids(self.tokenize(text)) + + def decode(self, tokens, skip_special_tokens=False, clean_up_tokenization_spaces=True): + text = ''.join(self.convert_ids_to_tokens(tokens, skip_special_tokens=skip_special_tokens)) + text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors) + if clean_up_tokenization_spaces: + text = text.replace('', '') + text = text.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ',' + ).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't" + ).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re") + return text + + def save_vocabulary(self, vocab_path): + """Save the tokenizer vocabulary and merge files to a directory.""" + if not os.path.isdir(vocab_path): + logger.error("Vocabulary path ({}) should be a directory".format(vocab_path)) + return + vocab_file = os.path.join(vocab_path, VOCAB_NAME) + merge_file = os.path.join(vocab_path, MERGES_NAME) + special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME) + + with open(vocab_file, 'w', encoding='utf-8') as f: + f.write(json.dumps(self.encoder, ensure_ascii=False)) + + index = 0 + with open(merge_file, "w", encoding="utf-8") as writer: + writer.write(u'#version: 0.2\n') + for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!".format(merge_file)) + index = token_index + writer.write(' '.join(bpe_tokens) + u'\n') + index += 1 + + index = len(self.encoder) + with open(special_tokens_file, 'w', encoding='utf-8') as writer: + for token, token_index in sorted(self.special_tokens.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning("Saving special tokens vocabulary to {}: BPE indices are not consecutive." + " Please check that the tokenizer is not corrupted!".format(special_tokens_file)) + index = token_index + writer.write(token + u'\n') + index += 1 + + return vocab_file, merge_file, special_tokens_file diff --git a/pytorch_pretrained_bert/tokenization_openai.py b/pytorch_pretrained_bert/tokenization_openai.py new file mode 100644 index 0000000..d6fbc68 --- /dev/null +++ b/pytorch_pretrained_bert/tokenization_openai.py @@ -0,0 +1,313 @@ +# coding=utf-8 +# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
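+# Illustrative usage sketch (assumes the pre-trained 'openai-gpt' vocabulary and merges
+# files can be downloaded, or that local vocab/merges paths are passed instead):
+#
+#   tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
+#   tokens = tokenizer.tokenize("Hello there!")     # lower-cased BPE sub-word tokens
+#   ids = tokenizer.convert_tokens_to_ids(tokens)   # vocabulary indices (0 if unknown)
+#   text = tokenizer.decode(ids)                    # reassembles words, cleans up spacing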
+"""Tokenization classes for OpenAI GPT.""" +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import json +import logging +import os +import re +import sys +from io import open + +from tqdm import tqdm + +from .file_utils import cached_path +from .tokenization import BasicTokenizer + +logger = logging.getLogger(__name__) + +PRETRAINED_VOCAB_ARCHIVE_MAP = { + 'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json", +} +PRETRAINED_MERGES_ARCHIVE_MAP = { + 'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt", +} +PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = { + 'openai-gpt': 512, +} +VOCAB_NAME = 'vocab.json' +MERGES_NAME = 'merges.txt' +SPECIAL_TOKENS_NAME = 'special_tokens.txt' + +def get_pairs(word): + """ + Return set of symbol pairs in a word. + word is represented as tuple of symbols (symbols being variable-length strings) + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + +def text_standardize(text): + """ + fixes some issues the spacy tokenizer had on books corpus + also does some whitespace standardization + """ + text = text.replace('—', '-') + text = text.replace('–', '-') + text = text.replace('―', '-') + text = text.replace('…', '...') + text = text.replace('´', "'") + text = re.sub(r'''(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)''', r' \1 ', text) + text = re.sub(r'\s*\n\s*', ' \n ', text) + text = re.sub(r'[^\S\n]+', ' ', text) + return text.strip() + +class OpenAIGPTTokenizer(object): + """ + BPE tokenizer. Peculiarities: + - lower case all inputs + - uses SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not. + - argument special_tokens and function set_special_tokens: + can be used to add additional symbols (ex: "__classify__") to a vocabulary. + """ + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs): + """ + Instantiate a PreTrainedBertModel from a pre-trained model file. + Download and cache the pre-trained model file if needed. + """ + if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: + vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path] + merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path] + special_tokens_file = None + else: + vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME) + merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME) + special_tokens_file = os.path.join(pretrained_model_name_or_path, SPECIAL_TOKENS_NAME) + if not os.path.exists(special_tokens_file): + special_tokens_file = None + else: + logger.info("loading special tokens file {}".format(special_tokens_file)) + # redirect to the cache, if necessary + try: + resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) + resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir) + except EnvironmentError: + logger.error( + "Model name '{}' was not found in model name list ({}). 
" + "We assumed '{}' was a path or url but couldn't find files {} and {} " + "at this path or url.".format( + pretrained_model_name_or_path, + ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), + pretrained_model_name_or_path, + vocab_file, merges_file)) + return None + if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file: + logger.info("loading vocabulary file {}".format(vocab_file)) + logger.info("loading merges file {}".format(merges_file)) + else: + logger.info("loading vocabulary file {} from cache at {}".format( + vocab_file, resolved_vocab_file)) + logger.info("loading merges file {} from cache at {}".format( + merges_file, resolved_merges_file)) + if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP: + # if we're using a pretrained model, ensure the tokenizer wont index sequences longer + # than the number of positional embeddings + max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path] + kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) + # Instantiate tokenizer. + if special_tokens_file and 'special_tokens' not in kwargs: + special_tokens = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1] + else: + special_tokens = kwargs.pop('special_tokens', []) + tokenizer = cls(resolved_vocab_file, resolved_merges_file, special_tokens=special_tokens, *inputs, **kwargs) + return tokenizer + + def __init__(self, vocab_file, merges_file, special_tokens=None, max_len=None): + try: + import ftfy + import spacy + self.nlp = spacy.load('en_core_web_sm', disable=['parser', 'tagger', 'ner', 'textcat']) + self.fix_text = ftfy.fix_text + except ImportError: + logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.") + self.nlp = BasicTokenizer(do_lower_case=True, + never_split=special_tokens if special_tokens is not None else []) + self.fix_text = None + + self.max_len = max_len if max_len is not None else int(1e12) + self.encoder = json.load(open(vocab_file, encoding="utf-8")) + self.decoder = {v:k for k,v in self.encoder.items()} + merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1] + merges = [tuple(merge.split()) for merge in merges] + self.bpe_ranks = dict(zip(merges, range(len(merges)))) + self.cache = {} + self.special_tokens = {} + self.special_tokens_decoder = {} + self.set_special_tokens(special_tokens) + + def __len__(self): + return len(self.encoder) + len(self.special_tokens) + + def set_special_tokens(self, special_tokens): + """ Add a list of additional tokens to the encoder. + The additional tokens are indexed starting from the last index of the + current vocabulary in the order of the `special_tokens` list. 
+ """ + if not special_tokens: + self.special_tokens = {} + self.special_tokens_decoder = {} + return + self.special_tokens = dict((tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens)) + self.special_tokens_decoder = {v:k for k, v in self.special_tokens.items()} + if self.fix_text is None: + # Using BERT's BasicTokenizer: we can update the tokenizer + self.nlp.never_split = special_tokens + logger.info("Special tokens {}".format(self.special_tokens)) + + def bpe(self, token): + word = tuple(token[:-1]) + (token[-1] + '',) + if token in self.cache: + return self.cache[token] + pairs = get_pairs(word) + + if not pairs: + return token+'' + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except: + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word)-1 and word[i+1] == second: + new_word.append(first+second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = ' '.join(word) + if word == '\n ': + word = '\n' + self.cache[token] = word + return word + + def tokenize(self, text): + """ Tokenize a string. """ + split_tokens = [] + if self.fix_text is None: + # Using BERT's BasicTokenizer + text = self.nlp.tokenize(text) + for token in text: + split_tokens.extend([t for t in self.bpe(token).split(' ')]) + else: + # Using SpaCy & ftfy (original tokenization process of OpenAI GPT) + text = self.nlp(text_standardize(self.fix_text(text))) + for token in text: + split_tokens.extend([t for t in self.bpe(token.text.lower()).split(' ')]) + return split_tokens + + def convert_tokens_to_ids(self, tokens): + """ Converts a sequence of tokens into ids using the vocab. """ + ids = [] + if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)): + if tokens in self.special_tokens: + return self.special_tokens[tokens] + else: + return self.encoder.get(tokens, 0) + for token in tokens: + if token in self.special_tokens: + ids.append(self.special_tokens[token]) + else: + ids.append(self.encoder.get(token, 0)) + if len(ids) > self.max_len: + logger.warning( + "Token indices sequence length is longer than the specified maximum " + " sequence length for this OpenAI GPT model ({} > {}). 
Running this" + " sequence through the model will result in indexing errors".format(len(ids), self.max_len) + ) + return ids + + def convert_ids_to_tokens(self, ids, skip_special_tokens=False): + """Converts a sequence of ids in BPE tokens using the vocab.""" + tokens = [] + for i in ids: + if i in self.special_tokens_decoder: + if not skip_special_tokens: + tokens.append(self.special_tokens_decoder[i]) + else: + tokens.append(self.decoder[i]) + return tokens + + def encode(self, text): + return self.convert_tokens_to_ids(self.tokenize(text)) + + def decode(self, ids, skip_special_tokens=False, clean_up_tokenization_spaces=True): + """Converts a sequence of ids in a string.""" + tokens = self.convert_ids_to_tokens(ids, skip_special_tokens=skip_special_tokens) + out_string = ''.join(tokens).replace('', ' ').strip() + if clean_up_tokenization_spaces: + out_string = out_string.replace('', '') + out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ',' + ).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't" + ).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re") + return out_string + + def save_vocabulary(self, vocab_path): + """Save the tokenizer vocabulary and merge files to a directory.""" + if not os.path.isdir(vocab_path): + logger.error("Vocabulary path ({}) should be a directory".format(vocab_path)) + return + vocab_file = os.path.join(vocab_path, VOCAB_NAME) + merge_file = os.path.join(vocab_path, MERGES_NAME) + special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME) + + with open(vocab_file, 'w', encoding='utf-8') as f: + f.write(json.dumps(self.encoder, ensure_ascii=False)) + + index = 0 + with open(merge_file, "w", encoding="utf-8") as writer: + writer.write(u'#version: 0.2\n') + for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!".format(merge_file)) + index = token_index + writer.write(' '.join(bpe_tokens) + u'\n') + index += 1 + + index = len(self.encoder) + with open(special_tokens_file, 'w', encoding='utf-8') as writer: + for token, token_index in sorted(self.special_tokens.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning("Saving special tokens vocabulary to {}: BPE indices are not consecutive." + " Please check that the tokenizer is not corrupted!".format(special_tokens_file)) + index = token_index + writer.write(token + u'\n') + index += 1 + + return vocab_file, merge_file, special_tokens_file diff --git a/pytorch_pretrained_bert/tokenization_transfo_xl.py b/pytorch_pretrained_bert/tokenization_transfo_xl.py new file mode 100644 index 0000000..ddebc57 --- /dev/null +++ b/pytorch_pretrained_bert/tokenization_transfo_xl.py @@ -0,0 +1,586 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Tokenization classes for Transformer XL model. + Adapted from https://github.com/kimiyoung/transformer-xl. +""" +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import glob +import logging +import os +import sys +from collections import Counter, OrderedDict +from io import open +import unicodedata + +import torch +import numpy as np + +from .file_utils import cached_path + +if sys.version_info[0] == 2: + import cPickle as pickle +else: + import pickle + + +logger = logging.getLogger(__name__) + +PRETRAINED_VOCAB_ARCHIVE_MAP = { + 'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-vocab.bin", +} +VOCAB_NAME = 'vocab.bin' + +PRETRAINED_CORPUS_ARCHIVE_MAP = { + 'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-corpus.bin", +} +CORPUS_NAME = 'corpus.bin' + +class TransfoXLTokenizer(object): + """ + Transformer-XL tokenizer adapted from Vocab class in https://github.com/kimiyoung/transformer-xl + """ + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs): + """ + Instantiate a TransfoXLTokenizer. + The TransfoXLTokenizer. + """ + if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: + vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path] + else: + if os.path.isdir(pretrained_model_name_or_path): + vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME) + else: + vocab_file = pretrained_model_name_or_path + # redirect to the cache, if necessary + try: + resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) + except EnvironmentError: + logger.error( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url but couldn't find files {} " + "at this path or url.".format( + pretrained_model_name_or_path, + ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), + pretrained_model_name_or_path, + vocab_file)) + return None + if resolved_vocab_file == vocab_file: + logger.info("loading vocabulary file {}".format(vocab_file)) + else: + logger.info("loading vocabulary file {} from cache at {}".format( + vocab_file, resolved_vocab_file)) + + # Instantiate tokenizer. 
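+        # The pre-trained vocabulary file is a torch-serialized dict of attributes
+        # (e.g. idx2sym, sym2idx, counter); loading it just copies those entries onto
+        # a freshly constructed tokenizer instance.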
+        tokenizer = cls(*inputs, **kwargs)
+        vocab_dict = torch.load(resolved_vocab_file)
+        for key, value in vocab_dict.items():
+            tokenizer.__dict__[key] = value
+        return tokenizer
+
+    def __init__(self, special=[], min_freq=0, max_size=None, lower_case=False,
+                 delimiter=None, vocab_file=None, never_split=("<unk>", "<eos>", "<formula>")):
+        self.counter = Counter()
+        self.special = special
+        self.min_freq = min_freq
+        self.max_size = max_size
+        self.lower_case = lower_case
+        self.delimiter = delimiter
+        self.vocab_file = vocab_file
+        self.never_split = never_split
+
+    def count_file(self, path, verbose=False, add_eos=False):
+        if verbose: print('counting file {} ...'.format(path))
+        assert os.path.exists(path)
+
+        sents = []
+        with open(path, 'r', encoding='utf-8') as f:
+            for idx, line in enumerate(f):
+                if verbose and idx > 0 and idx % 500000 == 0:
+                    print('    line {}'.format(idx))
+                symbols = self.tokenize(line, add_eos=add_eos)
+                self.counter.update(symbols)
+                sents.append(symbols)
+
+        return sents
+
+    def count_sents(self, sents, verbose=False):
+        """
+            sents : a list of sentences, each a list of tokenized symbols
+        """
+        if verbose: print('counting {} sents ...'.format(len(sents)))
+        for idx, symbols in enumerate(sents):
+            if verbose and idx > 0 and idx % 500000 == 0:
+                print('    line {}'.format(idx))
+            self.counter.update(symbols)
+
+    def _build_from_file(self, vocab_file):
+        self.idx2sym = []
+        self.sym2idx = OrderedDict()
+
+        with open(vocab_file, 'r', encoding='utf-8') as f:
+            for line in f:
+                symb = line.strip().split()[0]
+                self.add_symbol(symb)
+        if '<UNK>' in self.sym2idx:
+            self.unk_idx = self.sym2idx['<UNK>']
+        elif '<unk>' in self.sym2idx:
+            self.unk_idx = self.sym2idx['<unk>']
+        else:
+            raise ValueError('No <unk> token in vocabulary')
+
+    def save_vocabulary(self, vocab_path):
+        """Save the tokenizer vocabulary to a directory or file."""
+        index = 0
+        if os.path.isdir(vocab_path):
+            vocab_file = os.path.join(vocab_path, VOCAB_NAME)
+        torch.save(self.__dict__, vocab_file)
+        return vocab_file
+
+    def build_vocab(self):
+        if self.vocab_file:
+            print('building vocab from {}'.format(self.vocab_file))
+            self._build_from_file(self.vocab_file)
+            print('final vocab size {}'.format(len(self)))
+        else:
+            print('building vocab with min_freq={}, max_size={}'.format(
+                self.min_freq, self.max_size))
+            self.idx2sym = []
+            self.sym2idx = OrderedDict()
+
+            for sym in self.special:
+                self.add_special(sym)
+
+            for sym, cnt in self.counter.most_common(self.max_size):
+                if cnt < self.min_freq: break
+                self.add_symbol(sym)
+
+            print('final vocab size {} from {} unique tokens'.format(
+                len(self), len(self.counter)))
+
+    def encode_file(self, path, ordered=False, verbose=False, add_eos=True,
+            add_double_eos=False):
+        if verbose: print('encoding file {} ...'.format(path))
+        assert os.path.exists(path)
+        encoded = []
+        with open(path, 'r', encoding='utf-8') as f:
+            for idx, line in enumerate(f):
+                if verbose and idx > 0 and idx % 500000 == 0:
+                    print('    line {}'.format(idx))
+                symbols = self.tokenize(line, add_eos=add_eos,
+                    add_double_eos=add_double_eos)
+                encoded.append(self.convert_to_tensor(symbols))
+
+        if ordered:
+            encoded = torch.cat(encoded)
+
+        return encoded
+
+    def encode_sents(self, sents, ordered=False, verbose=False):
+        if verbose: print('encoding {} sents ...'.format(len(sents)))
+        encoded = []
+        for idx, symbols in enumerate(sents):
+            if verbose and idx > 0 and idx % 500000 == 0:
+                print('    line {}'.format(idx))
+            encoded.append(self.convert_to_tensor(symbols))
+
+        if ordered:
+            encoded = torch.cat(encoded)
+
+        return encoded
+
+    def add_special(self, sym):
+        if sym not in self.sym2idx:
+            self.idx2sym.append(sym)
+            self.sym2idx[sym] = len(self.idx2sym) - 1
+            setattr(self, '{}_idx'.format(sym.strip('<>')), self.sym2idx[sym])
+
+    def add_symbol(self, sym):
+        if sym not in self.sym2idx:
+            self.idx2sym.append(sym)
+            self.sym2idx[sym] = len(self.idx2sym) - 1
+
+    def get_sym(self, idx):
+        assert 0 <= idx < len(self), 'Index {} out of vocabulary range'.format(idx)
+        return self.idx2sym[idx]
+
+    def get_idx(self, sym):
+        if sym in self.sym2idx:
+            return self.sym2idx[sym]
+        else:
+            # print('encounter unk {}'.format(sym))
+            # assert '<eos>' not in sym
+            if hasattr(self, 'unk_idx'):
+                return self.sym2idx.get(sym, self.unk_idx)
+            # Backward compatibility with pre-trained models
+            elif '<UNK>' in self.sym2idx:
+                return self.sym2idx['<UNK>']
+            elif '<unk>' in self.sym2idx:
+                return self.sym2idx['<unk>']
+            else:
+                raise ValueError('Token not in vocabulary and no <unk> token in vocabulary for replacement')
+
+    def convert_ids_to_tokens(self, indices):
+        """Converts a sequence of indices into symbols using the vocab."""
+        return [self.get_sym(idx) for idx in indices]
+
+    def convert_tokens_to_ids(self, symbols):
+        """Converts a sequence of symbols into ids using the vocab."""
+        return [self.get_idx(sym) for sym in symbols]
+
+    def convert_to_tensor(self, symbols):
+        return torch.LongTensor(self.convert_tokens_to_ids(symbols))
+
+    def decode(self, indices, exclude=None):
+        """Converts a sequence of indices into a string."""
+        if exclude is None:
+            return ' '.join([self.get_sym(idx) for idx in indices])
+        else:
+            return ' '.join([self.get_sym(idx) for idx in indices if idx not in exclude])
+
+    def __len__(self):
+        return len(self.idx2sym)
+
+    def tokenize(self, line, add_eos=False, add_double_eos=False):
+        line = line.strip()
+        # convert to lower case
+        if self.lower_case:
+            line = line.lower()
+
+        # empty delimiter '' will evaluate False
+        if self.delimiter == '':
+            symbols = line
+        else:
+            symbols = line.split(self.delimiter)
+
+        if add_double_eos: # lm1b
+            return ['<S>'] + symbols + ['<S>']
+        elif add_eos:
+            return symbols + ['<eos>']
+        else:
+            return symbols
+
+
+class LMOrderedIterator(object):
+    def __init__(self, data, bsz, bptt, device='cpu', ext_len=None):
+        """
+            data -- LongTensor -- the LongTensor is strictly ordered
+        """
+        self.bsz = bsz
+        self.bptt = bptt
+        self.ext_len = ext_len if ext_len is not None else 0
+
+        self.device = device
+
+        # Work out how cleanly we can divide the dataset into bsz parts.
+        self.n_step = data.size(0) // bsz
+
+        # Trim off any extra elements that wouldn't cleanly fit (remainders).
+        data = data.narrow(0, 0, self.n_step * bsz)
+
+        # Evenly divide the data across the bsz batches.
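+        # For example, 10 tokens with bsz=2 give a (5, 2) tensor: column 0 holds
+        # steps 0-4 and column 1 holds steps 5-9, so each column is a contiguous stream.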
+ self.data = data.view(bsz, -1).t().contiguous().to(device) + + # Number of mini-batches + self.n_batch = (self.n_step + self.bptt - 1) // self.bptt + + def get_batch(self, i, bptt=None): + if bptt is None: bptt = self.bptt + seq_len = min(bptt, self.data.size(0) - 1 - i) + + end_idx = i + seq_len + beg_idx = max(0, i - self.ext_len) + + data = self.data[beg_idx:end_idx] + target = self.data[i+1:i+1+seq_len] + + data_out = data.transpose(0, 1).contiguous().to(self.device) + target_out = target.transpose(0, 1).contiguous().to(self.device) + + return data_out, target_out, seq_len + + def get_fixlen_iter(self, start=0): + for i in range(start, self.data.size(0) - 1, self.bptt): + yield self.get_batch(i) + + def get_varlen_iter(self, start=0, std=5, min_len=5, max_deviation=3): + max_len = self.bptt + max_deviation * std + i = start + while True: + bptt = self.bptt if np.random.random() < 0.95 else self.bptt / 2. + bptt = min(max_len, max(min_len, int(np.random.normal(bptt, std)))) + data, target, seq_len = self.get_batch(i, bptt) + i += seq_len + yield data, target, seq_len + if i >= self.data.size(0) - 2: + break + + def __iter__(self): + return self.get_fixlen_iter() + + +class LMShuffledIterator(object): + def __init__(self, data, bsz, bptt, device='cpu', ext_len=None, shuffle=False): + """ + data -- list[LongTensor] -- there is no order among the LongTensors + """ + self.data = data + + self.bsz = bsz + self.bptt = bptt + self.ext_len = ext_len if ext_len is not None else 0 + + self.device = device + self.shuffle = shuffle + + def get_sent_stream(self): + # index iterator + epoch_indices = np.random.permutation(len(self.data)) if self.shuffle \ + else np.array(range(len(self.data))) + + # sentence iterator + for idx in epoch_indices: + yield self.data[idx] + + def stream_iterator(self, sent_stream): + # streams for each data in the batch + streams = [None] * self.bsz + + data = torch.LongTensor(self.bptt, self.bsz) + target = torch.LongTensor(self.bptt, self.bsz) + + n_retain = 0 + + while True: + # data : [n_retain+bptt x bsz] + # target : [bptt x bsz] + data[n_retain:].fill_(-1) + target.fill_(-1) + + valid_batch = True + + for i in range(self.bsz): + n_filled = 0 + try: + while n_filled < self.bptt: + if streams[i] is None or len(streams[i]) <= 1: + streams[i] = next(sent_stream) + # number of new tokens to fill in + n_new = min(len(streams[i]) - 1, self.bptt - n_filled) + # first n_retain tokens are retained from last batch + data[n_retain+n_filled:n_retain+n_filled+n_new, i] = \ + streams[i][:n_new] + target[n_filled:n_filled+n_new, i] = \ + streams[i][1:n_new+1] + streams[i] = streams[i][n_new:] + n_filled += n_new + except StopIteration: + valid_batch = False + break + + if not valid_batch: + return + + data_out = data.transpose(0, 1).contiguous().to(self.device) + target_out = target.transpose(0, 1).contiguous().to(self.device) + + yield data_out, target_out, self.bptt + + n_retain = min(data.size(0), self.ext_len) + if n_retain > 0: + data[:n_retain] = data[-n_retain:] + data.resize_(n_retain + self.bptt, data.size(1)) + + def __iter__(self): + # sent_stream is an iterator + sent_stream = self.get_sent_stream() + + for batch in self.stream_iterator(sent_stream): + yield batch + + +class LMMultiFileIterator(LMShuffledIterator): + def __init__(self, paths, vocab, bsz, bptt, device='cpu', ext_len=None, + shuffle=False): + + self.paths = paths + self.vocab = vocab + + self.bsz = bsz + self.bptt = bptt + self.ext_len = ext_len if ext_len is not None else 0 + + self.device = device + 
self.shuffle = shuffle + + def get_sent_stream(self, path): + sents = self.vocab.encode_file(path, add_double_eos=True) + if self.shuffle: + np.random.shuffle(sents) + sent_stream = iter(sents) + + return sent_stream + + def __iter__(self): + if self.shuffle: + np.random.shuffle(self.paths) + + for path in self.paths: + # sent_stream is an iterator + sent_stream = self.get_sent_stream(path) + for batch in self.stream_iterator(sent_stream): + yield batch + + +class TransfoXLCorpus(object): + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs): + """ + Instantiate a pre-processed corpus. + """ + vocab = TransfoXLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + if pretrained_model_name_or_path in PRETRAINED_CORPUS_ARCHIVE_MAP: + corpus_file = PRETRAINED_CORPUS_ARCHIVE_MAP[pretrained_model_name_or_path] + else: + corpus_file = os.path.join(pretrained_model_name_or_path, CORPUS_NAME) + # redirect to the cache, if necessary + try: + resolved_corpus_file = cached_path(corpus_file, cache_dir=cache_dir) + except EnvironmentError: + logger.error( + "Corpus '{}' was not found in corpus list ({}). " + "We assumed '{}' was a path or url but couldn't find files {} " + "at this path or url.".format( + pretrained_model_name_or_path, + ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), + pretrained_model_name_or_path, + corpus_file)) + return None + if resolved_corpus_file == corpus_file: + logger.info("loading corpus file {}".format(corpus_file)) + else: + logger.info("loading corpus file {} from cache at {}".format( + corpus_file, resolved_corpus_file)) + + # Instantiate tokenizer. + corpus = cls(*inputs, **kwargs) + corpus_dict = torch.load(resolved_corpus_file) + for key, value in corpus_dict.items(): + corpus.__dict__[key] = value + corpus.vocab = vocab + if corpus.train is not None: + corpus.train = torch.tensor(corpus.train, dtype=torch.long) + if corpus.valid is not None: + corpus.valid = torch.tensor(corpus.valid, dtype=torch.long) + if corpus.test is not None: + corpus.test = torch.tensor(corpus.test, dtype=torch.long) + return corpus + + def __init__(self, *args, **kwargs): + self.vocab = TransfoXLTokenizer(*args, **kwargs) + self.dataset = None + self.train = None + self.valid = None + self.test = None + + def build_corpus(self, path, dataset): + self.dataset = dataset + + if self.dataset in ['ptb', 'wt2', 'enwik8', 'text8']: + self.vocab.count_file(os.path.join(path, 'train.txt')) + self.vocab.count_file(os.path.join(path, 'valid.txt')) + self.vocab.count_file(os.path.join(path, 'test.txt')) + elif self.dataset == 'wt103': + self.vocab.count_file(os.path.join(path, 'train.txt')) + elif self.dataset == 'lm1b': + train_path_pattern = os.path.join( + path, '1-billion-word-language-modeling-benchmark-r13output', + 'training-monolingual.tokenized.shuffled', 'news.en-*') + train_paths = glob.glob(train_path_pattern) + # the vocab will load from file when build_vocab() is called + + self.vocab.build_vocab() + + if self.dataset in ['ptb', 'wt2', 'wt103']: + self.train = self.vocab.encode_file( + os.path.join(path, 'train.txt'), ordered=True) + self.valid = self.vocab.encode_file( + os.path.join(path, 'valid.txt'), ordered=True) + self.test = self.vocab.encode_file( + os.path.join(path, 'test.txt'), ordered=True) + elif self.dataset in ['enwik8', 'text8']: + self.train = self.vocab.encode_file( + os.path.join(path, 'train.txt'), ordered=True, add_eos=False) + self.valid = self.vocab.encode_file( + os.path.join(path, 
'valid.txt'), ordered=True, add_eos=False) + self.test = self.vocab.encode_file( + os.path.join(path, 'test.txt'), ordered=True, add_eos=False) + elif self.dataset == 'lm1b': + self.train = train_paths + self.valid = self.vocab.encode_file( + os.path.join(path, 'valid.txt'), ordered=False, add_double_eos=True) + self.test = self.vocab.encode_file( + os.path.join(path, 'test.txt'), ordered=False, add_double_eos=True) + + def get_iterator(self, split, *args, **kwargs): + if split == 'train': + if self.dataset in ['ptb', 'wt2', 'wt103', 'enwik8', 'text8']: + data_iter = LMOrderedIterator(self.train, *args, **kwargs) + elif self.dataset == 'lm1b': + kwargs['shuffle'] = True + data_iter = LMMultiFileIterator(self.train, self.vocab, *args, **kwargs) + elif split in ['valid', 'test']: + data = self.valid if split == 'valid' else self.test + if self.dataset in ['ptb', 'wt2', 'wt103', 'enwik8', 'text8']: + data_iter = LMOrderedIterator(data, *args, **kwargs) + elif self.dataset == 'lm1b': + data_iter = LMShuffledIterator(data, *args, **kwargs) + + return data_iter + + +def get_lm_corpus(datadir, dataset): + fn = os.path.join(datadir, 'cache.pt') + fn_pickle = os.path.join(datadir, 'cache.pkl') + if os.path.exists(fn): + print('Loading cached dataset...') + corpus = torch.load(fn_pickle) + elif os.path.exists(fn): + print('Loading cached dataset from pickle...') + with open(fn, "rb") as fp: + corpus = pickle.load(fp) + else: + print('Producing dataset {}...'.format(dataset)) + kwargs = {} + if dataset in ['wt103', 'wt2']: + kwargs['special'] = [''] + kwargs['lower_case'] = False + elif dataset == 'ptb': + kwargs['special'] = [''] + kwargs['lower_case'] = True + elif dataset == 'lm1b': + kwargs['special'] = [] + kwargs['lower_case'] = False + kwargs['vocab_file'] = os.path.join(datadir, '1b_word_vocab.txt') + elif dataset in ['enwik8', 'text8']: + pass + + corpus = TransfoXLCorpus(datadir, dataset, **kwargs) + torch.save(corpus, fn) + + return corpus diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..8c47219 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,9 @@ +torch +pytorch-ignite +#pytorch-pretrained-bert >= 0.6.2 +tensorboardX==1.6 +tensorflow # for tensorboardX +boto3 +requests +tqdm +regex \ No newline at end of file diff --git a/train.py b/train.py new file mode 100644 index 0000000..688f1ca --- /dev/null +++ b/train.py @@ -0,0 +1,239 @@ +# Copyright (c) 2019-present, HuggingFace Inc. +# All rights reserved. This source code is licensed under the BSD-style license found in the LICENSE file in the root directory of this source tree. 
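A note on the cache handling in get_lm_corpus above: both branches test the same 'cache.pt' path, so the elif can never run, and the first branch loads 'cache.pkl' with torch.load even though it checked for 'cache.pt'. A short sketch of what the intent presumably is (torch cache first, pickle fallback, otherwise rebuild); this is an assumption, not the patch's code:

    import os
    import pickle
    import torch

    def load_corpus_cache(datadir):
        """Sketch: prefer the torch-serialized cache, fall back to the pickle cache,
        return None when neither exists so the caller rebuilds the corpus."""
        fn_pt = os.path.join(datadir, 'cache.pt')
        fn_pkl = os.path.join(datadir, 'cache.pkl')
        if os.path.exists(fn_pt):
            print('Loading cached dataset...')
            return torch.load(fn_pt)
        if os.path.exists(fn_pkl):
            print('Loading cached dataset from pickle...')
            with open(fn_pkl, 'rb') as fp:
                return pickle.load(fp)
        return None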
+import os +import math +import logging +from pprint import pformat +from argparse import ArgumentParser +from collections import defaultdict +from itertools import chain +from config import Config + +import torch +from torch.nn.parallel import DistributedDataParallel +from torch.utils.data import DataLoader, TensorDataset +from ignite.engine import Engine, Events +from ignite.handlers import ModelCheckpoint +from ignite.metrics import Accuracy, Loss, MetricsLambda, RunningAverage +from ignite.contrib.handlers import ProgressBar, PiecewiseLinear +from ignite.contrib.handlers.tensorboard_logger import TensorboardLogger, OutputHandler, OptimizerParamsHandler +from pytorch_pretrained_bert import (OpenAIAdam, OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer, + GPT2DoubleHeadsModel, GPT2Tokenizer, WEIGHTS_NAME, CONFIG_NAME, + BertModel, BertTokenizer) + +from utils import get_dataset + +SPECIAL_TOKENS = ["", "", "", "", ""] +MODEL_INPUTS = ["input_ids", "mc_token_ids", "lm_labels", "mc_labels", "token_type_ids"] +PADDED_INPUTS = ["input_ids", "lm_labels", "token_type_ids"] + +logger = logging.getLogger(__file__) + +def average_distributed_scalar(scalar, config): + """ Average a scalar over the nodes if we are in distributed training. We use this for distributed evaluation. """ + if config.local_rank == -1: + return scalar + scalar_t = torch.tensor(scalar, dtype=torch.float, device=config.device) / torch.distributed.get_world_size() + torch.distributed.all_reduce(scalar_t, op=torch.distributed.ReduceOp.SUM) + return scalar_t.item() + + +def pad_dataset(dataset, padding=0): + """ Pad the dataset. This could be optimized by defining a Dataset class and padd only batches but this is simpler. """ + max_l = max(len(x) for x in dataset["input_ids"]) + for name in PADDED_INPUTS: + dataset[name] = [x + [padding if name != "lm_labels" else -1] * (max_l - len(x)) for x in dataset[name]] + return dataset + + +def build_input_from_segments(history, reply, tokenizer, lm_labels=False, with_eos=True): + """ Build a sequence of input from 3 segments: persona, history and last reply """ + bos, eos, speaker1, speaker2 = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[:-1]) + + instance = {} + sequence = [[bos] + history[0]] + history[1:] +[reply +([eos] if with_eos else [])] + sequence = [sequence[0]] + [[speaker2 if (len(sequence)-i) % 2 else speaker1] + s for i, s in enumerate(sequence[1:])] + + instance["input_ids"] = list(chain(*sequence)) + instance["token_type_ids"] = [speaker2 if i % 2 else speaker1 for i, s in enumerate(sequence) for _ in s] # the last for is for repeating the speaker1 and speaker2 for all tokens + instance["mc_token_ids"] = len(instance["input_ids"]) - 1 + instance["lm_labels"] = [-1] * len(instance["input_ids"]) + if lm_labels: + instance["lm_labels"] = ([-1] * sum(len(s) for s in sequence[:-1])) + [-1] + sequence[-1][1:] #all -1 except for reply, reply is just the ids + return instance, sequence + + +def get_data_loaders(config, tokenizer): + """ Prepare the dataset for training and evaluation """ + personachat = get_dataset(tokenizer, config.dataset_path, config.dataset_cache) + + logger.info("Build inputs and labels") + datasets = {"train": defaultdict(list), "valid": defaultdict(list)} + + gpu_max_length = 310 #this depends on the gpu memory size, using bigger gpu memory you can increase this to include longer inputs + for dataset_name, dataset in personachat.items(): + num_candidates = len(dataset[0]["utterances"][0]["candidates"]) + if config.num_candidates > 0 and dataset_name == 
'train': + num_candidates = min(config.num_candidates, num_candidates) + for dialog in dataset: + for utterance in dialog["utterances"]: + history = utterance["history"][-(2*config.max_history+1):] + for j, candidate in enumerate(utterance["candidates"][-num_candidates:]): + lm_labels = bool(j == num_candidates-1) #the true label is always the last one in list of candidates + instance, _ = build_input_from_segments(history, candidate, tokenizer, lm_labels) + #print(len(instance["input_ids"])) + ## + if len(instance["input_ids"]) > gpu_max_length: + truncated_history = [hist[:10] for hist in history] + truncated_candidate = candidate[:10] + instance, _ = build_input_from_segments(truncated_history, truncated_candidate, tokenizer, lm_labels) + + for input_name, input_array in instance.items(): + datasets[dataset_name][input_name].append(input_array) + datasets[dataset_name]["mc_labels"].append(num_candidates - 1) + datasets[dataset_name]["n_candidates"] = num_candidates + logger.info("Pad inputs and convert to Tensor") + tensor_datasets = {"train": [], "valid": []} + for dataset_name, dataset in datasets.items(): + dataset = pad_dataset(dataset, padding=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1])) + for input_name in MODEL_INPUTS: + tensor = torch.tensor(dataset[input_name]) + if input_name != "mc_labels": + tensor = tensor.view((-1, datasets[dataset_name]["n_candidates"]) + tensor.shape[1:]) + tensor_datasets[dataset_name].append(tensor) + + logger.info("Build train and validation dataloaders") + train_dataset, valid_dataset = TensorDataset(*tensor_datasets["train"]), TensorDataset(*tensor_datasets["valid"]) + train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) if config.distributed else None + valid_sampler = torch.utils.data.distributed.DistributedSampler(valid_dataset) if config.distributed else None + train_loader = DataLoader(train_dataset, sampler=train_sampler, batch_size=config.train_batch_size, shuffle=False) + valid_loader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=config.valid_batch_size, shuffle=False) + + logger.info("Train dataset (Batch, Candidates, Seq length): {}".format(train_dataset.tensors[0].shape)) + logger.info("Valid dataset (Batch, Candidates, Seq length): {}".format(valid_dataset.tensors[0].shape)) + return train_loader, valid_loader, train_sampler, valid_sampler + + +def train(): + config_file = "configs/train_full_config.json" + config = Config.from_json_file(config_file) + + # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. 
logger.info => log main process only, logger.warning => log all processes + logging.basicConfig(level=logging.INFO if config.local_rank in [-1, 0] else logging.WARN) + logger.warning("Running process %d", config.local_rank) # This is a logger.warning: it will be printed by all distributed processes + logger.info("Arguments: %s", pformat(config)) + + # Initialize distributed training if needed + config.distributed = (config.local_rank != -1) + if config.distributed: + torch.cuda.set_device(config.local_rank) + config.device = torch.device("cuda", config.local_rank) + torch.distributed.init_process_group(backend='nccl', init_method='env://') + + logger.info("Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning") + tokenizer_class = GPT2Tokenizer if "gpt2" in config.model_checkpoint else OpenAIGPTTokenizer + tokenizer = tokenizer_class.from_pretrained(config.model_checkpoint) + model_class = GPT2DoubleHeadsModel if "gpt2" in config.model_checkpoint else OpenAIGPTDoubleHeadsModel + model = model_class.from_pretrained(config.model_checkpoint) + tokenizer.set_special_tokens(SPECIAL_TOKENS) + model.set_num_special_tokens(len(SPECIAL_TOKENS)) + model.to(config.device) + optimizer = OpenAIAdam(model.parameters(), lr=config.lr) + + # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last) + if config.fp16: + from apex import amp # Apex is only required if we use fp16 training + model, optimizer = amp.initialize(model, optimizer, opt_level=config.fp16) + if config.distributed: + model = DistributedDataParallel(model, device_ids=[config.local_rank], output_device=config.local_rank) + + logger.info("Prepare datasets") + train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(config, tokenizer) + + # Training function and trainer + def update(engine, batch): + model.train() + batch = tuple(input_tensor.to(config.device) for input_tensor in batch) + lm_loss, mc_loss = model(*batch) + loss = (lm_loss * config.lm_coef + mc_loss * config.mc_coef) / config.gradient_accumulation_steps + if config.fp16: + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), config.max_norm) + else: + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_norm) + if engine.state.iteration % config.gradient_accumulation_steps == 0: + optimizer.step() + optimizer.zero_grad() + return loss.item() + trainer = Engine(update) + + # Evaluation function and evaluator (evaluator output is the input of the metrics) + def inference(engine, batch): + model.eval() + with torch.no_grad(): + batch = tuple(input_tensor.to(config.device) for input_tensor in batch) + input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch + #logger.info(tokenizer.decode(input_ids[0, -1, :].tolist())) + model_outputs = model(input_ids, mc_token_ids, token_type_ids=token_type_ids) + lm_logits, mc_logits = model_outputs[0], model_outputs[1] # So we can also use GPT2 outputs + lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1)) + lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1) + return (lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, mc_labels) + evaluator = Engine(inference) + + # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch + trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader)) + 
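The batches consumed by update() and inference() are produced by build_input_from_segments above; a small worked trace may help. Token ids are hypothetical, and the special-token literals are assumed to be the conventional <bos>, <eos>, <speaker1>, <speaker2> (not part of the patch):

    # Two-utterance history plus a reply, with lm_labels=True and with_eos=True.
    history = [[10, 11, 12], [20, 21]]
    reply = [30, 31, 32]
    # sequence after the two list comprehensions:
    #   [[bos, 10, 11, 12], [speaker2, 20, 21], [speaker1, 30, 31, 32, eos]]
    # input_ids      = [bos, 10, 11, 12, speaker2, 20, 21, speaker1, 30, 31, 32, eos]
    # token_type_ids = [speaker1]*4 + [speaker2]*3 + [speaker1]*5   (one id per token of each segment)
    # mc_token_ids   = 11            (index of the last token, read by the classification head)
    # lm_labels      = [-1]*8 + [30, 31, 32, eos]   (only the reply tokens contribute to the LM loss)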
if config.n_epochs < 1: + trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader)) + if config.eval_before_start: + trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader)) + + # Make sure distributed data samplers split the dataset nicely between the distributed processes + if config.distributed: + trainer.add_event_handler(Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch)) + evaluator.add_event_handler(Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch)) + + # Linearly decrease the learning rate from lr to zero + scheduler = PiecewiseLinear(optimizer, "lr", [(0, config.lr), (config.n_epochs * len(train_loader), 0.0)]) + trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) + + # Prepare metrics - note how we compute distributed metrics + RunningAverage(output_transform=lambda x: x).attach(trainer, "loss") + metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1), output_transform=lambda x: (x[0][0], x[1][0])), + "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))} + metrics.update({"average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], config), + "average_accuracy": MetricsLambda(average_distributed_scalar, metrics["accuracy"], config)}) + metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"]) + for name, metric in metrics.items(): + metric.attach(evaluator, name) + + # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train + if config.local_rank in [-1, 0]: + pbar = ProgressBar(persist=True) + pbar.attach(trainer, metric_names=["loss"]) + evaluator.add_event_handler(Events.COMPLETED, lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics))) + + tb_logger = TensorboardLogger(log_dir=config.log_dir) + tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED) + tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED) + tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()), another_engine=trainer), event_name=Events.EPOCH_COMPLETED) + + checkpoint_handler = ModelCheckpoint(tb_logger.writer.log_dir, 'checkpoint', save_interval=1, n_saved=3) + trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': getattr(model, 'module', model)}) # "getattr" take care of distributed encapsulation + + torch.save(config, tb_logger.writer.log_dir + '/model_training_args.bin') + getattr(model, 'module', model).config.to_json_file(os.path.join(tb_logger.writer.log_dir, CONFIG_NAME)) + tokenizer.save_vocabulary(tb_logger.writer.log_dir) + + # Run the training + trainer.run(train_loader, max_epochs=config.n_epochs) + + # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method) + if config.local_rank in [-1, 0] and config.n_epochs > 0: + os.rename(checkpoint_handler._saved[-1][1][-1], os.path.join(tb_logger.writer.log_dir, WEIGHTS_NAME)) # TODO: PR in ignite to have better access to saved file paths (cleaner) + tb_logger.close() + +if __name__ == "__main__": + train() diff --git a/train_emotion_recognition.py b/train_emotion_recognition.py new file mode 100644 index 0000000..e232066 --- /dev/null +++ b/train_emotion_recognition.py @@ -0,0 +1,286 @@ +# 
Copyright (c) 2019-present, HuggingFace Inc. +# All rights reserved. This source code is licensed under the BSD-style license found in the LICENSE file in the root directory of this source tree. +import os +import math +import logging +from pprint import pformat +from argparse import ArgumentParser +from collections import defaultdict +from itertools import chain + +import torch +from torch.nn.parallel import DistributedDataParallel +from torch.utils.data import DataLoader, TensorDataset +from ignite.engine import Engine, Events +from ignite.handlers import ModelCheckpoint +from ignite.metrics import Accuracy, Recall, Loss, MetricsLambda, RunningAverage, Precision, ConfusionMatrix +from ignite.contrib.handlers import ProgressBar, PiecewiseLinear +from ignite.contrib.handlers.tensorboard_logger import TensorboardLogger, OutputHandler, OptimizerParamsHandler + +from config import Config +from pytorch_pretrained_bert import (OpenAIAdam, OpenAIGPTDoubleHeadLMEmotionRecognitionModel, OpenAIGPTTokenizer, + GPT2DoubleHeadsModel, GPT2Tokenizer, WEIGHTS_NAME, CONFIG_NAME) + +from utils import get_dataset, get_dataset_for_daily_dialog + +SPECIAL_TOKENS = ["", "", "", "", + "", "", "", "", "", "", "", + "", "", "", "", + ""] +MODEL_INPUTS = ["input_ids", "mc_token_ids", "lm_labels", "mc_labels", "token_type_ids", "token_emotion_ids"] +PADDED_INPUTS = ["input_ids", "lm_labels", "token_type_ids", "token_emotion_ids"] + +logger = logging.getLogger(__file__) + +def average_distributed_scalar(scalar, config): + """ Average a scalar over the nodes if we are in distributed training. We use this for distributed evaluation. """ + if config.local_rank == -1: + return scalar + scalar_t = torch.tensor(scalar, dtype=torch.float, device=config.device) / torch.distributed.get_world_size() + torch.distributed.all_reduce(scalar_t, op=torch.distributed.ReduceOp.SUM) + return scalar_t.item() + + +def pad_dataset(dataset, padding=0): + """ Pad the dataset. This could be optimized by defining a Dataset class and padd only batches but this is simpler. 
""" + max_l = max(len(x) for x in dataset["input_ids"]) + for name in PADDED_INPUTS: + dataset[name] = [x + [padding if name != "lm_labels" else -1] * (max_l - len(x)) for x in dataset[name]] + return dataset + + +def get_emotion_label(tokenizer, candidate_emotion): + _, _, _, _, no_emotion_id, happiness_id, surprise_id, sadness_id, disgust_id, anger_id, fear_id, _, _, _, _, _ = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS) + if candidate_emotion == happiness_id: + return 0 + elif candidate_emotion == surprise_id: + return 1 + elif candidate_emotion == sadness_id: + return 2 + elif candidate_emotion == disgust_id: + return 3 + elif candidate_emotion == anger_id: + return 4 + elif candidate_emotion == fear_id: + return 5 + elif candidate_emotion == no_emotion_id: + return 6 + + +def build_input_from_segments(history, emotions, reply, true_emotion, tokenizer, with_eos=True): + """ Build a sequence of input from 3 segments: persona, history and last reply """ + bos, eos, speaker1, speaker2 = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[:4]) + #tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1]) + + instance = {} + # sequence = [[bos] + history[0] + list(chain(*history[1:]))] + [reply + ([eos] if with_eos else [])] #seq = [personas, history, reply] concatenate all persona sentences + sequence = [[bos] + history[0]] + history[1:] + [reply + ([eos] if with_eos else [])] + sequence = [[speaker2 if (len(sequence)-i) % 2 else speaker1] + s for i, s in enumerate(sequence)] + + instance["input_ids"] = list(chain(*sequence)) + instance["token_type_ids"] = [speaker2 if i % 2 else speaker1 for i, s in enumerate(sequence) for _ in s] # the last for is for repeating the speaker1 and speaker2 for all tokens + #instance["token_emotion_ids"] = [emotions[i] for i, s in enumerate(sequence[:-1]) for _ in s] + [true_emotion] * len(sequence[-1]) + instance["token_emotion_ids"] = [emotions[i] for i, s in enumerate(sequence[:-1]) for _ in s] + + instance["mc_token_ids"] = len(instance["input_ids"]) - 1 + instance["mc_labels"] = get_emotion_label(tokenizer, true_emotion) + instance["lm_labels"] = ([-1] * sum(len(s) for s in sequence[:-1])) + [-1] + sequence[-1][1:] #all -1 except for reply, reply is just the ids + return instance, sequence + + +def get_data_loaders(config, tokenizer): + """ Prepare the dataset for training and evaluation """ + personachat = get_dataset_for_daily_dialog(tokenizer, config.dataset_path, config.dataset_cache, SPECIAL_TOKENS) + + # personachat["train"] = personachat["train"][:100] + # personachat["valid"] = personachat["valid"][:10] + + logger.info("Build inputs and labels") + datasets = {"train": defaultdict(list), "valid": defaultdict(list)} + gpu_max_length = 310 + for dataset_name, dataset in personachat.items(): + num_candidates = 2#len(dataset[0]["utterances"][0]["candidates"]) + if config.num_candidates > 0 and dataset_name == 'train': + num_candidates = min(config.num_candidates, num_candidates) + for dialog in dataset: + for utterance in dialog["utterances"]: + history = utterance["history"][-(2 * config.max_history + 1):] + emotions = utterance["emotion"][-(2 * config.max_history + 1):] + reply = utterance["candidates"][-1] + true_emotion = utterance['candidates_emotions'][-1] + if true_emotion == tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS)[4]: + continue + instance, _ = build_input_from_segments(history, + emotions, + reply, + true_emotion, + tokenizer) + + if len(instance["input_ids"]) > gpu_max_length: + truncated_history = [hist[:10] for hist in history] + 
truncated_candidate = reply[:10] + true_emotion = utterance['candidates_emotions'][-1] + instance, _ = build_input_from_segments(truncated_history, + emotions, + truncated_candidate, + true_emotion, + tokenizer) + + + for input_name, input_array in instance.items(): + datasets[dataset_name][input_name].append(input_array) + + datasets[dataset_name]["n_candidates"] = num_candidates + + logger.info("Pad inputs and convert to Tensor") + tensor_datasets = {"train": [], "valid": []} + for dataset_name, dataset in datasets.items(): + dataset = pad_dataset(dataset, padding=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1])) + for input_name in MODEL_INPUTS: + tensor = torch.tensor(dataset[input_name]) + #if input_name != "mc_labels": + # tensor = tensor.view((-1, datasets[dataset_name]["n_candidates"]) + tensor.shape[1:]) + tensor_datasets[dataset_name].append(tensor) + + logger.info("Build train and validation dataloaders") + train_dataset, valid_dataset = TensorDataset(*tensor_datasets["train"]), TensorDataset(*tensor_datasets["valid"]) + train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) if config.distributed else None + valid_sampler = torch.utils.data.distributed.DistributedSampler(valid_dataset) if config.distributed else None + train_loader = DataLoader(train_dataset, sampler=train_sampler, batch_size=config.train_batch_size, shuffle=False) + valid_loader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=config.valid_batch_size, shuffle=False) + + logger.info("Train dataset (Batch, Candidates, Seq length): {}".format(train_dataset.tensors[0].shape)) + logger.info("Valid dataset (Batch, Candidates, Seq length): {}".format(valid_dataset.tensors[0].shape)) + return train_loader, valid_loader, train_sampler, valid_sampler + + +def train(): + config_file = "configs/train_emotion_recognition_config.json" + config = Config.from_json_file(config_file) + + # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. 
logger.info => log main process only, logger.warning => log all processes + logging.basicConfig(level=logging.INFO if config.local_rank in [-1, 0] else logging.WARN) + logger.warning("Running process %d", config.local_rank) # This is a logger.warning: it will be printed by all distributed processes + logger.info("Arguments: %s", pformat(config)) + + # Initialize distributed training if needed + config.distributed = (config.local_rank != -1) + if config.distributed: + torch.cuda.set_device(config.local_rank) + config.device = torch.device("cuda", config.local_rank) + torch.distributed.init_process_group(backend='nccl', init_method='env://') + + logger.info("Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning") + tokenizer_class = GPT2Tokenizer if "gpt2" in config.model_checkpoint else OpenAIGPTTokenizer + tokenizer = tokenizer_class.from_pretrained(config.model_checkpoint) + model_class = OpenAIGPTDoubleHeadLMEmotionRecognitionModel + model = model_class.from_pretrained(config.model_checkpoint) + tokenizer.set_special_tokens(SPECIAL_TOKENS) + model.set_num_special_tokens(len(SPECIAL_TOKENS)) + model.to(config.device) + optimizer = OpenAIAdam(model.parameters(), lr=config.lr) + + # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last) + if config.fp16: + from apex import amp # Apex is only required if we use fp16 training + model, optimizer = amp.initialize(model, optimizer, opt_level=config.fp16) + if config.distributed: + model = DistributedDataParallel(model, device_ids=[config.local_rank], output_device=config.local_rank) + + logger.info("Prepare datasets") + train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(config, tokenizer) + + # Training function and trainer + def update(engine, batch): + model.train() + input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids, token_emotion_ids = tuple(input_tensor.to(config.device) for input_tensor in batch) + #token_emotion_ids = None + lm_loss, mc_loss = model(input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids, token_emotion_ids) + loss = (lm_loss * config.lm_coef + mc_loss * config.mc_coef) / config.gradient_accumulation_steps + if config.fp16: + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), config.max_norm) + else: + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_norm) + if engine.state.iteration % config.gradient_accumulation_steps == 0: + optimizer.step() + optimizer.zero_grad() + return loss.item() + trainer = Engine(update) + + # Evaluation function and evaluator (evaluator output is the input of the metrics) + def inference(engine, batch): + model.eval() + with torch.no_grad(): + batch = tuple(input_tensor.to(config.device) for input_tensor in batch) + input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids, token_emotion_ids = batch + #token_emotion_ids = None + model_outputs = model(input_ids, mc_token_ids, token_type_ids=token_type_ids, token_emotion_ids=token_emotion_ids) + lm_logits, mc_logits = model_outputs[0], model_outputs[1] # So we can also use GPT2 outputs + lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1)) + lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1) + return (lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, mc_labels) + evaluator = Engine(inference) + + # Attach evaluation to trainer: we 
evaluate when we start the training and at the end of each epoch + trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader)) + if config.n_epochs < 1: + trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader)) + if config.eval_before_start: + trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader)) + + # Make sure distributed data samplers split the dataset nicely between the distributed processes + if config.distributed: + trainer.add_event_handler(Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch)) + evaluator.add_event_handler(Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch)) + + # Linearly decrease the learning rate from lr to zero + scheduler = PiecewiseLinear(optimizer, "lr", [(0, config.lr), (config.n_epochs * len(train_loader), 0.0)]) + trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) + + # Prepare metrics - note how we compute distributed metrics + RunningAverage(output_transform=lambda x: x).attach(trainer, "loss") + metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1), output_transform=lambda x: (x[0][0], x[1][0])), + "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))} + + metrics.update({"precision": Precision(output_transform=lambda x: (x[0][1], x[1][1])), + "recall": Recall(output_transform=lambda x: (x[0][1], x[1][1]))}) + + metrics.update({"average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], config), + "average_accuracy": MetricsLambda(average_distributed_scalar, metrics["accuracy"], config)}) + + metrics.update({"confusion_matrix": ConfusionMatrix(num_classes=6, output_transform=lambda x: (x[0][1], x[1][1]))}) + metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"]) + for name, metric in metrics.items(): + metric.attach(evaluator, name) + + # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train + if config.local_rank in [-1, 0]: + pbar = ProgressBar(persist=True) + pbar.attach(trainer, metric_names=["loss"]) + evaluator.add_event_handler(Events.COMPLETED, lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics))) + + tb_logger = TensorboardLogger(log_dir=config.log_dir) + tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED) + tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED) + tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()), another_engine=trainer), event_name=Events.EPOCH_COMPLETED) + + checkpoint_handler = ModelCheckpoint(tb_logger.writer.log_dir, 'checkpoint', save_interval=1, n_saved=3) + trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': getattr(model, 'module', model)}) # "getattr" take care of distributed encapsulation + + torch.save(config, tb_logger.writer.log_dir + '/model_training_args.bin') + getattr(model, 'module', model).config.to_json_file(os.path.join(tb_logger.writer.log_dir, CONFIG_NAME)) + tokenizer.save_vocabulary(tb_logger.writer.log_dir) + + # Run the training + trainer.run(train_loader, max_epochs=config.n_epochs) + + # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method) + if config.local_rank in [-1, 0] and config.n_epochs 
> 0: + os.rename(checkpoint_handler._saved[-1][1][-1], os.path.join(tb_logger.writer.log_dir, WEIGHTS_NAME)) # TODO: PR in ignite to have better access to saved file paths (cleaner) + tb_logger.close() + +if __name__ == "__main__": + train() diff --git a/train_full.py b/train_full.py new file mode 100644 index 0000000..30af959 --- /dev/null +++ b/train_full.py @@ -0,0 +1,253 @@ +# Copyright (c) 2019-present, HuggingFace Inc. +# All rights reserved. This source code is licensed under the BSD-style license found in the LICENSE file in the root directory of this source tree. +import os +import math +import logging +from pprint import pformat +from argparse import ArgumentParser +from collections import defaultdict +from itertools import chain + +import torch +from torch.nn.parallel import DistributedDataParallel +from torch.utils.data import DataLoader, TensorDataset +from ignite.engine import Engine, Events +from ignite.handlers import ModelCheckpoint +from ignite.metrics import Accuracy, Loss, MetricsLambda, RunningAverage +from ignite.contrib.handlers import ProgressBar, PiecewiseLinear +from config import Config +from ignite.contrib.handlers.tensorboard_logger import TensorboardLogger, OutputHandler, OptimizerParamsHandler +from pytorch_pretrained_bert import (OpenAIAdam, OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer, + GPT2DoubleHeadsModel, GPT2Tokenizer, WEIGHTS_NAME, CONFIG_NAME, + BertModel, BertTokenizer) + +from utils import get_dataset, get_dataset_for_daily_dialog + +SPECIAL_TOKENS = ["", "", "", "", + "", "", "", "", "", "", "", + "", "", "", "", + ""] +MODEL_INPUTS = ["input_ids", "mc_token_ids", "lm_labels", "mc_labels", "token_type_ids", "token_emotion_ids"] +PADDED_INPUTS = ["input_ids", "lm_labels", "token_type_ids", "token_emotion_ids"] + +logger = logging.getLogger(__file__) + +def average_distributed_scalar(scalar, config): + """ Average a scalar over the nodes if we are in distributed training. We use this for distributed evaluation. """ + if config.local_rank == -1: + return scalar + scalar_t = torch.tensor(scalar, dtype=torch.float, device=config.device) / torch.distributed.get_world_size() + torch.distributed.all_reduce(scalar_t, op=torch.distributed.ReduceOp.SUM) + return scalar_t.item() + + +def pad_dataset(dataset, padding=0): + """ Pad the dataset. This could be optimized by defining a Dataset class and padd only batches but this is simpler. 
""" + max_l = max(len(x) for x in dataset["input_ids"]) + for name in PADDED_INPUTS: + dataset[name] = [x + [padding if name != "lm_labels" else -1] * (max_l - len(x)) for x in dataset[name]] + return dataset + + +def build_input_from_segments(history, emotions, reply, candidate_emotion, tokenizer, lm_labels=False, with_eos=True): + """ Build a sequence of input from 3 segments: persona, history and last reply """ + bos, eos, speaker1, speaker2 = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[:4]) + + instance = {} + #sequence = [[bos] + history[0] + list(chain(*history[1:]))] + [reply + ([eos] if with_eos else [])] #seq = [personas, history, reply] concatenate all persona sentences + sequence = [[bos] + history[0]] + history[1:] +[reply +([eos] if with_eos else [])] + sequence = [[speaker2 if (len(sequence)-i) % 2 else speaker1] + s for i, s in enumerate(sequence)] + all_emotions = emotions + [candidate_emotion] + sequence = [[all_emotions[i]] + s for i, s in enumerate(sequence)] + + instance["input_ids"] = list(chain(*sequence)) + instance["token_type_ids"] = [speaker2 if i % 2 else speaker1 for i, s in enumerate(sequence) for _ in s] # the last for is for repeating the speaker1 and speaker2 for all tokens + instance["token_emotion_ids"] = [emotions[i] for i, s in enumerate(sequence[:-1]) for _ in s]+[candidate_emotion]*len(sequence[-1]) + + instance["mc_token_ids"] = len(instance["input_ids"]) - 1 + instance["lm_labels"] = [-1] * len(instance["input_ids"]) + if lm_labels: + instance["lm_labels"] = ([-1] * sum(len(s) for s in sequence[:-1])) + [-1] + sequence[-1][1:] #all -1 except for reply, reply is just the ids + return instance, sequence + + +def get_data_loaders(config, tokenizer): + """ Prepare the dataset for training and evaluation """ + personachat = get_dataset_for_daily_dialog(tokenizer, config.dataset_path, config.dataset_cache, SPECIAL_TOKENS) + + # personachat["train"] = personachat["train"][:100] + # personachat["valid"] = personachat["valid"][:10] + + + logger.info("Build inputs and labels") + datasets = {"train": defaultdict(list), "valid": defaultdict(list)} + gpu_max_length = 310 + for dataset_name, dataset in personachat.items(): + num_candidates = len(dataset[0]["utterances"][0]["candidates"]) + if config.num_candidates > 0 and dataset_name == 'train': + num_candidates = min(config.num_candidates, num_candidates) + for dialog in dataset: + for utterance in dialog["utterances"]: + history = utterance["history"][-(2*config.max_history+1):] + emotions = utterance["emotion"][-(2 * config.max_history + 1):] + for j, candidate in enumerate(utterance["candidates"][-num_candidates:]): + lm_labels = bool(j == num_candidates-1) #the true label is always the last one in list of candidates + candidate_emotion = utterance['candidates_emotions'][j] + instance, _ = build_input_from_segments(history, emotions, candidate, candidate_emotion, tokenizer, lm_labels) + #print(len(instance["input_ids"])) + if len(instance["input_ids"]) > gpu_max_length: + truncated_history = [hist[:10] for hist in history] + truncated_candidate = candidate[:10] + instance, _ = build_input_from_segments(truncated_history, emotions, truncated_candidate, candidate_emotion, tokenizer, lm_labels) + + for input_name, input_array in instance.items(): + datasets[dataset_name][input_name].append(input_array) + datasets[dataset_name]["mc_labels"].append(num_candidates - 1) + datasets[dataset_name]["n_candidates"] = num_candidates + + logger.info("Pad inputs and convert to Tensor") + tensor_datasets = {"train": [], 
"valid": []} + for dataset_name, dataset in datasets.items(): + dataset = pad_dataset(dataset, padding=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1])) + for input_name in MODEL_INPUTS: + tensor = torch.tensor(dataset[input_name]) + if input_name != "mc_labels": + tensor = tensor.view((-1, datasets[dataset_name]["n_candidates"]) + tensor.shape[1:]) + tensor_datasets[dataset_name].append(tensor) + + logger.info("Build train and validation dataloaders") + train_dataset, valid_dataset = TensorDataset(*tensor_datasets["train"]), TensorDataset(*tensor_datasets["valid"]) + train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) if config.distributed else None + valid_sampler = torch.utils.data.distributed.DistributedSampler(valid_dataset) if config.distributed else None + train_loader = DataLoader(train_dataset, sampler=train_sampler, batch_size=config.train_batch_size, shuffle=False) + valid_loader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=config.valid_batch_size, shuffle=False) + + logger.info("Train dataset (Batch, Candidates, Seq length): {}".format(train_dataset.tensors[0].shape)) + logger.info("Valid dataset (Batch, Candidates, Seq length): {}".format(valid_dataset.tensors[0].shape)) + return train_loader, valid_loader, train_sampler, valid_sampler + + +def train(): + config_file = "configs/train_full_config.json" + config = Config.from_json_file(config_file) + + + # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. logger.info => log main process only, logger.warning => log all processes + logging.basicConfig(level=logging.INFO if config.local_rank in [-1, 0] else logging.WARN) + logger.warning("Running process %d", config.local_rank) # This is a logger.warning: it will be printed by all distributed processes + logger.info("Arguments: %s", pformat(config)) + + # Initialize distributed training if needed + config.distributed = (config.local_rank != -1) + if config.distributed: + torch.cuda.set_device(config.local_rank) + config.device = torch.device("cuda", config.local_rank) + torch.distributed.init_process_group(backend='nccl', init_method='env://') + + logger.info("Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning") + tokenizer_class = GPT2Tokenizer if "gpt2" in config.model_checkpoint else OpenAIGPTTokenizer + tokenizer = tokenizer_class.from_pretrained(config.model_checkpoint) + model_class = GPT2DoubleHeadsModel if "gpt2" in config.model_checkpoint else OpenAIGPTDoubleHeadsModel + model = model_class.from_pretrained(config.model_checkpoint) + tokenizer.set_special_tokens(SPECIAL_TOKENS) + model.set_num_special_tokens(len(SPECIAL_TOKENS)) + model.to(config.device) + optimizer = OpenAIAdam(model.parameters(), lr=config.lr) + + # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last) + if config.fp16: + from apex import amp # Apex is only required if we use fp16 training + model, optimizer = amp.initialize(model, optimizer, opt_level=config.fp16) + if config.distributed: + model = DistributedDataParallel(model, device_ids=[config.local_rank], output_device=config.local_rank) + + logger.info("Prepare datasets") + train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(config, tokenizer) + + # Training function and trainer + def update(engine, batch): + model.train() + input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids, token_emotion_ids = tuple(input_tensor.to(config.device) for input_tensor in 
batch) + lm_loss, mc_loss = model(input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids, token_emotion_ids) + loss = (lm_loss * config.lm_coef + mc_loss * config.mc_coef) / config.gradient_accumulation_steps + if config.fp16: + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), config.max_norm) + else: + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_norm) + if engine.state.iteration % config.gradient_accumulation_steps == 0: + optimizer.step() + optimizer.zero_grad() + return loss.item() + trainer = Engine(update) + + # Evaluation function and evaluator (evaluator output is the input of the metrics) + def inference(engine, batch): + model.eval() + with torch.no_grad(): + batch = tuple(input_tensor.to(config.device) for input_tensor in batch) + input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids, token_emotion_ids = batch + #logger.info(tokenizer.decode(input_ids[0, -1, :].tolist())) + model_outputs = model(input_ids, mc_token_ids, token_type_ids=token_type_ids, token_emotion_ids=token_emotion_ids) + lm_logits, mc_logits = model_outputs[0], model_outputs[1] # So we can also use GPT2 outputs + lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1)) + lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1) + return (lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, mc_labels) + evaluator = Engine(inference) + + # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch + trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader)) + if config.n_epochs < 1: + trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader)) + if config.eval_before_start: + trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader)) + + # Make sure distributed data samplers split the dataset nicely between the distributed processes + if config.distributed: + trainer.add_event_handler(Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch)) + evaluator.add_event_handler(Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch)) + + # Linearly decrease the learning rate from lr to zero + scheduler = PiecewiseLinear(optimizer, "lr", [(0, config.lr), (config.n_epochs * len(train_loader), 0.0)]) + trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) + + # Prepare metrics - note how we compute distributed metrics + RunningAverage(output_transform=lambda x: x).attach(trainer, "loss") + metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1), output_transform=lambda x: (x[0][0], x[1][0])), + "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))} + metrics.update({"average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], config), + "average_accuracy": MetricsLambda(average_distributed_scalar, metrics["accuracy"], config)}) + metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"]) + for name, metric in metrics.items(): + metric.attach(evaluator, name) + + # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train + if config.local_rank in [-1, 0]: + pbar = ProgressBar(persist=True) + pbar.attach(trainer, metric_names=["loss"]) + evaluator.add_event_handler(Events.COMPLETED, lambda _: pbar.log_message("Validation: %s" % 
pformat(evaluator.state.metrics))) + + tb_logger = TensorboardLogger(log_dir=config.log_dir) + tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED) + tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED) + tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()), another_engine=trainer), event_name=Events.EPOCH_COMPLETED) + + checkpoint_handler = ModelCheckpoint(tb_logger.writer.log_dir, 'checkpoint', save_interval=1, n_saved=3) + trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': getattr(model, 'module', model)}) # "getattr" take care of distributed encapsulation + + torch.save(config, tb_logger.writer.log_dir + '/model_training_args.bin') + getattr(model, 'module', model).config.to_json_file(os.path.join(tb_logger.writer.log_dir, CONFIG_NAME)) + tokenizer.save_vocabulary(tb_logger.writer.log_dir) + + # Run the training + trainer.run(train_loader, max_epochs=config.n_epochs) + + # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method) + if config.local_rank in [-1, 0] and config.n_epochs > 0: + os.rename(checkpoint_handler._saved[-1][1][-1], os.path.join(tb_logger.writer.log_dir, WEIGHTS_NAME)) # TODO: PR in ignite to have better access to saved file paths (cleaner) + tb_logger.close() + +if __name__ == "__main__": + train() diff --git a/train_multihead.py b/train_multihead.py new file mode 100644 index 0000000..950878a --- /dev/null +++ b/train_multihead.py @@ -0,0 +1,297 @@ +# Copyright (c) 2019-present, HuggingFace Inc. +# All rights reserved. This source code is licensed under the BSD-style license found in the LICENSE file in the root directory of this source tree. +import os +import math +import logging +from pprint import pformat +from argparse import ArgumentParser +from collections import defaultdict +from itertools import chain + +import torch +from torch.nn.parallel import DistributedDataParallel +from torch.utils.data import DataLoader, TensorDataset +from ignite.engine import Engine, Events +from ignite.handlers import ModelCheckpoint +from ignite.metrics import Accuracy, Loss, MetricsLambda, RunningAverage +from ignite.contrib.handlers import ProgressBar, PiecewiseLinear +from config import Config +from ignite.contrib.handlers.tensorboard_logger import TensorboardLogger, OutputHandler, OptimizerParamsHandler +from pytorch_pretrained_bert import (OpenAIAdam, OpenAIGPTMultiHeadModel, OpenAIGPTTokenizer, + GPT2DoubleHeadsModel, GPT2Tokenizer, WEIGHTS_NAME, CONFIG_NAME, + BertModel, BertTokenizer) + +from utils import get_dataset, get_dataset_for_daily_dialog + +SPECIAL_TOKENS = ["", "", "", "", + + "", "", "", "", "", "", "", + + "", "", "", "", "", + "", "", "", "", "", + + "", "", "", "", + ""] +MODEL_INPUTS = ["input_ids", "ec_token_ids", "sc_token_ids", "lm_labels", "ec_labels", "sc_labels", + "token_type_ids", "token_emotion_ids", "token_action_ids"] +PADDED_INPUTS = ["input_ids", "lm_labels", "token_type_ids", "token_emotion_ids", "token_action_ids"] + +logger = logging.getLogger(__file__) + +def average_distributed_scalar(scalar, config): + """ Average a scalar over the nodes if we are in distributed training. We use this for distributed evaluation. 
""" + if config.local_rank == -1: + return scalar + scalar_t = torch.tensor(scalar, dtype=torch.float, device=config.device) / torch.distributed.get_world_size() + torch.distributed.all_reduce(scalar_t, op=torch.distributed.ReduceOp.SUM) + return scalar_t.item() + + +def pad_dataset(dataset, padding=0): + """ Pad the dataset. This could be optimized by defining a Dataset class and padd only batches but this is simpler. """ + max_l = max(len(x) for x in dataset["input_ids"]) + for name in PADDED_INPUTS: + dataset[name] = [x + [padding if name != "lm_labels" else -1] * (max_l - len(x)) for x in dataset[name]] + return dataset + + +def get_emotion_label(tokenizer, candidate_emotion): + no_emotion_id, happiness_id, surprise_id, sadness_id, disgust_id, anger_id, fear_id = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[4:11]) + + if candidate_emotion == no_emotion_id: + return 0 + elif candidate_emotion == happiness_id: + return 1 + elif candidate_emotion == surprise_id: + return 2 + elif candidate_emotion == sadness_id: + return 3 + elif candidate_emotion == disgust_id: + return 4 + elif candidate_emotion == anger_id: + return 5 + elif candidate_emotion == fear_id: + return 6 + +def build_input_from_segments(topic, history, emotions, actions, reply, candidate_emotion, canidate_act, tokenizer, lm_labels=False, with_eos=True): + """ Build a sequence of input from 3 segments: persona, history and last reply """ + bos, eos, speaker1, speaker2, no_emotion = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[:5]) + + inform = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-4]) + emotions = [no_emotion] + emotions + actions = [inform] + actions + + instance = {} + #sequence = [[bos] + history[0] + list(chain(*history[1:]))] + [reply + ([eos] if with_eos else [])] #seq = [personas, history, reply] concatenate all persona sentences + sequence = [[bos] + [topic]] + history + [reply + ([eos] if with_eos else [])] + sequence = [[speaker2 if (len(sequence)-i) % 2 else speaker1] + s for i, s in enumerate(sequence)] + + + instance["input_ids"] = list(chain(*sequence)) + instance["token_type_ids"] = [speaker2 if i % 2 else speaker1 for i, s in enumerate(sequence) for _ in s] # the last for is for repeating the speaker1 and speaker2 for all tokens + instance["token_emotion_ids"] = [emotions[i] for i, s in enumerate(sequence[:-1]) for _ in s] + [candidate_emotion]*len(sequence[-1]) + instance["token_action_ids"] = [actions[i] for i, s in enumerate(sequence[:-1]) for _ in s] + [canidate_act]*len(sequence[-1]) + + instance["ec_token_ids"] = len(instance["input_ids"]) - 1 + instance["sc_token_ids"] = len(instance["input_ids"]) - 2 + instance["ec_labels"] = -1 + instance["lm_labels"] = [-1] * len(instance["input_ids"]) + if lm_labels: + instance["lm_labels"] = ([-1] * sum(len(s) for s in sequence[:-1])) + [-1] + sequence[-1][1:] #all -1 except for reply, reply is just the ids + instance["ec_labels"] = get_emotion_label(tokenizer, candidate_emotion) + return instance, sequence + + +def get_data_loaders(config, tokenizer): + """ Prepare the dataset for training and evaluation """ + personachat = get_dataset_for_daily_dialog(tokenizer, config.dataset_path, config.dataset_cache, SPECIAL_TOKENS) + + + logger.info("Build inputs and labels") + datasets = {"train": defaultdict(list), "valid": defaultdict(list)} + gpu_max_length = 310 + for dataset_name, dataset in personachat.items(): + num_candidates = len(dataset[0]["utterances"][0]["candidates"]) + if config.num_candidates > 0 and dataset_name == 'train': + num_candidates = 
+            num_candidates = min(config.num_candidates, num_candidates)
+        for dialog in dataset:
+            topic = dialog["topic"]
+            for utterance in dialog["utterances"]:
+                history = utterance["history"][-(2 * config.max_history + 1):]
+                emotions = utterance["emotion"][-(2 * config.max_history + 1):]
+                actions = utterance["act"][-(2 * config.max_history + 1):]
+                for j, candidate in enumerate(utterance["candidates"][-num_candidates:]):
+                    lm_labels = bool(j == num_candidates - 1)  # the true label is always the last one in the list of candidates
+                    candidate_emotion = utterance['candidates_emotions'][j]
+                    candidate_act = utterance['candidates_acts'][j]
+                    instance, _ = build_input_from_segments(topic, history, emotions, actions, candidate,
+                                                            candidate_emotion, candidate_act, tokenizer, lm_labels)
+
+                    if len(instance["input_ids"]) > gpu_max_length:
+                        truncated_history = [hist[:10] for hist in history]
+                        truncated_candidate = candidate[:10]
+                        instance, _ = build_input_from_segments(topic, truncated_history, emotions, actions, truncated_candidate,
+                                                                candidate_emotion, candidate_act, tokenizer, lm_labels)
+
+
+                    for input_name, input_array in instance.items():
+                        datasets[dataset_name][input_name].append(input_array)
+
+                datasets[dataset_name]["sc_labels"].append(num_candidates - 1)
+        datasets[dataset_name]["n_candidates"] = num_candidates
+
+    logger.info("Pad inputs and convert to Tensor")
+    tensor_datasets = {"train": [], "valid": []}
+    for dataset_name, dataset in datasets.items():
+        dataset = pad_dataset(dataset, padding=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1]))
+        for input_name in MODEL_INPUTS:
+            tensor = torch.tensor(dataset[input_name])
+            if input_name != "sc_labels":
+                tensor = tensor.view((-1, datasets[dataset_name]["n_candidates"]) + tensor.shape[1:])
+            tensor_datasets[dataset_name].append(tensor)
+
+    logger.info("Build train and validation dataloaders")
+    train_dataset, valid_dataset = TensorDataset(*tensor_datasets["train"]), TensorDataset(*tensor_datasets["valid"])
+    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) if config.distributed else None
+    valid_sampler = torch.utils.data.distributed.DistributedSampler(valid_dataset) if config.distributed else None
+    train_loader = DataLoader(train_dataset, sampler=train_sampler, batch_size=config.train_batch_size, shuffle=False)
+    valid_loader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=config.valid_batch_size, shuffle=False)
+
+    logger.info("Train dataset (Batch, Candidates, Seq length): {}".format(train_dataset.tensors[0].shape))
+    logger.info("Valid dataset (Batch, Candidates, Seq length): {}".format(valid_dataset.tensors[0].shape))
+    return train_loader, valid_loader, train_sampler, valid_sampler
+
+
+def train():
+    config_file = "configs/train_multihead_config.json"
+    config = Config.from_json_file(config_file)
+
+    ec_coef = 1
+    sc_coef = 1
+
+    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. logger.info => log main process only, logger.warning => log all processes
+    logging.basicConfig(level=logging.INFO if config.local_rank in [-1, 0] else logging.WARN)
+    logger.warning("Running process %d", config.local_rank)  # This is a logger.warning: it will be printed by all distributed processes
+    logger.info("Arguments: %s", pformat(config))
+
+    # Initialize distributed training if needed
+    config.distributed = (config.local_rank != -1)
+    if config.distributed:
+        torch.cuda.set_device(config.local_rank)
+        config.device = torch.device("cuda", config.local_rank)
+        torch.distributed.init_process_group(backend='nccl', init_method='env://')
+
+    logger.info("Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning")
+    tokenizer_class = OpenAIGPTTokenizer
+    tokenizer = tokenizer_class.from_pretrained(config.model_checkpoint)
+    model_class = OpenAIGPTMultiHeadModel
+    model = model_class.from_pretrained(config.model_checkpoint)
+    tokenizer.set_special_tokens(SPECIAL_TOKENS)
+    model.set_num_special_tokens(len(SPECIAL_TOKENS))
+    model.to(config.device)
+    optimizer = OpenAIAdam(model.parameters(), lr=config.lr)
+
+    # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
+    if config.fp16:
+        from apex import amp  # Apex is only required if we use fp16 training
+        model, optimizer = amp.initialize(model, optimizer, opt_level=config.fp16)
+    if config.distributed:
+        model = DistributedDataParallel(model, device_ids=[config.local_rank], output_device=config.local_rank)
+
+    logger.info("Prepare datasets")
+    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(config, tokenizer)
+
+    # Training function and trainer
+    def update(engine, batch):
+        model.train()
+        #input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids, token_emotion_ids, token_action_ids = tuple(input_tensor.to(config.device) for input_tensor in batch)
+        input_ids, ec_token_ids, sc_token_ids, lm_labels, ec_labels, sc_labels, token_type_ids, token_emotion_ids, token_action_ids = tuple(input_tensor.to(config.device) for input_tensor in batch)
+
+        lm_loss, emotion_loss, sentence_loss = model(input_ids, ec_token_ids, sc_token_ids,
+                                                     lm_labels, ec_labels, sc_labels, token_type_ids,
+                                                     token_emotion_ids, token_action_ids)
+        loss = (lm_loss * config.lm_coef + emotion_loss * ec_coef + sentence_loss * sc_coef) / config.gradient_accumulation_steps
+        if config.fp16:
+            with amp.scale_loss(loss, optimizer) as scaled_loss:
+                scaled_loss.backward()
+            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), config.max_norm)
+        else:
+            loss.backward()
+            torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_norm)
+        if engine.state.iteration % config.gradient_accumulation_steps == 0:
+            optimizer.step()
+            optimizer.zero_grad()
+        return loss.item()
+    trainer = Engine(update)
+
+    # Evaluation function and evaluator (evaluator output is the input of the metrics)
+    def inference(engine, batch):
+        model.eval()
+        with torch.no_grad():
+            batch = tuple(input_tensor.to(config.device) for input_tensor in batch)
+            input_ids, ec_token_ids, sc_token_ids, lm_labels, ec_labels, \
+                sc_labels, token_type_ids, token_emotion_ids, token_action_ids = batch
+            #logger.info(tokenizer.decode(input_ids[0, -1, :].tolist()))
+            model_outputs = model(input_ids, ec_token_ids, sc_token_ids, token_type_ids=token_type_ids,
+                                  token_emotion_ids=token_emotion_ids,
+                                  token_action_ids=token_action_ids)
+            lm_logits, mc_logits = model_outputs[0], model_outputs[2]  # So we can also use GPT2 outputs
+            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1))
+            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
+            return (lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, sc_labels)
+    evaluator = Engine(inference)
+
+    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
+    trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader))
+    if config.n_epochs < 1:
+        trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader))
+    if config.eval_before_start:
+        trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader))
+
+    # Make sure distributed data samplers split the dataset nicely between the distributed processes
+    if config.distributed:
+        trainer.add_event_handler(Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch))
+        evaluator.add_event_handler(Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch))
+
+    # Linearly decrease the learning rate from lr to zero
+    scheduler = PiecewiseLinear(optimizer, "lr", [(0, config.lr), (config.n_epochs * len(train_loader), 0.0)])
+    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)
+
+    # Prepare metrics - note how we compute distributed metrics
+    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
+    metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1), output_transform=lambda x: (x[0][0], x[1][0])),
+               "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))}
+    metrics.update({"average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], config),
+                    "average_accuracy": MetricsLambda(average_distributed_scalar, metrics["accuracy"], config)})
+    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
+    for name, metric in metrics.items():
+        metric.attach(evaluator, name)
+
+    # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train
+    if config.local_rank in [-1, 0]:
+        pbar = ProgressBar(persist=True)
+        pbar.attach(trainer, metric_names=["loss"])
+        evaluator.add_event_handler(Events.COMPLETED, lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))
+
+        tb_logger = TensorboardLogger(log_dir=config.log_dir)
+        tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED)
+        tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED)
+        tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()), another_engine=trainer), event_name=Events.EPOCH_COMPLETED)
+
+        checkpoint_handler = ModelCheckpoint(tb_logger.writer.log_dir, 'checkpoint', save_interval=1, n_saved=3)
+        trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': getattr(model, 'module', model)})  # "getattr" takes care of distributed encapsulation
+
+        torch.save(config, tb_logger.writer.log_dir + '/model_training_args.bin')
+        getattr(model, 'module', model).config.to_json_file(os.path.join(tb_logger.writer.log_dir, CONFIG_NAME))
+        tokenizer.save_vocabulary(tb_logger.writer.log_dir)
+
+    # Run the training
+    trainer.run(train_loader, max_epochs=config.n_epochs)
+
+    # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method)
+    if config.local_rank in [-1, 0] and config.n_epochs > 0:
+        os.rename(checkpoint_handler._saved[-1][1][-1], os.path.join(tb_logger.writer.log_dir, WEIGHTS_NAME))  # TODO: PR in ignite to have better access to saved file paths (cleaner)
+        tb_logger.close()
+
+if __name__ == "__main__":
+    train()
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000..c4dd9f8
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,225 @@
+# Copyright (c) 2019-present, HuggingFace Inc.
+# All rights reserved. This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import json
+import logging
+import os
+import tarfile
+import tempfile
+import re
+
+import torch
+
+from pytorch_pretrained_bert import cached_path
+from collections import Counter
+
+try:
+    from nltk.translate import bleu_score as nltkbleu
+except ImportError:
+    # User doesn't have nltk installed, so we can't use it for bleu
+    # We'll just turn off things, but we might want to warn the user
+    nltkbleu = None
+
+
+PERSONACHAT_URL = "https://s3.amazonaws.com/datasets.huggingface.co/personachat/personachat_self_original.json"
+HF_FINETUNED_MODEL = "https://s3.amazonaws.com/models.huggingface.co/transfer-learning-chatbot/finetuned_chatbot_gpt.tar.gz"
+
+logger = logging.getLogger(__file__)
+
+
+re_art = re.compile(r'\b(a|an|the)\b')
+re_punc = re.compile(r'[!"#$%&()*+,-./:;<=>?@\[\]\\^`{|}~_\']')
+
+
+def normalize_answer(s):
+    """Lower text and remove punctuation, articles and extra whitespace."""
+    def remove_articles(text):
+        return re_art.sub(' ', text)
+
+    def white_space_fix(text):
+        return ' '.join(text.split())
+
+    def remove_punc(text):
+        return re_punc.sub(' ', text)  # convert punctuation to spaces
+
+    def lower(text):
+        return text.lower()
+
+    return white_space_fix(remove_articles(remove_punc(lower(s))))
+
+
+def download_pretrained_model():
+    """ Download and extract finetuned model from S3 """
+    resolved_archive_file = cached_path(HF_FINETUNED_MODEL)
+    tempdir = tempfile.mkdtemp()
+
+    logger.info("extracting archive file {} to temp dir {}".format(resolved_archive_file, tempdir))
+    with tarfile.open(resolved_archive_file, 'r:gz') as archive:
+        archive.extractall(tempdir)
+    return tempdir
+
+
+def get_dataset(tokenizer, dataset_path, dataset_cache=None):
+    """ Get PERSONACHAT from S3 """
+    dataset_path = dataset_path or PERSONACHAT_URL
+    dataset_cache = dataset_cache + '_' + type(tokenizer).__name__  # To avoid using GPT cache for GPT-2 and vice-versa
+    if dataset_cache and os.path.isfile(dataset_cache):
+        logger.info("Load tokenized dataset from cache at %s", dataset_cache)
+        dataset = torch.load(dataset_cache)
+    else:
+        logger.info("Download dataset from %s", dataset_path)
+        personachat_file = cached_path(dataset_path)
+        with open(personachat_file, "r", encoding="utf-8") as f:
+            dataset = json.loads(f.read())
+
+        logger.info("Tokenize and encode the dataset")
+        def tokenize(obj):
+            if isinstance(obj, str):
+                return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
+            if isinstance(obj, dict):
+                return dict((n, tokenize(o)) for n, o in obj.items())
+            return list(tokenize(o) for o in obj)
+        dataset = tokenize(dataset)
+        if dataset_cache:
+            torch.save(dataset, dataset_cache)
+    return dataset
+
+
+def get_dataset_for_daily_dialog(tokenizer, dataset_path, dataset_cache=None, special_tokens=None):
+    """ Get the DailyDialog-style dataset; special tokens are mapped directly to single token ids """
+    dataset_path = dataset_path or PERSONACHAT_URL
+    dataset_cache = dataset_cache + '_' + type(tokenizer).__name__  # To avoid using GPT cache for GPT-2 and vice-versa
+    if dataset_cache and os.path.isfile(dataset_cache):
+        logger.info("Load tokenized dataset from cache at %s", dataset_cache)
+        dataset = torch.load(dataset_cache)
+    else:
+        logger.info("Download dataset from %s", dataset_path)
+        personachat_file = cached_path(dataset_path)
+        with open(personachat_file, "r", encoding="utf-8") as f:
+            dataset = json.loads(f.read())
+
+        logger.info("Tokenize and encode the dataset")
+        def tokenize(obj):
+            if isinstance(obj, str):
+                if obj in special_tokens:
+                    return tokenizer.convert_tokens_to_ids(obj)
+                else:
+                    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
+            if isinstance(obj, dict):
+                return dict((n, tokenize(o)) for n, o in obj.items())
+            return list(tokenize(o) for o in obj)
+        dataset = tokenize(dataset)
+        if dataset_cache:
+            torch.save(dataset, dataset_cache)
+    return dataset
+
+
+def get_dataset_personalities(tokenizer, dataset_path, dataset_cache=None):
+    """ Get personalities from PERSONACHAT """
+    dataset_path = dataset_path or PERSONACHAT_URL
+    dataset_cache = dataset_cache + '_' + type(tokenizer).__name__  # To avoid using GPT cache for GPT-2 and vice-versa
+    if os.path.isfile(dataset_cache):
+        logger.info("Load tokenized dataset from cache at %s", dataset_cache)
+        personachat = torch.load(dataset_cache)
+    else:
+        logger.info("Download PERSONACHAT dataset from %s", dataset_path)
+        personachat_file = cached_path(dataset_path)
+        with open(personachat_file, "r", encoding="utf-8") as f:
+            personachat = json.loads(f.read())
+
+        logger.info("Tokenize and encode the dataset")
+        def tokenize(obj):
+            if isinstance(obj, str):
+                return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
+            if isinstance(obj, dict):
+                return dict((n, tokenize(o)) for n, o in obj.items())
+            return list(tokenize(o) for o in obj)
+        personachat = tokenize(personachat)
+        torch.save(personachat, dataset_cache)
+
+    logger.info("Filter personalities")
+    personalities = []
+    for dataset in personachat.values():
+        for dialog in dataset:
+            personalities.append(dialog["personality"])
+
+    logger.info("Gathered {} personalities".format(len(personalities)))
+    return personalities
+
+
+def _prec_recall_f1_score(pred_items, gold_items):
+    """
+    Compute precision, recall and f1 given a set of gold and prediction items.
+
+    :param pred_items: iterable of predicted values
+    :param gold_items: iterable of gold values
+
+    :return: tuple (p, r, f1) for precision, recall, f1
+    """
+    common = Counter(gold_items) & Counter(pred_items)
+    num_same = sum(common.values())
+    if num_same == 0:
+        return 0, 0, 0
+    precision = 1.0 * num_same / len(pred_items)
+    recall = 1.0 * num_same / len(gold_items)
+    f1 = (2 * precision * recall) / (precision + recall)
+    return precision, recall, f1
+
+
+def _f1_score(guess, answers):
+    """Return the max F1 score between the guess and *any* answer."""
+    if guess is None or answers is None:
+        return 0
+    g_tokens = normalize_answer(guess).split()
+    scores = [
+        _prec_recall_f1_score(g_tokens, normalize_answer(a).split()) for a in answers
+    ]
+    return max(f1 for p, r, f1 in scores)
+
+
+def _bleu(guess, answers, method=None):
+    """Compute approximate BLEU score between guess and a set of answers."""
+    if nltkbleu is None:
+        # bleu library not installed, just return a default value
+        return None
+    # Warning: BLEU calculation *should* include proper tokenization and
+    # punctuation etc. We're using the normalize_answer for everything though,
+    # so we're over-estimating our BLEU scores. Also note that NLTK's bleu is
+    # going to be slower than fairseq's (which is written in C), but fairseq's
+    # requires that everything be in arrays of ints (i.e. as tensors). NLTK's
+    # works with strings, which is better suited for this module.
+    # Pick the requested smoothing method; anything unrecognized (including None) falls back to method3.
+    smoother = nltkbleu.SmoothingFunction(epsilon=1e-12)
+    valid_methods = {"method0", "method1", "method2", "method3", "method4", "method5", "method6", "method7"}
+    smoothing_func = getattr(smoother, method if method in valid_methods else "method3")
+
+    return nltkbleu.sentence_bleu(
+        [normalize_answer(a).split(" ") for a in answers],
+        normalize_answer(guess).split(" "),
+        smoothing_function=smoothing_func,
+    )
+
+
+class AttrDict(dict):
+    """ Dictionary whose keys can also be read and written as attributes. """
+    def __init__(self, *args, **kwargs):
+        super(AttrDict, self).__init__(*args, **kwargs)
+        self.__dict__ = self
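+
+
+# Illustrative usage sketch (not referenced anywhere else in this patch): the metric
+# helpers above take a raw hypothesis string plus a list of reference strings, and
+# AttrDict simply mirrors dict keys as attributes. Running this module directly
+# sanity-checks them; nltk is optional, so _bleu may return None when it is missing.
+if __name__ == "__main__":
+    _hyp = "i am going to watch a movie tonight"
+    _refs = ["I am going to watch a movie tonight.", "Tonight I will watch a film."]
+    print("F1  :", _f1_score(_hyp, _refs))
+    print("BLEU:", _bleu(_hyp, _refs, method="method3"))
+    _cfg = AttrDict({"lr": 6.25e-5, "train_batch_size": 4})
+    print("AttrDict:", _cfg.lr, _cfg["train_batch_size"])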