From 086ed8172a5d47b701b56343bd8c4d191dcd3256 Mon Sep 17 00:00:00 2001 From: rohola Date: Mon, 6 Apr 2020 14:51:39 -0600 Subject: [PATCH] first commit --- .gitignore | 18 + README.md | 56 + config.py | 109 ++ configs/interact_config.json | 15 + configs/train_config.json | 23 + configs/train_emotion_recognition_config.json | 23 + configs/train_full_config.json | 23 + configs/train_multihead_config.json | 23 + eval_emotion_recognition.py | 208 +++ evaluate.py | 195 ++ interact.py | 161 ++ pytorch_pretrained_bert/__init__.py | 26 + pytorch_pretrained_bert/__main__.py | 83 + .../convert_gpt2_checkpoint_to_pytorch.py | 72 + .../convert_openai_checkpoint_to_pytorch.py | 72 + .../convert_tf_checkpoint_to_pytorch.py | 66 + ...onvert_transfo_xl_checkpoint_to_pytorch.py | 116 ++ pytorch_pretrained_bert/file_utils.py | 279 +++ pytorch_pretrained_bert/modeling.py | 1623 +++++++++++++++++ pytorch_pretrained_bert/modeling_gpt2.py | 821 +++++++++ pytorch_pretrained_bert/modeling_openai.py | 1089 +++++++++++ .../modeling_transfo_xl.py | 1392 ++++++++++++++ .../modeling_transfo_xl_utilities.py | 402 ++++ pytorch_pretrained_bert/optimization.py | 302 +++ .../optimization_openai.py | 127 ++ pytorch_pretrained_bert/tokenization.py | 434 +++++ pytorch_pretrained_bert/tokenization_gpt2.py | 311 ++++ .../tokenization_openai.py | 313 ++++ .../tokenization_transfo_xl.py | 586 ++++++ requirements.txt | 9 + train.py | 239 +++ train_emotion_recognition.py | 286 +++ train_full.py | 253 +++ train_multihead.py | 297 +++ utils.py | 225 +++ 35 files changed, 10277 insertions(+) create mode 100644 README.md create mode 100644 config.py create mode 100644 configs/interact_config.json create mode 100644 configs/train_config.json create mode 100644 configs/train_emotion_recognition_config.json create mode 100644 configs/train_full_config.json create mode 100644 configs/train_multihead_config.json create mode 100644 eval_emotion_recognition.py create mode 100644 evaluate.py create mode 100644 interact.py create mode 100644 pytorch_pretrained_bert/__init__.py create mode 100644 pytorch_pretrained_bert/__main__.py create mode 100755 pytorch_pretrained_bert/convert_gpt2_checkpoint_to_pytorch.py create mode 100755 pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py create mode 100755 pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py create mode 100755 pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py create mode 100644 pytorch_pretrained_bert/file_utils.py create mode 100644 pytorch_pretrained_bert/modeling.py create mode 100644 pytorch_pretrained_bert/modeling_gpt2.py create mode 100644 pytorch_pretrained_bert/modeling_openai.py create mode 100644 pytorch_pretrained_bert/modeling_transfo_xl.py create mode 100644 pytorch_pretrained_bert/modeling_transfo_xl_utilities.py create mode 100644 pytorch_pretrained_bert/optimization.py create mode 100644 pytorch_pretrained_bert/optimization_openai.py create mode 100644 pytorch_pretrained_bert/tokenization.py create mode 100644 pytorch_pretrained_bert/tokenization_gpt2.py create mode 100644 pytorch_pretrained_bert/tokenization_openai.py create mode 100644 pytorch_pretrained_bert/tokenization_transfo_xl.py create mode 100644 requirements.txt create mode 100644 train.py create mode 100644 train_emotion_recognition.py create mode 100644 train_full.py create mode 100644 train_multihead.py create mode 100644 utils.py diff --git a/.gitignore b/.gitignore index e69de29..dbaabc3 100644 --- a/.gitignore +++ b/.gitignore @@ -0,0 +1,18 @@ +.vscode +experiments/ 
+data/ +dataset_cache* +dataset1_cache* +daily_dialog_* +runs/ +ParlAI/ +__pycache__ +.idea/* +env/* +ParlAI/* +model/* +logs/* +caches/* +_OpenAIGPTTokenizer +out +emp_transfo_checkpoint/* \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..fff39f1 --- /dev/null +++ b/README.md @@ -0,0 +1,56 @@ +# EmpTransfo: A Multi-head Transformer Architecture for Creating Empathetic Dialog Systems + +The present repo contains the code for the paper https://arxiv.org/abs/2003.02958 +on empathetic dialog systems. The repository is heavily influenced by https://github.com/huggingface/transfer-learning-conv-ai + + +## Installation +To install and use the training and inference scripts, please clone the repo and install the requirements: + +```bash +git clone git@github.com:roholazandie/EmpTransfo.git +cd EmpTransfo +pip install -r requirements.txt + +``` + + +## Interact with the chatbot +You can download the checkpoint model [here](https://drive.google.com/open?id=1EjpK0YEVG1i9meLJzt7ZgODr0k65lTDi), extract it, and point to it via the "model_checkpoint" value in interact_config.json. +For example: +``` +"model_checkpoint" : "/home/rohola/codes/EmpTransfo/emp_transfo_checkpoint" +``` +Then run interact.py: +```bash +python interact.py +``` + +## Dataset +The original DailyDialog dataset is [here](https://www.aclweb.org/anthology/I17-1099/). We changed its format for our purposes, and the reformatted dataset can be downloaded +from [here](https://drive.google.com/open?id=1T4AdY7wku8srL_xWSxgt-OHqdLFVo3s3). + + +## Training + +The script train_multihead.py uses three heads with all features. + + +The script train_full.py uses two heads (next-sentence prediction and LM head), but uses all the features. + + +The script train_emotion_recognition.py trains the model to predict the next emotion (without no_emotion). + +The script train.py trains without any features of the dataset (the base model). + +For all training scripts, just change the dataset_path in the config file for that task, and then run the script +without any arguments.
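For example, to fine-tune the base model you would point the training config at your local copy of the reformatted dataset and then launch the script. This is a minimal sketch: it assumes train.py reads configs/train_config.json (in the same way the other scripts read their corresponding config files), and the dataset path shown is only a placeholder, not a required location:

```bash
# 1. Download and extract the reformatted DailyDialog JSON (link above), e.g. to ./data/
# 2. Edit configs/train_config.json so that "dataset_path" points to it:
#      "dataset_path": "data/daily_dialog.json"
# 3. Run the training script with no arguments
python train.py
```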
+ + + +## Citation +If you use this code in your research, you can cite our ANLP paper: + +``` + +``` \ No newline at end of file diff --git a/config.py b/config.py new file mode 100644 index 0000000..445f2a2 --- /dev/null +++ b/config.py @@ -0,0 +1,109 @@ +import json + + +class Config: + + def __init__(self, + dataset_path="", + dataset_cache="", + model_checkpoint="", + num_candidates=2, + do_lower_case=True, + max_history=2, + train_batch_size=4, + valid_batch_size=4, + gradient_accumulation_steps=8, + lr=5e-5, + warmup_proportion=0.1, + lm_coef=1, + mc_coef=1, + max_norm=10, + n_epochs=2, + personality_permutations=1, + eval_before_start=False, + device="cpu", + fp16="", + local_rank=-1, + log_dir="", + ): + self.dataset_path = dataset_path + self.dataset_cache = dataset_cache + self.model_checkpoint = model_checkpoint + self.num_candidates = num_candidates + self.do_lower_case = do_lower_case + self.max_history = max_history + self.train_batch_size = train_batch_size + self.valid_batch_size = valid_batch_size + self.gradient_accumulation_steps = gradient_accumulation_steps + self.lr = lr + self.warmup_proportion = warmup_proportion + self.lm_coef = lm_coef + self.mc_coef = mc_coef + self.max_norm = max_norm + self.n_epochs = n_epochs + self.personality_permutations = personality_permutations + self.eval_before_start = eval_before_start + self.device = device + self.fp16 = fp16 + self.local_rank = local_rank + self.log_dir = log_dir + + @classmethod + def from_dict(cls, json_object): + config = Config() + for key in json_object: + config.__dict__[key] = json_object[key] + return config + + @classmethod + def from_json_file(cls, json_file): + with open(json_file) as f: + config_json = f.read() + + return cls.from_dict(json.loads(config_json)) + + +class InteractConfig: + + def __init__(self, + dataset_path="", + model="", + dataset_cache="", + model_checkpoint="", + max_history="", + device="", + no_sample="", + max_length="", + min_length="", + seed="", + temperature="", + top_k="", + top_p="" + ): + self.dataset_path = dataset_path + self.model = model + self.dataset_cache = dataset_cache + self.model_checkpoint = model_checkpoint + self.max_history = max_history + self.device = device + self.no_sample = no_sample + self.max_length = max_length + self.min_length = min_length + self.seed = seed + self.temperature = temperature + self.top_k = top_k + self.top_p = top_p + + @classmethod + def from_dict(cls, json_object): + config = InteractConfig() + for key in json_object: + config.__dict__[key] = json_object[key] + return config + + @classmethod + def from_json_file(cls, json_file): + with open(json_file) as f: + config_json = f.read() + + return cls.from_dict(json.loads(config_json)) diff --git a/configs/interact_config.json b/configs/interact_config.json new file mode 100644 index 0000000..ab31a1b --- /dev/null +++ b/configs/interact_config.json @@ -0,0 +1,15 @@ +{ + "dataset_path" : "/home/rohola/data/daily_dialog_full/daily_dialog.json", + "model" : "openai-gpt", + "dataset_cache" : "./caches/dataset_cache_OpenAIGPTTokenizer", + "model_checkpoint" : "/home/rohola/codes/EmpTransfo/emp_transfo_checkpoint", + "max_history" : 2, + "device" : "cpu", + "no_sample" : true, + "max_length" : 20, + "min_length" : 1, + "seed" : 42, + "temperature" : 0.7, + "top_k" : 0, + "top_p" : 0.9 +} \ No newline at end of file diff --git a/configs/train_config.json b/configs/train_config.json new file mode 100644 index 0000000..c1b58f8 --- /dev/null +++ b/configs/train_config.json @@ -0,0 +1,23 @@ +{ + 
"dataset_path": "/home/rohola/data/daily_dialog.json" , + "dataset_cache": "./daily_dialog_dataset_cache", + "model_checkpoint": "openai-gpt", + "num_candidates": 2, + "do_lower_case": true, + "max_history": 2, + "train_batch_size": 1, + "valid_batch_size": 1, + "gradient_accumulation_steps": 8, + "lr": 6.25e-5, + "warmup_proportion": 0.1, + "lm_coef": 1.0, + "mc_coef": 1.0, + "max_norm": 1.0, + "n_epochs": 3, + "personality_permutations":1, + "eval_before_start": false, + "device": "cuda:0", + "fp16": "", + "local_rank": -1, + "log_dir": "" +} diff --git a/configs/train_emotion_recognition_config.json b/configs/train_emotion_recognition_config.json new file mode 100644 index 0000000..7a2fcbd --- /dev/null +++ b/configs/train_emotion_recognition_config.json @@ -0,0 +1,23 @@ +{ + "dataset_path": "/home/rohola/data/daily_dialog_full/daily_dialog.json" , + "dataset_cache": "./daily_dialog_dataset_cache", + "model_checkpoint": "openai-gpt", + "num_candidates": 2, + "do_lower_case": true, + "max_history": 2, + "train_batch_size": 1, + "valid_batch_size": 1, + "gradient_accumulation_steps": 8, + "lr": 6.25e-5, + "warmup_proportion": 0.1, + "lm_coef": 1.0, + "mc_coef": 1.0, + "max_norm": 1.0, + "n_epochs": 3, + "personality_permutations":1, + "eval_before_start": false, + "device": "cpu", + "fp16": "", + "local_rank": -1, + "log_dir": "" +} diff --git a/configs/train_full_config.json b/configs/train_full_config.json new file mode 100644 index 0000000..ddcaf33 --- /dev/null +++ b/configs/train_full_config.json @@ -0,0 +1,23 @@ +{ + "dataset_path": "/home/rohola/data/daily_dialog_full/daily_dialog.json" , + "dataset_cache": "./caches/daily_dialog_dataset_cache", + "model_checkpoint": "openai-gpt", + "num_candidates": 2, + "do_lower_case": true, + "max_history": 2, + "train_batch_size": 1, + "valid_batch_size": 1, + "gradient_accumulation_steps": 8, + "lr": 6.25e-5, + "warmup_proportion": 0.1, + "lm_coef": 1.0, + "mc_coef": 1.0, + "max_norm": 1.0, + "n_epochs": 3, + "personality_permutations":1, + "eval_before_start": false, + "device": "cuda:0", + "fp16": "", + "local_rank": -1, + "log_dir": "" +} diff --git a/configs/train_multihead_config.json b/configs/train_multihead_config.json new file mode 100644 index 0000000..abeb7d7 --- /dev/null +++ b/configs/train_multihead_config.json @@ -0,0 +1,23 @@ +{ + "dataset_path": "/home/rohola/data/daily_dialog_topic/daily_dialog.json" , + "dataset_cache": "caches/daily_dialog_multihead", + "model_checkpoint": "openai-gpt", + "num_candidates": 2, + "do_lower_case": true, + "max_history": 2, + "train_batch_size": 1, + "valid_batch_size": 1, + "gradient_accumulation_steps": 8, + "lr": 6.25e-5, + "warmup_proportion": 0.1, + "lm_coef": 1.0, + "mc_coef": 1.0, + "max_norm": 1.0, + "n_epochs": 3, + "personality_permutations":1, + "eval_before_start": false, + "device": "cuda:0", + "fp16": "", + "local_rank": -1, + "log_dir": "" +} diff --git a/eval_emotion_recognition.py b/eval_emotion_recognition.py new file mode 100644 index 0000000..5a3e41e --- /dev/null +++ b/eval_emotion_recognition.py @@ -0,0 +1,208 @@ +# Copyright (c) 2019-present, HuggingFace Inc. +# All rights reserved. This source code is licensed under the BSD-style license found in the LICENSE file in the root directory of this source tree. 
+import logging +from pprint import pformat +from collections import defaultdict +from itertools import chain + +import torch +from torch.nn.parallel import DistributedDataParallel +from torch.utils.data import DataLoader, TensorDataset + +from config import Config +from pytorch_pretrained_bert import (OpenAIAdam, OpenAIGPTDoubleHeadLMEmotionRecognitionModel, OpenAIGPTTokenizer, + GPT2DoubleHeadsModel, GPT2Tokenizer, WEIGHTS_NAME, CONFIG_NAME, + BertModel, BertTokenizer) + +from utils import get_dataset, get_dataset_for_daily_dialog + +SPECIAL_TOKENS = ["", "", "", "", + "", "", "", "", "", "", "", + "", "", "", "", + ""] +MODEL_INPUTS = ["input_ids", "mc_token_ids", "lm_labels", "mc_labels", "token_type_ids", "token_emotion_ids"] +PADDED_INPUTS = ["input_ids", "lm_labels", "token_type_ids", "token_emotion_ids"] + +logger = logging.getLogger(__file__) + +def average_distributed_scalar(scalar, config): + """ Average a scalar over the nodes if we are in distributed training. We use this for distributed evaluation. """ + if config.local_rank == -1: + return scalar + scalar_t = torch.tensor(scalar, dtype=torch.float, device=config.device) / torch.distributed.get_world_size() + torch.distributed.all_reduce(scalar_t, op=torch.distributed.ReduceOp.SUM) + return scalar_t.item() + + +def pad_dataset(dataset, padding=0): + """ Pad the dataset. This could be optimized by defining a Dataset class and padd only batches but this is simpler. """ + max_l = max(len(x) for x in dataset["input_ids"]) + for name in PADDED_INPUTS: + dataset[name] = [x + [padding if name != "lm_labels" else -1] * (max_l - len(x)) for x in dataset[name]] + return dataset + + +def get_emotion_label(tokenizer, candidate_emotion): + _, _, _, _, no_emotion_id, happiness_id, surprise_id, sadness_id, disgust_id, anger_id, fear_id, _, _, _, _, _ = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS) + if candidate_emotion == happiness_id: + return 0 + elif candidate_emotion == surprise_id: + return 1 + elif candidate_emotion == sadness_id: + return 2 + elif candidate_emotion == disgust_id: + return 3 + elif candidate_emotion == anger_id: + return 4 + elif candidate_emotion == fear_id: + return 5 + elif candidate_emotion == no_emotion_id: + return 6 + + +def build_input_from_segments(history, emotions, reply, true_emotion, tokenizer, with_eos=True): + """ Build a sequence of input from 3 segments: persona, history and last reply """ + bos, eos, speaker1, speaker2 = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[:4]) + #tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1]) + + instance = {} + # sequence = [[bos] + history[0] + list(chain(*history[1:]))] + [reply + ([eos] if with_eos else [])] #seq = [personas, history, reply] concatenate all persona sentences + sequence = [[bos] + history[0]] + history[1:] + [reply + ([eos] if with_eos else [])] + sequence = [[speaker2 if (len(sequence)-i) % 2 else speaker1] + s for i, s in enumerate(sequence)] + + instance["input_ids"] = list(chain(*sequence)) + instance["token_type_ids"] = [speaker2 if i % 2 else speaker1 for i, s in enumerate(sequence) for _ in s] # the last for is for repeating the speaker1 and speaker2 for all tokens + #instance["token_emotion_ids"] = [emotions[i] for i, s in enumerate(sequence[:-1]) for _ in s] + [true_emotion] * len(sequence[-1]) + instance["token_emotion_ids"] = [emotions[i] for i, s in enumerate(sequence[:-1]) for _ in s] + + instance["mc_token_ids"] = len(instance["input_ids"]) - 1 + instance["mc_labels"] = get_emotion_label(tokenizer, true_emotion) + 
instance["lm_labels"] = ([-1] * sum(len(s) for s in sequence[:-1])) + [-1] + sequence[-1][1:] #all -1 except for reply, reply is just the ids + return instance, sequence + + +def get_data_loaders(config, tokenizer): + """ Prepare the dataset for training and evaluation """ + personachat = get_dataset_for_daily_dialog(tokenizer, config.dataset_path, config.dataset_cache, SPECIAL_TOKENS) + + #personachat["train"] = personachat["train"][:100] + #personachat["valid"] = personachat["valid"][:10] + + logger.info("Build inputs and labels") + datasets = {"train": defaultdict(list), "valid": defaultdict(list)} + c = 0 + for dataset_name, dataset in personachat.items(): + num_candidates = 2#len(dataset[0]["utterances"][0]["candidates"]) + if config.num_candidates > 0 and dataset_name == 'train': + num_candidates = min(config.num_candidates, num_candidates) + for dialog in dataset: + for utterance in dialog["utterances"]: + history = utterance["history"][-(2 * config.max_history + 1):] + emotions = utterance["emotion"][-(2 * config.max_history + 1):] + reply = utterance["candidates"][-1] + true_emotion = utterance['candidates_emotions'][-1] + if true_emotion == tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS)[4]: + continue + instance, _ = build_input_from_segments(history, + emotions, + reply, + true_emotion, + tokenizer) + + if len(instance["input_ids"]) > 310: + truncated_history = [hist[:10] for hist in history] + truncated_candidate = reply[:10] + true_emotion = utterance['candidates_emotions'][-1] + instance, _ = build_input_from_segments(truncated_history, + emotions, + truncated_candidate, + true_emotion, + tokenizer) + c+=1 + + for input_name, input_array in instance.items(): + datasets[dataset_name][input_name].append(input_array) + + #datasets[dataset_name]["mc_labels"].append(num_candidates - 1) + datasets[dataset_name]["n_candidates"] = num_candidates + print(c) + logger.info("Pad inputs and convert to Tensor") + tensor_datasets = {"train": [], "valid": []} + for dataset_name, dataset in datasets.items(): + dataset = pad_dataset(dataset, padding=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1])) + for input_name in MODEL_INPUTS: + tensor = torch.tensor(dataset[input_name]) + #if input_name != "mc_labels": + # tensor = tensor.view((-1, datasets[dataset_name]["n_candidates"]) + tensor.shape[1:]) + tensor_datasets[dataset_name].append(tensor) + + logger.info("Build train and validation dataloaders") + train_dataset, valid_dataset = TensorDataset(*tensor_datasets["train"]), TensorDataset(*tensor_datasets["valid"]) + train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) if config.distributed else None + valid_sampler = torch.utils.data.distributed.DistributedSampler(valid_dataset) if config.distributed else None + train_loader = DataLoader(train_dataset, sampler=train_sampler, batch_size=config.train_batch_size, shuffle=False) + valid_loader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=config.valid_batch_size, shuffle=False) + + logger.info("Train dataset (Batch, Candidates, Seq length): {}".format(train_dataset.tensors[0].shape)) + logger.info("Valid dataset (Batch, Candidates, Seq length): {}".format(valid_dataset.tensors[0].shape)) + return train_loader, valid_loader, train_sampler, valid_sampler + + +def train(): + config_file = "configs/train_full_pipeline_config.json" + config = Config.from_json_file(config_file) + + # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. 
logger.info => log main process only, logger.warning => log all processes + logging.basicConfig(level=logging.INFO if config.local_rank in [-1, 0] else logging.WARN) + logger.warning("Running process %d", config.local_rank) # This is a logger.warning: it will be printed by all distributed processes + logger.info("Arguments: %s", pformat(config)) + + # Initialize distributed training if needed + config.distributed = (config.local_rank != -1) + if config.distributed: + torch.cuda.set_device(config.local_rank) + config.device = torch.device("cuda", config.local_rank) + torch.distributed.init_process_group(backend='nccl', init_method='env://') + + logger.info("Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning") + tokenizer_class = GPT2Tokenizer if "gpt2" in config.model_checkpoint else OpenAIGPTTokenizer + tokenizer = tokenizer_class.from_pretrained(config.model_checkpoint) + model_class = GPT2DoubleHeadsModel if "gpt2" in config.model_checkpoint else OpenAIGPTDoubleHeadLMEmotionRecognitionModel + model = model_class.from_pretrained(config.model_checkpoint) + tokenizer.set_special_tokens(SPECIAL_TOKENS) + model.set_num_special_tokens(len(SPECIAL_TOKENS)) + model.to(config.device) + optimizer = OpenAIAdam(model.parameters(), lr=config.lr) + + # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last) + if config.fp16: + from apex import amp # Apex is only required if we use fp16 training + model, optimizer = amp.initialize(model, optimizer, opt_level=config.fp16) + if config.distributed: + model = DistributedDataParallel(model, device_ids=[config.local_rank], output_device=config.local_rank) + + logger.info("Prepare datasets") + train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(config, tokenizer) + + # Evaluation function and evaluator (evaluator output is the input of the metrics) + model.eval() + num_correct = 0 + num_all = len(val_loader) + for batch in val_loader: + with torch.no_grad(): + batch = tuple(input_tensor.to(config.device) for input_tensor in batch) + input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids, token_emotion_ids = batch + + model_outputs = model(input_ids, mc_token_ids, token_type_ids=token_type_ids, token_emotion_ids=token_emotion_ids) + lm_logits, mc_logits = model_outputs[0], model_outputs[1] # So we can also use GPT2 outputs + + indices = torch.argmax(mc_logits, dim=1) + + correct = torch.eq(indices, mc_labels).view(-1) + num_correct += torch.sum(correct).item() + + print(num_correct / num_all) + + +if __name__ == "__main__": + train() diff --git a/evaluate.py b/evaluate.py new file mode 100644 index 0000000..3bcfffb --- /dev/null +++ b/evaluate.py @@ -0,0 +1,195 @@ +# # Copyright (c) 2019-present, HuggingFace Inc. +# All rights reserved. +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+import logging +import random +from argparse import ArgumentParser +from itertools import chain +from pprint import pformat +import numpy as np + +import torch +import torch.nn.functional as F +from tqdm import tqdm + +from config import InteractConfig +from pytorch_pretrained_bert import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer, GPT2LMHeadModel, GPT2Tokenizer +from utils import download_pretrained_model, get_dataset, _bleu, _f1_score + + + +def build_input_from_segments(persona, history, reply, tokenizer, SPECIAL_TOKENS, lm_labels=False, with_eos=True): + """ Build a sequence of input from 3 segments: persona, history and last reply """ + bos, eos, speaker1, speaker2 = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[:-1]) + + instance = {} + sequence = [[bos] + list(chain(*persona))] + history + [ + reply + ([eos] if with_eos else [])] # seq = [personas, history, reply] concatenate all persona sentences + sequence = [sequence[0]] + [[speaker2 if (len(sequence) - i) % 2 else speaker1] + s for i, s in + enumerate(sequence[1:])] + + instance["input_ids"] = list(chain(*sequence)) + instance["token_type_ids"] = [speaker2 if i % 2 else speaker1 for i, s in enumerate(sequence) for _ in + s] # the last for is for repeating the speaker1 and speaker2 for all tokens + instance["mc_token_ids"] = len(instance["input_ids"]) - 1 + instance["lm_labels"] = [-1] * len(instance["input_ids"]) + if lm_labels: + instance["lm_labels"] = ([-1] * sum(len(s) for s in sequence[:-1])) + [-1] + sequence[-1][1:] # all -1 except for reply, reply is just the ids + return instance, sequence + + + +def top_filtering(logits, top_k=0, top_p=0.0, threshold=-float('Inf'), filter_value=-float('Inf')): + """ Filter a distribution of logits using top-k, top-p (nucleus) and/or threshold filtering + Args: + logits: logits distribution shape (..., vocabulary size) + top_k: <=0: no filtering, >0: keep only top k tokens with highest probability. + top_p: <=0.0: no filtering, >0.0: keep only a subset S of candidates, where S is the smallest subset + whose total probability mass is greater than or equal to the threshold top_p. + In practice, we select the highest probability tokens whose cumulative probability mass exceeds + the threshold top_p. 
+ threshold: a minimal threshold to keep logits + """ + top_k = min(top_k, logits.size(-1)) + if top_k > 0: + # Remove all tokens with a probability less than the last token in the top-k tokens + indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None] + logits[indices_to_remove] = filter_value + + if top_p > 0.0: + # Compute cumulative probabilities of sorted tokens + sorted_logits, sorted_indices = torch.sort(logits, descending=True) + cumulative_probabilities = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) + + # Remove tokens with cumulative probability above the threshold + sorted_indices_to_remove = cumulative_probabilities > top_p + # Shift the indices to the right to keep also the first token above the threshold + sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() + sorted_indices_to_remove[..., 0] = 0 + + # Back to unsorted indices and set them to -infinity + indices_to_remove = sorted_indices[sorted_indices_to_remove] + logits[indices_to_remove] = filter_value + + indices_to_remove = logits < threshold + logits[indices_to_remove] = filter_value + + return logits + + +def get_emotions(dataset): + + + for data in tqdm(dataset['valid']): + utterances = data['utterances'] + + for utterance in utterances: + true_emotion = utterance["emotion"] + + +def calculate_metrics(args, model, tokenizer, dataset, special_tokens): + special_tokens_ids = tokenizer.convert_tokens_to_ids(special_tokens) + + all_blues = [] + all_f1_scores = [] + all_true_sentences = [] + all_predicted_sentences = [] + for data in tqdm(dataset['valid']): + personality = data['personality'] + utterances = data['utterances'] + + #utterance = utterances[-1] #only the longest conversaion + for utterance in utterances: + true_label = utterance['candidates'][-1] + history = utterance['history'] + predicted_output = [] + for i in range(args.max_length): + instance, _ = build_input_from_segments(personality, history, predicted_output, tokenizer, special_tokens, with_eos=False) + + try: + + if len(instance["input_ids"]) > 310: + truncated_history = [hist[:5] for hist in history] + instance, _ = build_input_from_segments(personality, truncated_history, predicted_output, tokenizer, special_tokens, with_eos=False) + + input_ids = torch.tensor(instance["input_ids"], device=args.device).unsqueeze(0) + token_type_ids = torch.tensor(instance["token_type_ids"], device=args.device).unsqueeze(0) + + logits = model(input_ids, token_type_ids=token_type_ids) + except: + print("exception") + continue + + if "gpt2" == args.model: + logits = logits[0] + logits = logits[0, -1, :] / args.temperature + logits = top_filtering(logits, top_k=args.top_k, top_p=args.top_p) + probs = F.softmax(logits, dim=-1) + + prev = torch.topk(probs, 1)[1] if args.no_sample else torch.multinomial(probs, 1) + # if i < args.min_length and prev.item() in special_tokens_ids: + # k=0 + # while prev.item() in special_tokens_ids and k < 100: + # prev = torch.multinomial(probs, num_samples=1) + # k+=1 + + if i < args.min_length: + prev = torch.multinomial(probs, num_samples=1) + + # if prev.item() in special_tokens_ids: + # break + predicted_output.append(prev.item()) + + predicted_sentence = tokenizer.decode(predicted_output, skip_special_tokens=True) + true_sentence = tokenizer.decode(true_label, skip_special_tokens=True) + #looks like zero gives the best results + + all_predicted_sentences.append(predicted_sentence) + all_true_sentences.append(true_sentence) + + bleus = [_bleu(predicted_sentence, [true_sentence], 
method="method"+str(i)) for i in [0,1,2,3,5]] + #bleu = _bleu(predicted_sentence, [true_sentence]) + f1_score = _f1_score(predicted_sentence, [true_sentence]) + #print(f1_score) + all_blues.append(bleus) + all_f1_scores.append(f1_score) + #compare predicted and label with bleu + + + print("avg bleu", np.array(all_blues).mean(axis=0)) + print("avg f1 score", np.mean(all_f1_scores)) + print("max bleu", np.array(all_blues).max(axis=0)) + + +def run(): + config_file = "configs/interact_config.json" + config = InteractConfig.from_json_file(config_file) + + logging.basicConfig(level=logging.INFO) + logger = logging.getLogger(__file__) + logger.info(pformat(config)) + + if config.model_checkpoint == "": + config.model_checkpoint = download_pretrained_model() + + random.seed(config.seed) + torch.random.manual_seed(config.seed) + torch.cuda.manual_seed(config.seed) + + logger.info("Get pretrained model and tokenizer") + tokenizer_class = GPT2Tokenizer if "gpt2" == config.model else OpenAIGPTTokenizer + tokenizer = tokenizer_class.from_pretrained(config.model_checkpoint) + model_class = GPT2LMHeadModel if "gpt2" == config.model else OpenAIGPTLMHeadModel + model = model_class.from_pretrained(config.model_checkpoint) + + model.to(config.device) + model.eval() + + dataset = get_dataset(tokenizer, config.dataset_path, config.dataset_cache) + + special_tokens = ["", "", "", "", ""] + calculate_metrics(config, model, tokenizer, dataset, special_tokens) + +if __name__ == "__main__": + run() diff --git a/interact.py b/interact.py new file mode 100644 index 0000000..c59e160 --- /dev/null +++ b/interact.py @@ -0,0 +1,161 @@ +# # Copyright (c) 2019-present, HuggingFace Inc. +# All rights reserved. +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+import logging +import random +from argparse import ArgumentParser +from itertools import chain +from pprint import pformat + +import torch +import torch.nn.functional as F + +from config import InteractConfig +from pytorch_pretrained_bert import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer, GPT2LMHeadModel, GPT2Tokenizer, \ + BertTokenizer +from pytorch_pretrained_bert.modeling import BertLMHeadModel +from utils import get_dataset_personalities, download_pretrained_model, get_dataset + + +def build_input_from_segments(history, reply, tokenizer, SPECIAL_TOKENS, lm_labels=False, with_eos=True): + """ Build a sequence of input from 3 segments: persona, history and last reply """ + bos, eos, speaker1, speaker2 = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[:-1]) + persona = [] + instance = {} + sequence = [[bos] + list(chain(*persona))] + history + [ + reply + ([eos] if with_eos else [])] # seq = [personas, history, reply] concatenate all persona sentences + sequence = [sequence[0]] + [[speaker2 if (len(sequence) - i) % 2 else speaker1] + s for i, s in + enumerate(sequence[1:])] + + instance["input_ids"] = list(chain(*sequence)) + instance["token_type_ids"] = [speaker2 if i % 2 else speaker1 for i, s in enumerate(sequence) for _ in + s] # the last for is for repeating the speaker1 and speaker2 for all tokens + instance["mc_token_ids"] = len(instance["input_ids"]) - 1 + instance["lm_labels"] = [-1] * len(instance["input_ids"]) + if lm_labels: + instance["lm_labels"] = ([-1] * sum(len(s) for s in sequence[:-1])) + [-1] + sequence[-1][1:] # all -1 except for reply, reply is just the ids + return instance, sequence + + +def top_filtering(logits, top_k=0, top_p=0.0, threshold=-float('Inf'), filter_value=-float('Inf')): + """ Filter a distribution of logits using top-k, top-p (nucleus) and/or threshold filtering + Args: + logits: logits distribution shape (..., vocabulary size) + top_k: <=0: no filtering, >0: keep only top k tokens with highest probability. + top_p: <=0.0: no filtering, >0.0: keep only a subset S of candidates, where S is the smallest subset + whose total probability mass is greater than or equal to the threshold top_p. + In practice, we select the highest probability tokens whose cumulative probability mass exceeds + the threshold top_p. 
+ threshold: a minimal threshold to keep logits + """ + top_k = min(top_k, logits.size(-1)) + if top_k > 0: + # Remove all tokens with a probability less than the last token in the top-k tokens + indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None] + logits[indices_to_remove] = filter_value + + if top_p > 0.0: + # Compute cumulative probabilities of sorted tokens + sorted_logits, sorted_indices = torch.sort(logits, descending=True) + cumulative_probabilities = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) + + # Remove tokens with cumulative probability above the threshold + sorted_indices_to_remove = cumulative_probabilities > top_p + # Shift the indices to the right to keep also the first token above the threshold + sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() + sorted_indices_to_remove[..., 0] = 0 + + # Back to unsorted indices and set them to -infinity + indices_to_remove = sorted_indices[sorted_indices_to_remove] + logits[indices_to_remove] = filter_value + + indices_to_remove = logits < threshold + logits[indices_to_remove] = filter_value + + return logits + + +def sample_sequence(history, tokenizer, model, args, SPECIAL_TOKENS, current_output=None): + special_tokens_ids = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS) + + if current_output is None: + current_output = [] + + for i in range(args.max_length): + instance, sequence = build_input_from_segments(history, current_output, tokenizer, SPECIAL_TOKENS, + with_eos=False) + + input_ids = torch.tensor(instance["input_ids"], device=args.device).unsqueeze(0) + token_type_ids = torch.tensor(instance["token_type_ids"], device=args.device).unsqueeze(0) + + logits = model(input_ids, token_type_ids=token_type_ids) + + if "gpt2" == args.model: + logits = logits[0] + logits = logits[0, -1, :] / args.temperature + logits = top_filtering(logits, top_k=args.top_k, top_p=args.top_p) + probs = F.softmax(logits, dim=-1) + + prev = torch.topk(probs, 1)[1] if args.no_sample else torch.multinomial(probs, 1) + if i < args.min_length and prev.item() in special_tokens_ids: + while prev.item() in special_tokens_ids: + prev = torch.multinomial(probs, num_samples=1) + + if prev.item() in special_tokens_ids: + break + current_output.append(prev.item()) + + return current_output + + +def run(): + config_file = "configs/interact_config.json" + config = InteractConfig.from_json_file(config_file) + + logging.basicConfig(level=logging.INFO) + logger = logging.getLogger(__file__) + logger.info(pformat(config)) + + if config.model_checkpoint == "": + config.model_checkpoint = download_pretrained_model() + + torch.random.manual_seed(config.seed) + torch.cuda.manual_seed(config.seed) + + logger.info("Get pretrained model and tokenizer") + if config.model == "bert": + tokenizer_class = BertTokenizer + model_class = BertLMHeadModel + elif config.model == "gpt2": + tokenizer_class = GPT2Tokenizer + model_class = GPT2LMHeadModel + else: + tokenizer_class = OpenAIGPTTokenizer + model_class = OpenAIGPTLMHeadModel + + SPECIAL_TOKENS = ["", "", "", "", ""] + + tokenizer = tokenizer_class.from_pretrained(config.model_checkpoint) + model = model_class.from_pretrained(config.model_checkpoint) + + model.to(config.device) + model.eval() + + history = [] + while True: + raw_text = input(">>> ") + while not raw_text: + print('Prompt should not be empty!') + raw_text = input(">>> ") + history.append(tokenizer.encode(raw_text)) + with torch.no_grad(): + out_ids = sample_sequence(history, tokenizer, model, config, 
SPECIAL_TOKENS) + history.append(out_ids) + history = history[-(2 * config.max_history + 1):] + out_text = tokenizer.decode(out_ids, skip_special_tokens=True) + print(out_text) + + +if __name__ == "__main__": + run() diff --git a/pytorch_pretrained_bert/__init__.py b/pytorch_pretrained_bert/__init__.py new file mode 100644 index 0000000..522f976 --- /dev/null +++ b/pytorch_pretrained_bert/__init__.py @@ -0,0 +1,26 @@ +__version__ = "0.6.2" +from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer +from .tokenization_openai import OpenAIGPTTokenizer +from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus) +from .tokenization_gpt2 import GPT2Tokenizer + +from .modeling import (BertConfig, BertModel, BertForPreTraining, + BertForMaskedLM, BertForNextSentencePrediction, + BertForSequenceClassification, BertForMultipleChoice, + BertForTokenClassification, BertForQuestionAnswering, + load_tf_weights_in_bert) +from .modeling_openai import (OpenAIGPTConfig, OpenAIGPTModel, + OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel, OpenAIGPTDoubleHeadLMEmotionRecognitionModel, + OpenAIGPTForEmotionDetection, + OpenAIGPTMultiHeadModel, + load_tf_weights_in_openai_gpt) +from .modeling_transfo_xl import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel, + load_tf_weights_in_transfo_xl) +from .modeling_gpt2 import (GPT2Config, GPT2Model, + GPT2LMHeadModel, GPT2DoubleHeadsModel, GPT2MultipleChoiceHead, + load_tf_weights_in_gpt2) + +from .optimization import BertAdam +from .optimization_openai import OpenAIAdam + +from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE, cached_path, WEIGHTS_NAME, CONFIG_NAME diff --git a/pytorch_pretrained_bert/__main__.py b/pytorch_pretrained_bert/__main__.py new file mode 100644 index 0000000..a2aae9e --- /dev/null +++ b/pytorch_pretrained_bert/__main__.py @@ -0,0 +1,83 @@ +# coding: utf8 +def main(): + import sys + if (len(sys.argv) != 4 and len(sys.argv) != 5) or sys.argv[1] not in [ + "convert_tf_checkpoint_to_pytorch", + "convert_openai_checkpoint", + "convert_transfo_xl_checkpoint", + "convert_gpt2_checkpoint", + ]: + print( + "Should be used as one of: \n" + ">> `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`, \n" + ">> `pytorch_pretrained_bert convert_openai_checkpoint OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`, \n" + ">> `pytorch_pretrained_bert convert_transfo_xl_checkpoint TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG]` or \n" + ">> `pytorch_pretrained_bert convert_gpt2_checkpoint TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG]`") + else: + if sys.argv[1] == "convert_tf_checkpoint_to_pytorch": + try: + from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch + except ImportError: + print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, " + "In that case, it requires TensorFlow to be installed. 
Please see " + "https://www.tensorflow.org/install/ for installation instructions.") + raise + + if len(sys.argv) != 5: + # pylint: disable=line-too-long + print("Should be used as `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`") + else: + PYTORCH_DUMP_OUTPUT = sys.argv.pop() + TF_CONFIG = sys.argv.pop() + TF_CHECKPOINT = sys.argv.pop() + convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT) + elif sys.argv[1] == "convert_openai_checkpoint": + from .convert_openai_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch + OPENAI_GPT_CHECKPOINT_FOLDER_PATH = sys.argv[2] + PYTORCH_DUMP_OUTPUT = sys.argv[3] + if len(sys.argv) == 5: + OPENAI_GPT_CONFIG = sys.argv[4] + else: + OPENAI_GPT_CONFIG = "" + convert_openai_checkpoint_to_pytorch(OPENAI_GPT_CHECKPOINT_FOLDER_PATH, + OPENAI_GPT_CONFIG, + PYTORCH_DUMP_OUTPUT) + elif sys.argv[1] == "convert_transfo_xl_checkpoint": + try: + from .convert_transfo_xl_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch + except ImportError: + print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, " + "In that case, it requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions.") + raise + + if 'ckpt' in sys.argv[2].lower(): + TF_CHECKPOINT = sys.argv[2] + TF_DATASET_FILE = "" + else: + TF_DATASET_FILE = sys.argv[2] + TF_CHECKPOINT = "" + PYTORCH_DUMP_OUTPUT = sys.argv[3] + if len(sys.argv) == 5: + TF_CONFIG = sys.argv[4] + else: + TF_CONFIG = "" + convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT, TF_DATASET_FILE) + else: + try: + from .convert_gpt2_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch + except ImportError: + print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, " + "In that case, it requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions.") + raise + + TF_CHECKPOINT = sys.argv[2] + PYTORCH_DUMP_OUTPUT = sys.argv[3] + if len(sys.argv) == 5: + TF_CONFIG = sys.argv[4] + else: + TF_CONFIG = "" + convert_gpt2_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT) +if __name__ == '__main__': + main() diff --git a/pytorch_pretrained_bert/convert_gpt2_checkpoint_to_pytorch.py b/pytorch_pretrained_bert/convert_gpt2_checkpoint_to_pytorch.py new file mode 100755 index 0000000..51d52a6 --- /dev/null +++ b/pytorch_pretrained_bert/convert_gpt2_checkpoint_to_pytorch.py @@ -0,0 +1,72 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Convert OpenAI GPT checkpoint.""" + +from __future__ import absolute_import, division, print_function + +import argparse +from io import open + +import torch + +from pytorch_pretrained_bert.modeling_gpt2 import (CONFIG_NAME, WEIGHTS_NAME, + GPT2Config, + GPT2Model, + load_tf_weights_in_gpt2) + + +def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path): + # Construct model + if gpt2_config_file == "": + config = GPT2Config() + else: + config = GPT2Config(gpt2_config_file) + model = GPT2Model(config) + + # Load weights from numpy + load_tf_weights_in_gpt2(model, gpt2_checkpoint_path) + + # Save pytorch-model + pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME + pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME + print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) + torch.save(model.state_dict(), pytorch_weights_dump_path) + print("Save configuration file to {}".format(pytorch_config_dump_path)) + with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: + f.write(config.to_json_string()) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + ## Required parameters + parser.add_argument("--gpt2_checkpoint_path", + default = None, + type = str, + required = True, + help = "Path the TensorFlow checkpoint path.") + parser.add_argument("--pytorch_dump_folder_path", + default = None, + type = str, + required = True, + help = "Path to the output PyTorch model.") + parser.add_argument("--gpt2_config_file", + default = "", + type = str, + help = "An optional config json file corresponding to the pre-trained OpenAI model. \n" + "This specifies the model architecture.") + args = parser.parse_args() + convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path, + args.gpt2_config_file, + args.pytorch_dump_folder_path) diff --git a/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py b/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py new file mode 100755 index 0000000..566008a --- /dev/null +++ b/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py @@ -0,0 +1,72 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Convert OpenAI GPT checkpoint.""" + +from __future__ import absolute_import, division, print_function + +import argparse +from io import open + +import torch + +from pytorch_pretrained_bert.modeling_openai import (CONFIG_NAME, WEIGHTS_NAME, + OpenAIGPTConfig, + OpenAIGPTModel, + load_tf_weights_in_openai_gpt) + + +def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path): + # Construct model + if openai_config_file == "": + config = OpenAIGPTConfig() + else: + config = OpenAIGPTConfig(openai_config_file) + model = OpenAIGPTModel(config) + + # Load weights from numpy + load_tf_weights_in_openai_gpt(model, openai_checkpoint_folder_path) + + # Save pytorch-model + pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME + pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME + print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) + torch.save(model.state_dict(), pytorch_weights_dump_path) + print("Save configuration file to {}".format(pytorch_config_dump_path)) + with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: + f.write(config.to_json_string()) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + ## Required parameters + parser.add_argument("--openai_checkpoint_folder_path", + default = None, + type = str, + required = True, + help = "Path the TensorFlow checkpoint path.") + parser.add_argument("--pytorch_dump_folder_path", + default = None, + type = str, + required = True, + help = "Path to the output PyTorch model.") + parser.add_argument("--openai_config_file", + default = "", + type = str, + help = "An optional config json file corresponding to the pre-trained OpenAI model. \n" + "This specifies the model architecture.") + args = parser.parse_args() + convert_openai_checkpoint_to_pytorch(args.openai_checkpoint_folder_path, + args.openai_config_file, + args.pytorch_dump_folder_path) diff --git a/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py b/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py new file mode 100755 index 0000000..13d9638 --- /dev/null +++ b/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py @@ -0,0 +1,66 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Convert BERT checkpoint.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import re +import argparse +import tensorflow as tf +import torch +import numpy as np + +from pytorch_pretrained_bert.modeling import BertConfig, BertForPreTraining, load_tf_weights_in_bert + +def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path): + # Initialise PyTorch model + config = BertConfig.from_json_file(bert_config_file) + print("Building PyTorch model from configuration: {}".format(str(config))) + model = BertForPreTraining(config) + + # Load weights from tf checkpoint + load_tf_weights_in_bert(model, tf_checkpoint_path) + + # Save pytorch-model + print("Save PyTorch model to {}".format(pytorch_dump_path)) + torch.save(model.state_dict(), pytorch_dump_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + ## Required parameters + parser.add_argument("--tf_checkpoint_path", + default = None, + type = str, + required = True, + help = "Path the TensorFlow checkpoint path.") + parser.add_argument("--bert_config_file", + default = None, + type = str, + required = True, + help = "The config json file corresponding to the pre-trained BERT model. \n" + "This specifies the model architecture.") + parser.add_argument("--pytorch_dump_path", + default = None, + type = str, + required = True, + help = "Path to the output PyTorch model.") + args = parser.parse_args() + convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, + args.bert_config_file, + args.pytorch_dump_path) diff --git a/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py b/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py new file mode 100755 index 0000000..8d6b965 --- /dev/null +++ b/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py @@ -0,0 +1,116 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert Transformer XL checkpoint and datasets.""" + +from __future__ import absolute_import, division, print_function + +import argparse +import os +import sys +from io import open + +import torch + +import pytorch_pretrained_bert.tokenization_transfo_xl as data_utils +from pytorch_pretrained_bert.modeling_transfo_xl import (CONFIG_NAME, + WEIGHTS_NAME, + TransfoXLConfig, + TransfoXLLMHeadModel, + load_tf_weights_in_transfo_xl) +from pytorch_pretrained_bert.tokenization_transfo_xl import (CORPUS_NAME, + VOCAB_NAME) + +if sys.version_info[0] == 2: + import cPickle as pickle +else: + import pickle + +# We do this to be able to load python 2 datasets pickles +# See e.g. 
https://stackoverflow.com/questions/2121874/python-pickling-after-changing-a-modules-directory/2121918#2121918 +data_utils.Vocab = data_utils.TransfoXLTokenizer +data_utils.Corpus = data_utils.TransfoXLCorpus +sys.modules['data_utils'] = data_utils +sys.modules['vocabulary'] = data_utils + +def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path, + transfo_xl_config_file, + pytorch_dump_folder_path, + transfo_xl_dataset_file): + if transfo_xl_dataset_file: + # Convert a pre-processed corpus (see original TensorFlow repo) + with open(transfo_xl_dataset_file, "rb") as fp: + corpus = pickle.load(fp, encoding="latin1") + # Save vocabulary and dataset cache as Dictionaries (should be better than pickles for the long-term) + pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' + VOCAB_NAME + print("Save vocabulary to {}".format(pytorch_vocab_dump_path)) + corpus_vocab_dict = corpus.vocab.__dict__ + torch.save(corpus_vocab_dict, pytorch_vocab_dump_path) + + corpus_dict_no_vocab = corpus.__dict__ + corpus_dict_no_vocab.pop('vocab', None) + pytorch_dataset_dump_path = pytorch_dump_folder_path + '/' + CORPUS_NAME + print("Save dataset to {}".format(pytorch_dataset_dump_path)) + torch.save(corpus_dict_no_vocab, pytorch_dataset_dump_path) + + if tf_checkpoint_path: + # Convert a pre-trained TensorFlow model + config_path = os.path.abspath(transfo_xl_config_file) + tf_path = os.path.abspath(tf_checkpoint_path) + + print("Converting Transformer XL checkpoint from {} with config at {}".format(tf_path, config_path)) + # Initialise PyTorch model + if transfo_xl_config_file == "": + config = TransfoXLConfig() + else: + config = TransfoXLConfig(transfo_xl_config_file) + print("Building PyTorch model from configuration: {}".format(str(config))) + model = TransfoXLLMHeadModel(config) + + model = load_tf_weights_in_transfo_xl(model, config, tf_path) + # Save pytorch-model + pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME) + pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME) + print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path))) + torch.save(model.state_dict(), pytorch_weights_dump_path) + print("Save configuration file to {}".format(os.path.abspath(pytorch_config_dump_path))) + with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: + f.write(config.to_json_string()) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--pytorch_dump_folder_path", + default = None, + type = str, + required = True, + help = "Path to the folder to store the PyTorch model or dataset/vocab.") + parser.add_argument("--tf_checkpoint_path", + default = "", + type = str, + help = "An optional path to a TensorFlow checkpoint path to be converted.") + parser.add_argument("--transfo_xl_config_file", + default = "", + type = str, + help = "An optional config json file corresponding to the pre-trained BERT model. 
\n" + "This specifies the model architecture.") + parser.add_argument("--transfo_xl_dataset_file", + default = "", + type = str, + help = "An optional dataset file to be converted in a vocabulary.") + args = parser.parse_args() + convert_transfo_xl_checkpoint_to_pytorch(args.tf_checkpoint_path, + args.transfo_xl_config_file, + args.pytorch_dump_folder_path, + args.transfo_xl_dataset_file) diff --git a/pytorch_pretrained_bert/file_utils.py b/pytorch_pretrained_bert/file_utils.py new file mode 100644 index 0000000..605c841 --- /dev/null +++ b/pytorch_pretrained_bert/file_utils.py @@ -0,0 +1,279 @@ +""" +Utilities for working with the local dataset cache. +This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp +Copyright by the AllenNLP authors. +""" +from __future__ import (absolute_import, division, print_function, unicode_literals) + +import sys +import json +import logging +import os +import shutil +import tempfile +import fnmatch +from functools import wraps +from hashlib import sha256 +import sys +from io import open + +import boto3 +import requests +from botocore.exceptions import ClientError +from tqdm import tqdm + +try: + from torch.hub import _get_torch_home + torch_cache_home = _get_torch_home() +except ImportError: + torch_cache_home = os.path.expanduser( + os.getenv('TORCH_HOME', os.path.join( + os.getenv('XDG_CACHE_HOME', '~/.cache'), 'torch'))) +default_cache_path = os.path.join(torch_cache_home, 'pytorch_pretrained_bert') + +try: + from urllib.parse import urlparse +except ImportError: + from urlparse import urlparse + +try: + from pathlib import Path + PYTORCH_PRETRAINED_BERT_CACHE = Path( + os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', default_cache_path)) +except (AttributeError, ImportError): + PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', + default_cache_path) + +CONFIG_NAME = "config.json" +WEIGHTS_NAME = "pytorch_model.bin" + +logger = logging.getLogger(__name__) # pylint: disable=invalid-name + + +def url_to_filename(url, etag=None): + """ + Convert `url` into a hashed filename in a repeatable way. + If `etag` is specified, append its hash to the url's, delimited + by a period. + """ + url_bytes = url.encode('utf-8') + url_hash = sha256(url_bytes) + filename = url_hash.hexdigest() + + if etag: + etag_bytes = etag.encode('utf-8') + etag_hash = sha256(etag_bytes) + filename += '.' + etag_hash.hexdigest() + + return filename + + +def filename_to_url(filename, cache_dir=None): + """ + Return the url and etag (which may be ``None``) stored for `filename`. + Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist. + """ + if cache_dir is None: + cache_dir = PYTORCH_PRETRAINED_BERT_CACHE + if sys.version_info[0] == 3 and isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + + cache_path = os.path.join(cache_dir, filename) + if not os.path.exists(cache_path): + raise EnvironmentError("file {} not found".format(cache_path)) + + meta_path = cache_path + '.json' + if not os.path.exists(meta_path): + raise EnvironmentError("file {} not found".format(meta_path)) + + with open(meta_path, encoding="utf-8") as meta_file: + metadata = json.load(meta_file) + url = metadata['url'] + etag = metadata['etag'] + + return url, etag + + +def cached_path(url_or_filename, cache_dir=None): + """ + Given something that might be a URL (or might be a local path), + determine which. If it's a URL, download the file and cache it, and + return the path to the cached file. 
If it's already a local path, + make sure the file exists and then return the path. + """ + if cache_dir is None: + cache_dir = PYTORCH_PRETRAINED_BERT_CACHE + if sys.version_info[0] == 3 and isinstance(url_or_filename, Path): + url_or_filename = str(url_or_filename) + if sys.version_info[0] == 3 and isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + + parsed = urlparse(url_or_filename) + + if parsed.scheme in ('http', 'https', 's3'): + # URL, so get it from the cache (downloading if necessary) + return get_from_cache(url_or_filename, cache_dir) + elif os.path.exists(url_or_filename): + # File, and it exists. + return url_or_filename + elif parsed.scheme == '': + # File, but it doesn't exist. + raise EnvironmentError("file {} not found".format(url_or_filename)) + else: + # Something unknown + raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename)) + + +def split_s3_path(url): + """Split a full s3 path into the bucket name and path.""" + parsed = urlparse(url) + if not parsed.netloc or not parsed.path: + raise ValueError("bad s3 path {}".format(url)) + bucket_name = parsed.netloc + s3_path = parsed.path + # Remove '/' at beginning of path. + if s3_path.startswith("/"): + s3_path = s3_path[1:] + return bucket_name, s3_path + + +def s3_request(func): + """ + Wrapper function for s3 requests in order to create more helpful error + messages. + """ + + @wraps(func) + def wrapper(url, *args, **kwargs): + try: + return func(url, *args, **kwargs) + except ClientError as exc: + if int(exc.response["Error"]["Code"]) == 404: + raise EnvironmentError("file {} not found".format(url)) + else: + raise + + return wrapper + + +@s3_request +def s3_etag(url): + """Check ETag on S3 object.""" + s3_resource = boto3.resource("s3") + bucket_name, s3_path = split_s3_path(url) + s3_object = s3_resource.Object(bucket_name, s3_path) + return s3_object.e_tag + + +@s3_request +def s3_get(url, temp_file): + """Pull a file directly from S3.""" + s3_resource = boto3.resource("s3") + bucket_name, s3_path = split_s3_path(url) + s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file) + + +def http_get(url, temp_file): + req = requests.get(url, stream=True) + content_length = req.headers.get('Content-Length') + total = int(content_length) if content_length is not None else None + progress = tqdm(unit="B", total=total) + for chunk in req.iter_content(chunk_size=1024): + if chunk: # filter out keep-alive new chunks + progress.update(len(chunk)) + temp_file.write(chunk) + progress.close() + + +def get_from_cache(url, cache_dir=None): + """ + Given a URL, look for the corresponding dataset in the local cache. + If it's not there, download it. Then return the path to the cached file. + """ + if cache_dir is None: + cache_dir = PYTORCH_PRETRAINED_BERT_CACHE + if sys.version_info[0] == 3 and isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + + if not os.path.exists(cache_dir): + os.makedirs(cache_dir) + + # Get eTag to add to filename, if it exists. 
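+    # Note: the ETag (when available) is hashed into the cache filename by
+    # url_to_filename(), so a changed remote file gets a fresh cache entry
+    # instead of silently reusing a stale one.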
+ if url.startswith("s3://"): + etag = s3_etag(url) + else: + try: + response = requests.head(url, allow_redirects=True) + if response.status_code != 200: + etag = None + else: + etag = response.headers.get("ETag") + except EnvironmentError: + etag = None + + if sys.version_info[0] == 2 and etag is not None: + etag = etag.decode('utf-8') + filename = url_to_filename(url, etag) + + # get cache path to put the file + cache_path = os.path.join(cache_dir, filename) + + # If we don't have a connection (etag is None) and can't identify the file + # try to get the last downloaded one + if not os.path.exists(cache_path) and etag is None: + matching_files = fnmatch.filter(os.listdir(cache_dir), filename + '.*') + matching_files = list(filter(lambda s: not s.endswith('.json'), matching_files)) + if matching_files: + cache_path = os.path.join(cache_dir, matching_files[-1]) + + if not os.path.exists(cache_path): + # Download to temporary file, then copy to cache dir once finished. + # Otherwise you get corrupt cache entries if the download gets interrupted. + with tempfile.NamedTemporaryFile() as temp_file: + logger.info("%s not found in cache, downloading to %s", url, temp_file.name) + + # GET file object + if url.startswith("s3://"): + s3_get(url, temp_file) + else: + http_get(url, temp_file) + + # we are copying the file before closing it, so flush to avoid truncation + temp_file.flush() + # shutil.copyfileobj() starts at the current position, so go to the start + temp_file.seek(0) + + logger.info("copying %s to cache at %s", temp_file.name, cache_path) + with open(cache_path, 'wb') as cache_file: + shutil.copyfileobj(temp_file, cache_file) + + logger.info("creating metadata file for %s", cache_path) + meta = {'url': url, 'etag': etag} + meta_path = cache_path + '.json' + with open(meta_path, 'w') as meta_file: + output_string = json.dumps(meta) + if sys.version_info[0] == 2 and isinstance(output_string, str): + output_string = unicode(output_string, 'utf-8') # The beauty of python 2 + meta_file.write(output_string) + + logger.info("removing temp file %s", temp_file.name) + + return cache_path + + +def read_set_from_file(filename): + ''' + Extract a de-duped collection (set) of text from a file. + Expected file format is one item per line. + ''' + collection = set() + with open(filename, 'r', encoding='utf-8') as file_: + for line in file_: + collection.add(line.rstrip()) + return collection + + +def get_file_extension(path, dot=True, lower=True): + ext = os.path.splitext(path)[1] + ext = ext if dot else ext[1:] + return ext.lower() if lower else ext diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py new file mode 100644 index 0000000..bdec14c --- /dev/null +++ b/pytorch_pretrained_bert/modeling.py @@ -0,0 +1,1623 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""PyTorch BERT model.""" + +from __future__ import absolute_import, division, print_function, unicode_literals + +import copy +import json +import logging +import math +import os +import sys +from io import open + +import torch +from torch import nn +from torch.nn import CrossEntropyLoss + +from .file_utils import cached_path, WEIGHTS_NAME, CONFIG_NAME +#from pytorch_pretrained_bert.modeling_openai import OpenAIGPTModel, OpenAIGPTLMHead +logger = logging.getLogger(__name__) + +PRETRAINED_MODEL_ARCHIVE_MAP = { + 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin", + 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-pytorch_model.bin", + 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin", + 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-pytorch_model.bin", + 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-pytorch_model.bin", + 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-pytorch_model.bin", + 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-pytorch_model.bin", + 'bert-base-german-cased': "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-pytorch_model.bin", + 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-pytorch_model.bin", + 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-pytorch_model.bin", + 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-pytorch_model.bin", + 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-pytorch_model.bin", + 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin", +} +PRETRAINED_CONFIG_ARCHIVE_MAP = { + 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json", + 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json", + 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json", + 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json", + 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json", + 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json", + 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json", + 'bert-base-german-cased': "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-config.json", + 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json", + 'bert-large-cased-whole-word-masking': 
"https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json", + 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json", + 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json", + 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json", +} +BERT_CONFIG_NAME = 'bert_config.json' +TF_WEIGHTS_NAME = 'model.ckpt' + + +def prune_linear_layer(layer, index, dim=0): + """ Prune a linear layer (a model parameters) to keep only entries in index. + Return the pruned layer as a new layer with requires_grad=True. + Used to remove heads. + """ + index = index.to(layer.weight.device) + W = layer.weight.index_select(dim, index).clone().detach() + if layer.bias is not None: + if dim == 1: + b = layer.bias.clone().detach() + else: + b = layer.bias[index].clone().detach() + new_size = list(layer.weight.size()) + new_size[dim] = len(index) + new_layer = nn.Linear(new_size[1], new_size[0], bias=layer.bias is not None).to(layer.weight.device) + new_layer.weight.requires_grad = False + new_layer.weight.copy_(W.contiguous()) + new_layer.weight.requires_grad = True + if layer.bias is not None: + new_layer.bias.requires_grad = False + new_layer.bias.copy_(b.contiguous()) + new_layer.bias.requires_grad = True + return new_layer + + +def load_tf_weights_in_bert(model, tf_checkpoint_path): + """ Load tf checkpoints in a pytorch model + """ + try: + import re + import numpy as np + import tensorflow as tf + except ImportError: + print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. 
Please see " + "https://www.tensorflow.org/install/ for installation instructions.") + raise + tf_path = os.path.abspath(tf_checkpoint_path) + print("Converting TensorFlow checkpoint from {}".format(tf_path)) + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + print("Loading TF weight {} with shape {}".format(name, shape)) + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array) + + for name, array in zip(names, arrays): + name = name.split('/') + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any(n in ["adam_v", "adam_m", "global_step"] for n in name): + print("Skipping {}".format("/".join(name))) + continue + pointer = model + for m_name in name: + if re.fullmatch(r'[A-Za-z]+_\d+', m_name): + l = re.split(r'_(\d+)', m_name) + else: + l = [m_name] + if l[0] == 'kernel' or l[0] == 'gamma': + pointer = getattr(pointer, 'weight') + elif l[0] == 'output_bias' or l[0] == 'beta': + pointer = getattr(pointer, 'bias') + elif l[0] == 'output_weights': + pointer = getattr(pointer, 'weight') + elif l[0] == 'squad': + pointer = getattr(pointer, 'classifier') + else: + try: + pointer = getattr(pointer, l[0]) + except AttributeError: + print("Skipping {}".format("/".join(name))) + continue + if len(l) >= 2: + num = int(l[1]) + pointer = pointer[num] + if m_name[-11:] == '_embeddings': + pointer = getattr(pointer, 'weight') + elif m_name == 'kernel': + array = np.transpose(array) + try: + assert pointer.shape == array.shape + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + print("Initialize PyTorch weight {}".format(name)) + pointer.data = torch.from_numpy(array) + return model + + +def gelu(x): + """Implementation of the gelu activation function. + For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): + 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + Also see https://arxiv.org/abs/1606.08415 + """ + return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) + + +def swish(x): + return x * torch.sigmoid(x) + + +ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish} + + +class BertConfig(object): + """Configuration class to store the configuration of a `BertModel`. + """ + + def __init__(self, + vocab_size_or_config_json_file, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12): + """Constructs BertConfig. + + Args: + vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`. + hidden_size: Size of the encoder layers and the pooler layer. + num_hidden_layers: Number of hidden layers in the Transformer encoder. + num_attention_heads: Number of attention heads for each attention layer in + the Transformer encoder. + intermediate_size: The size of the "intermediate" (i.e., feed-forward) + layer in the Transformer encoder. + hidden_act: The non-linear activation function (function or string) in the + encoder and pooler. If string, "gelu", "relu" and "swish" are supported. + hidden_dropout_prob: The dropout probabilitiy for all fully connected + layers in the embeddings, encoder, and pooler. 
+ attention_probs_dropout_prob: The dropout ratio for the attention + probabilities. + max_position_embeddings: The maximum sequence length that this model might + ever be used with. Typically set this to something large just in case + (e.g., 512 or 1024 or 2048). + type_vocab_size: The vocabulary size of the `token_type_ids` passed into + `BertModel`. + initializer_range: The sttdev of the truncated_normal_initializer for + initializing all weight matrices. + layer_norm_eps: The epsilon used by LayerNorm. + """ + if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 + and isinstance(vocab_size_or_config_json_file, unicode)): + with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif isinstance(vocab_size_or_config_json_file, int): + self.vocab_size = vocab_size_or_config_json_file + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + else: + raise ValueError("First argument must be either a vocabulary size (int)" + "or the path to a pretrained model config file (str)") + + @classmethod + def from_dict(cls, json_object): + """Constructs a `BertConfig` from a Python dictionary of parameters.""" + config = BertConfig(vocab_size_or_config_json_file=-1) + for key, value in json_object.items(): + config.__dict__[key] = value + return config + + @classmethod + def from_json_file(cls, json_file): + """Constructs a `BertConfig` from a json file of parameters.""" + with open(json_file, "r", encoding='utf-8') as reader: + text = reader.read() + return cls.from_dict(json.loads(text)) + + def __repr__(self): + return str(self.to_json_string()) + + def to_dict(self): + """Serializes this instance to a Python dictionary.""" + output = copy.deepcopy(self.__dict__) + return output + + def to_json_string(self): + """Serializes this instance to a JSON string.""" + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" + + def to_json_file(self, json_file_path): + """ Save this instance to a json file.""" + with open(json_file_path, "w", encoding='utf-8') as writer: + writer.write(self.to_json_string()) + + +try: + from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm +except ImportError: + logger.info("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .") + + + class BertLayerNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-12): + """Construct a layernorm module in the TF style (epsilon inside the square root). + """ + super(BertLayerNorm, self).__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.bias = nn.Parameter(torch.zeros(hidden_size)) + self.variance_epsilon = eps + + def forward(self, x): + u = x.mean(-1, keepdim=True) + s = (x - u).pow(2).mean(-1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.variance_epsilon) + return self.weight * x + self.bias + + +class BertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings. 
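+    The word, position and token-type embeddings are summed element-wise and then passed
+    through LayerNorm and dropout; position indices are generated internally as 0..seq_length-1.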
+ """ + + def __init__(self, config): + super(BertEmbeddings, self).__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, input_ids, token_type_ids=None): + seq_length = input_ids.size(1) + position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) + position_ids = position_ids.unsqueeze(0).expand_as(input_ids) + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + + words_embeddings = self.word_embeddings(input_ids) + position_embeddings = self.position_embeddings(position_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = words_embeddings + position_embeddings + token_type_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertSelfAttention(nn.Module): + def __init__(self, config, output_attentions=False, keep_multihead_output=False): + super(BertSelfAttention, self).__init__() + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.hidden_size, config.num_attention_heads)) + self.output_attentions = output_attentions + self.keep_multihead_output = keep_multihead_output + self.multihead_output = None + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward(self, hidden_states, attention_mask, head_mask=None): + mixed_query_layer = self.query(hidden_states) + mixed_key_layer = self.key(hidden_states) + mixed_value_layer = self.value(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + key_layer = self.transpose_for_scores(mixed_key_layer) + value_layer = self.transpose_for_scores(mixed_value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
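+        # At this point attention_probs = softmax(Q.K^T / sqrt(d_head) + mask); the
+        # dropout below zeroes entire attention weights rather than individual features.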
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + if self.keep_multihead_output: + self.multihead_output = context_layer + self.multihead_output.retain_grad() + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + if self.output_attentions: + return attention_probs, context_layer + return context_layer + + +class BertSelfOutput(nn.Module): + def __init__(self, config): + super(BertSelfOutput, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertAttention(nn.Module): + def __init__(self, config, output_attentions=False, keep_multihead_output=False): + super(BertAttention, self).__init__() + self.output_attentions = output_attentions + self.self = BertSelfAttention(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) + self.output = BertSelfOutput(config) + + def prune_heads(self, heads): + if len(heads) == 0: + return + mask = torch.ones(self.self.num_attention_heads, self.self.attention_head_size) + for head in heads: + mask[head] = 0 + mask = mask.view(-1).contiguous().eq(1) + index = torch.arange(len(mask))[mask].long() + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + # Update hyper params + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + + def forward(self, input_tensor, attention_mask, head_mask=None): + self_output = self.self(input_tensor, attention_mask, head_mask) + if self.output_attentions: + attentions, self_output = self_output + attention_output = self.output(self_output, input_tensor) + if self.output_attentions: + return attentions, attention_output + return attention_output + + +class BertIntermediate(nn.Module): + def __init__(self, config): + super(BertIntermediate, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Module): + def __init__(self, config): + super(BertOutput, self).__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, 
hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertLayer(nn.Module): + def __init__(self, config, output_attentions=False, keep_multihead_output=False): + super(BertLayer, self).__init__() + self.output_attentions = output_attentions + self.attention = BertAttention(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward(self, hidden_states, attention_mask, head_mask=None): + attention_output = self.attention(hidden_states, attention_mask, head_mask) + if self.output_attentions: + attentions, attention_output = attention_output + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + if self.output_attentions: + return attentions, layer_output + return layer_output + + +class BertEncoder(nn.Module): + def __init__(self, config, output_attentions=False, keep_multihead_output=False): + super(BertEncoder, self).__init__() + self.output_attentions = output_attentions + layer = BertLayer(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) + self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)]) + + def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True, head_mask=None): + all_encoder_layers = [] + all_attentions = [] + for i, layer_module in enumerate(self.layer): + hidden_states = layer_module(hidden_states, attention_mask, head_mask[i]) + if self.output_attentions: + attentions, hidden_states = hidden_states + all_attentions.append(attentions) + if output_all_encoded_layers: + all_encoder_layers.append(hidden_states) + if not output_all_encoded_layers: + all_encoder_layers.append(hidden_states) + if self.output_attentions: + return all_attentions, all_encoder_layers + return all_encoder_layers + + +class BertPooler(nn.Module): + def __init__(self, config): + super(BertPooler, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. 
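+        # (i.e. the [CLS] token), then projecting it through a dense layer with a tanh activation.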
+ first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertPredictionHeadTransform(nn.Module): + def __init__(self, config): + super(BertPredictionHeadTransform, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class BertLMPredictionHead(nn.Module): + def __init__(self, config, bert_model_embedding_weights): + super(BertLMPredictionHead, self).__init__() + self.transform = BertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear(bert_model_embedding_weights.size(1), + bert_model_embedding_weights.size(0), + bias=False) + self.decoder.weight = bert_model_embedding_weights + self.bias = nn.Parameter(torch.zeros(bert_model_embedding_weights.size(0))) + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + self.bias + return hidden_states + + +class BertOnlyMLMHead(nn.Module): + def __init__(self, config, bert_model_embedding_weights): + super(BertOnlyMLMHead, self).__init__() + self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class BertOnlyNSPHead(nn.Module): + def __init__(self, config): + super(BertOnlyNSPHead, self).__init__() + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + return seq_relationship_score + + +class BertPreTrainingHeads(nn.Module): + def __init__(self, config, bert_model_embedding_weights): + super(BertPreTrainingHeads, self).__init__() + self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +class BertPreTrainedModel(nn.Module): + """ An abstract class to handle weights initialization and + a simple interface for dowloading and loading pretrained models. + """ + + def __init__(self, config, *inputs, **kwargs): + super(BertPreTrainedModel, self).__init__() + if not isinstance(config, BertConfig): + raise ValueError( + "Parameter config in `{}(config)` should be an instance of class `BertConfig`. " + "To create a model from a Google pretrained model use " + "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( + self.__class__.__name__, self.__class__.__name__ + )) + self.config = config + + def init_bert_weights(self, module): + """ Initialize the weights. 
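+        Linear and Embedding weights are drawn from a normal distribution with standard
+        deviation config.initializer_range; LayerNorm weights are set to 1.0 and all biases to 0.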
+ """ + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + elif isinstance(module, BertLayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): + """ + Instantiate a BertPreTrainedModel from a pre-trained model file or a pytorch state dict. + Download and cache the pre-trained model file if needed. + + Params: + pretrained_model_name_or_path: either: + - a str with the name of a pre-trained model to load selected in the list of: + . `bert-base-uncased` + . `bert-large-uncased` + . `bert-base-cased` + . `bert-large-cased` + . `bert-base-multilingual-uncased` + . `bert-base-multilingual-cased` + . `bert-base-chinese` + . `bert-base-german-cased` + . `bert-large-uncased-whole-word-masking` + . `bert-large-cased-whole-word-masking` + - a path or url to a pretrained model archive containing: + . `bert_config.json` a configuration file for the model + . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance + - a path or url to a pretrained model archive containing: + . `bert_config.json` a configuration file for the model + . `model.chkpt` a TensorFlow checkpoint + from_tf: should we load the weights from a locally saved TensorFlow checkpoint + cache_dir: an optional path to a folder in which the pre-trained models will be cached. + state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models + *inputs, **kwargs: additional input for the specific Bert class + (ex: num_labels for BertForSequenceClassification) + """ + state_dict = kwargs.get('state_dict', None) + kwargs.pop('state_dict', None) + cache_dir = kwargs.get('cache_dir', None) + kwargs.pop('cache_dir', None) + from_tf = kwargs.get('from_tf', False) + kwargs.pop('from_tf', None) + + if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP: + archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path] + config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path] + else: + if from_tf: + # Directly load from a TensorFlow checkpoint + archive_file = os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME) + config_file = os.path.join(pretrained_model_name_or_path, BERT_CONFIG_NAME) + else: + archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) + config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME) + # redirect to the cache, if necessary + try: + resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir) + except EnvironmentError: + if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP: + logger.error( + "Couldn't reach server at '{}' to download pretrained weights.".format( + archive_file)) + else: + logger.error( + "Model name '{}' was not found in model name list ({}). 
" + "We assumed '{}' was a path or url but couldn't find any file " + "associated to this path or url.".format( + pretrained_model_name_or_path, + ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), + archive_file)) + return None + try: + resolved_config_file = cached_path(config_file, cache_dir=cache_dir) + except EnvironmentError: + if pretrained_model_name_or_path in PRETRAINED_CONFIG_ARCHIVE_MAP: + logger.error( + "Couldn't reach server at '{}' to download pretrained model configuration file.".format( + config_file)) + else: + logger.error( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url but couldn't find any file " + "associated to this path or url.".format( + pretrained_model_name_or_path, + ', '.join(PRETRAINED_CONFIG_ARCHIVE_MAP.keys()), + config_file)) + return None + if resolved_archive_file == archive_file and resolved_config_file == config_file: + logger.info("loading weights file {}".format(archive_file)) + logger.info("loading configuration file {}".format(config_file)) + else: + logger.info("loading weights file {} from cache at {}".format( + archive_file, resolved_archive_file)) + logger.info("loading configuration file {} from cache at {}".format( + config_file, resolved_config_file)) + # Load config + config = BertConfig.from_json_file(resolved_config_file) + logger.info("Model config {}".format(config)) + # Instantiate model. + model = cls(config, *inputs, **kwargs) + if state_dict is None and not from_tf: + state_dict = torch.load(resolved_archive_file, map_location='cpu') + if from_tf: + # Directly load from a TensorFlow checkpoint + return load_tf_weights_in_bert(model, weights_path) + # Load from a PyTorch state_dict + old_keys = [] + new_keys = [] + for key in state_dict.keys(): + new_key = None + if 'gamma' in key: + new_key = key.replace('gamma', 'weight') + if 'beta' in key: + new_key = key.replace('beta', 'bias') + if new_key: + old_keys.append(key) + new_keys.append(new_key) + for old_key, new_key in zip(old_keys, new_keys): + state_dict[new_key] = state_dict.pop(old_key) + + missing_keys = [] + unexpected_keys = [] + error_msgs = [] + # copy state_dict so _load_from_state_dict can modify it + metadata = getattr(state_dict, '_metadata', None) + state_dict = state_dict.copy() + if metadata is not None: + state_dict._metadata = metadata + + def load(module, prefix=''): + local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) + module._load_from_state_dict( + state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs) + for name, child in module._modules.items(): + if child is not None: + load(child, prefix + name + '.') + + start_prefix = '' + if not hasattr(model, 'bert') and any(s.startswith('bert.') for s in state_dict.keys()): + start_prefix = 'bert.' + load(model, prefix=start_prefix) + if len(missing_keys) > 0: + logger.info("Weights of {} not initialized from pretrained model: {}".format( + model.__class__.__name__, missing_keys)) + if len(unexpected_keys) > 0: + logger.info("Weights from pretrained model not used in {}: {}".format( + model.__class__.__name__, unexpected_keys)) + if len(error_msgs) > 0: + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + model.__class__.__name__, "\n\t".join(error_msgs))) + return model + + +class BertModel(BertPreTrainedModel): + """BERT model ("Bidirectional Embedding Representations from a Transformer"). 
+ + Params: + `config`: a BertConfig class instance with the configuration to build a new model + `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False + `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. + This can be used to compute head importance metrics. Default: False + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`. + `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. + It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. + + + Outputs: Tuple of (encoded_layers, pooled_output) + `encoded_layers`: controled by `output_all_encoded_layers` argument: + - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end + of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each + encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size], + - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding + to the last attention block of shape [batch_size, sequence_length, hidden_size], + `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a + classifier pretrained on top of the hidden state associated to the first character of the + input (`CLS`) to train on the Next-Sentence task (see BERT's paper). + + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + model = modeling.BertModel(config=config) + all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config, output_attentions=False, keep_multihead_output=False): + super(BertModel, self).__init__(config) + self.output_attentions = output_attentions + self.embeddings = BertEmbeddings(config) + self.encoder = BertEncoder(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) + self.pooler = BertPooler(config) + self.apply(self.init_bert_weights) + + def prune_heads(self, heads_to_prune): + """ Prunes heads of the model. 
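+        Pruning removes the selected heads from the query/key/value projections and from the
+        attention output projection of the given layers, so the change is permanent for this
+        model instance (e.g. a hypothetical {0: [1, 2]} drops heads 1 and 2 of the first layer).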
+ heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def get_multihead_outputs(self): + """ Gather all multi-head outputs. + Return: list (layers) of multihead module outputs with gradients + """ + return [layer.attention.self.multihead_output for layer in self.encoder.layer] + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True, + head_mask=None): + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + if head_mask is not None: + if head_mask.dim() == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + head_mask = head_mask.expand_as(self.config.num_hidden_layers, -1, -1, -1, -1) + elif head_mask.dim() == 2: + head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze( + -1) # We can specify head_mask for each layer + head_mask = head_mask.to( + dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + else: + head_mask = [None] * self.config.num_hidden_layers + + embedding_output = self.embeddings(input_ids, token_type_ids) + encoded_layers = self.encoder(embedding_output, + extended_attention_mask, + output_all_encoded_layers=output_all_encoded_layers, + head_mask=head_mask) + if self.output_attentions: + all_attentions, encoded_layers = encoded_layers + sequence_output = encoded_layers[-1] + pooled_output = self.pooler(sequence_output) + if not output_all_encoded_layers: + encoded_layers = encoded_layers[-1] + if self.output_attentions: + return all_attentions, encoded_layers, pooled_output + return encoded_layers, pooled_output + + +class BertForPreTraining(BertPreTrainedModel): + """BERT model with pre-training heads. + This module comprises the BERT model followed by the two pre-training heads: + - the masked language modeling head, and + - the next sentence classification head. + + Params: + `config`: a BertConfig class instance with the configuration to build a new model + `output_attentions`: If True, also output attentions weights computed by the model at each layer. 
Default: False + `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. + This can be used to compute head importance metrics. Default: False + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `masked_lm_labels`: optional masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] + with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss + is only computed for the labels set in [0, ..., vocab_size] + `next_sentence_label`: optional next sentence classification loss: torch.LongTensor of shape [batch_size] + with indices selected in [0, 1]. + 0 => next sentence is the continuation, 1 => next sentence is a random sentence. + `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. + It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. + + Outputs: + if `masked_lm_labels` and `next_sentence_label` are not `None`: + Outputs the total_loss which is the sum of the masked language modeling loss and the next + sentence classification loss. + if `masked_lm_labels` or `next_sentence_label` is `None`: + Outputs a tuple comprising + - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and + - the next sentence classification logits of shape [batch_size, 2]. 
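+    When both `masked_lm_labels` and `next_sentence_label` are supplied, the forward pass
+    returns the summed loss instead of the two logit tensors; a minimal sketch, reusing the
+    tensors from the example below with hypothetical label values:
+    ```python
+    masked_lm_labels = torch.LongTensor([[-1, -1, 17], [-1, 9, -1]])  # -1 positions are ignored
+    next_sentence_label = torch.LongTensor([0, 1])
+    total_loss = model(input_ids, token_type_ids, input_mask,
+                       masked_lm_labels, next_sentence_label)
+    ```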
+ + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + model = BertForPreTraining(config) + masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config, output_attentions=False, keep_multihead_output=False): + super(BertForPreTraining, self).__init__(config) + self.output_attentions = output_attentions + self.bert = BertModel(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) + self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, + next_sentence_label=None, head_mask=None): + outputs = self.bert(input_ids, token_type_ids, attention_mask, + output_all_encoded_layers=False, head_mask=head_mask) + if self.output_attentions: + all_attentions, sequence_output, pooled_output = outputs + else: + sequence_output, pooled_output = outputs + prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) + + if masked_lm_labels is not None and next_sentence_label is not None: + loss_fct = CrossEntropyLoss(ignore_index=-1) + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) + next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) + total_loss = masked_lm_loss + next_sentence_loss + return total_loss + elif self.output_attentions: + return all_attentions, prediction_scores, seq_relationship_score + return prediction_scores, seq_relationship_score + + +class BertForMaskedLM(BertPreTrainedModel): + """BERT model with the masked language modeling head. + This module comprises the BERT model followed by the masked language modeling head. + + Params: + `config`: a BertConfig class instance with the configuration to build a new model + `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False + `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. + This can be used to compute head importance metrics. Default: False + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. 
+ `masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] + with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss + is only computed for the labels set in [0, ..., vocab_size] + `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. + It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. + + Outputs: + if `masked_lm_labels` is not `None`: + Outputs the masked language modeling loss. + if `masked_lm_labels` is `None`: + Outputs the masked language modeling logits of shape [batch_size, sequence_length, vocab_size]. + + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + model = BertForMaskedLM(config) + masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config, output_attentions=False, keep_multihead_output=False): + super(BertForMaskedLM, self).__init__(config) + self.output_attentions = output_attentions + self.bert = BertModel(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) + self.cls = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, head_mask=None): + #todo added by rooh + input_shape = input_ids.size() # (B, C, F) + input_ids = input_ids.view(-1, input_ids.size(-1)) + token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + #todo + outputs = self.bert(input_ids, token_type_ids, attention_mask, + output_all_encoded_layers=False, + head_mask=head_mask) + if self.output_attentions: + all_attentions, sequence_output, _ = outputs + else: + sequence_output, _ = outputs + prediction_scores = self.cls(sequence_output) + + if masked_lm_labels is not None: + # #todo + # prediction_scores = prediction_scores[..., :-1, :].contiguous() + # masked_lm_labels = masked_lm_labels[..., 1:].contiguous() + # #todo + loss_fct = CrossEntropyLoss(ignore_index=-1) + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) + return masked_lm_loss + elif self.output_attentions: + return all_attentions, prediction_scores + return prediction_scores + + +class BertForNextSentencePrediction(BertPreTrainedModel): + """BERT model with next sentence prediction head. + This module comprises the BERT model followed by the next sentence classification head. + + Params: + `config`: a BertConfig class instance with the configuration to build a new model + `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False + `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. + This can be used to compute head importance metrics. 
Default: False + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size] + with indices selected in [0, 1]. + 0 => next sentence is the continuation, 1 => next sentence is a random sentence. + `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. + It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. + + Outputs: + if `next_sentence_label` is not `None`: + Outputs the total_loss which is the sum of the masked language modeling loss and the next + sentence classification loss. + if `next_sentence_label` is `None`: + Outputs the next sentence classification logits of shape [batch_size, 2]. + + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + model = BertForNextSentencePrediction(config) + seq_relationship_logits = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config, output_attentions=False, keep_multihead_output=False): + super(BertForNextSentencePrediction, self).__init__(config) + self.output_attentions = output_attentions + self.bert = BertModel(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) + self.cls = BertOnlyNSPHead(config) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None, head_mask=None): + outputs = self.bert(input_ids, token_type_ids, attention_mask, + output_all_encoded_layers=False, + head_mask=head_mask) + if self.output_attentions: + all_attentions, _, pooled_output = outputs + else: + _, pooled_output = outputs + seq_relationship_score = self.cls(pooled_output) + + if next_sentence_label is not None: + loss_fct = CrossEntropyLoss(ignore_index=-1) + next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) + return next_sentence_loss + elif self.output_attentions: + return all_attentions, seq_relationship_score + return seq_relationship_score + + +class BertForSequenceClassification(BertPreTrainedModel): + """BERT model for classification. + This module is composed of the BERT model with a linear layer on top of + the pooled output. 
+ + Params: + `config`: a BertConfig class instance with the configuration to build a new model + `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False + `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. + This can be used to compute head importance metrics. Default: False + `num_labels`: the number of classes for the classifier. Default = 2. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary. Items in the batch should begin with the special "CLS" token. (see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `labels`: labels for the classification output: torch.LongTensor of shape [batch_size] + with indices selected in [0, ..., num_labels]. + `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. + It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. + + Outputs: + if `labels` is not `None`: + Outputs the CrossEntropy classification loss of the output with the labels. + if `labels` is `None`: + Outputs the classification logits of shape [batch_size, num_labels]. 
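+    Fine-tuning note: passing `labels` returns the scalar cross-entropy loss instead of the
+    logits. A minimal sketch, reusing the tensors and model from the usage example below
+    (the label values are illustrative):
+    ```python
+    labels = torch.LongTensor([1, 0])   # one class index per example in the batch
+    loss = model(input_ids, token_type_ids, input_mask, labels=labels)
+    loss.backward()
+    ```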
+ + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + num_labels = 2 + + model = BertForSequenceClassification(config, num_labels) + logits = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config, num_labels=2, output_attentions=False, keep_multihead_output=False): + super(BertForSequenceClassification, self).__init__(config) + self.output_attentions = output_attentions + self.num_labels = num_labels + self.bert = BertModel(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, num_labels) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None): + outputs = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False, + head_mask=head_mask) + if self.output_attentions: + all_attentions, _, pooled_output = outputs + else: + _, pooled_output = outputs + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + return loss + elif self.output_attentions: + return all_attentions, logits + return logits + + +class BertForMultipleChoice(BertPreTrainedModel): + """BERT model for multiple choice tasks. + This module is composed of the BERT model with a linear layer on top of + the pooled output. + + Params: + `config`: a BertConfig class instance with the configuration to build a new model + `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False + `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. + This can be used to compute head importance metrics. Default: False + `num_choices`: the number of classes for the classifier. Default = 2. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length] + with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` + and type 1 corresponds to a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `labels`: labels for the classification output: torch.LongTensor of shape [batch_size] + with indices selected in [0, ..., num_choices]. + `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. 
+ It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. + + Outputs: + if `labels` is not `None`: + Outputs the CrossEntropy classification loss of the output with the labels. + if `labels` is `None`: + Outputs the classification logits of shape [batch_size, num_labels]. + + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]], [[12, 16, 42], [14, 28, 57]]]) + input_mask = torch.LongTensor([[[1, 1, 1], [1, 1, 0]],[[1,1,0], [1, 0, 0]]]) + token_type_ids = torch.LongTensor([[[0, 0, 1], [0, 1, 0]],[[0, 1, 1], [0, 0, 1]]]) + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + num_choices = 2 + + model = BertForMultipleChoice(config, num_choices) + logits = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config, num_choices=2, output_attentions=False, keep_multihead_output=False): + super(BertForMultipleChoice, self).__init__(config) + self.output_attentions = output_attentions + self.num_choices = num_choices + self.bert = BertModel(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, 1) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None): + flat_input_ids = input_ids.view(-1, input_ids.size(-1)) + flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + outputs = self.bert(flat_input_ids, flat_token_type_ids, flat_attention_mask, output_all_encoded_layers=False, + head_mask=head_mask) + if self.output_attentions: + all_attentions, _, pooled_output = outputs + else: + _, pooled_output = outputs + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, self.num_choices) + + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + return loss + elif self.output_attentions: + return all_attentions, reshaped_logits + return reshaped_logits + + +class BertMultipleChoice(BertPreTrainedModel): + + def __init__(self, config, num_choices=2, output_attentions=False, keep_multihead_output=False): + super(BertMultipleChoice, self).__init__(config) + self.output_attentions = output_attentions + self.num_choices = num_choices + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, 1) + self.apply(self.init_bert_weights) + + def forward(self, pooled_output, num_choices=2, labels=None, head_mask=None): + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + return loss + return reshaped_logits + + +class BertForTokenClassification(BertPreTrainedModel): + """BERT model for token-level classification. + This module is composed of the BERT model with a linear layer on top of + the full hidden state of the last layer. 
+ + Params: + `config`: a BertConfig class instance with the configuration to build a new model + `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False + `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. + This can be used to compute head importance metrics. Default: False + `num_labels`: the number of classes for the classifier. Default = 2. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `labels`: labels for the classification output: torch.LongTensor of shape [batch_size, sequence_length] + with indices selected in [0, ..., num_labels]. + `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. + It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. + + Outputs: + if `labels` is not `None`: + Outputs the CrossEntropy classification loss of the output with the labels. + if `labels` is `None`: + Outputs the classification logits of shape [batch_size, sequence_length, num_labels]. 
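+    Fine-tuning note: when both `labels` and `attention_mask` are given, the cross-entropy
+    loss is computed only over positions where the mask is 1, so padded tokens are ignored.
+    A minimal sketch, reusing the tensors and model from the usage example below (the
+    per-token label ids are illustrative):
+    ```python
+    labels = torch.LongTensor([[0, 1, 1], [1, 0, 0]])   # one label id per token
+    loss = model(input_ids, token_type_ids, input_mask, labels=labels)
+    loss.backward()
+    ```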
+ + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + num_labels = 2 + + model = BertForTokenClassification(config, num_labels) + logits = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config, num_labels=2, output_attentions=False, keep_multihead_output=False): + super(BertForTokenClassification, self).__init__(config) + self.output_attentions = output_attentions + self.num_labels = num_labels + self.bert = BertModel(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, num_labels) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None): + outputs = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False, + head_mask=head_mask) + if self.output_attentions: + all_attentions, sequence_output, _ = outputs + else: + sequence_output, _ = outputs + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + if labels is not None: + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels)[active_loss] + active_labels = labels.view(-1)[active_loss] + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + return loss + elif self.output_attentions: + return all_attentions, logits + return logits + + +class BertForQuestionAnswering(BertPreTrainedModel): + """BERT model for Question Answering (span extraction). + This module is composed of the BERT model with a linear layer on top of + the sequence output that computes start_logits and end_logits + + Params: + `config`: a BertConfig class instance with the configuration to build a new model + `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False + `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. + This can be used to compute head importance metrics. Default: False + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. 
+ `start_positions`: position of the first token for the labeled span: torch.LongTensor of shape [batch_size]. + Positions are clamped to the length of the sequence and position outside of the sequence are not taken + into account for computing the loss. + `end_positions`: position of the last token for the labeled span: torch.LongTensor of shape [batch_size]. + Positions are clamped to the length of the sequence and position outside of the sequence are not taken + into account for computing the loss. + `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. + It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked. + + Outputs: + if `start_positions` and `end_positions` are not `None`: + Outputs the total_loss which is the sum of the CrossEntropy loss for the start and end token positions. + if `start_positions` or `end_positions` is `None`: + Outputs a tuple of start_logits, end_logits which are the logits respectively for the start and end + position tokens of shape [batch_size, sequence_length]. + + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + model = BertForQuestionAnswering(config) + start_logits, end_logits = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config, output_attentions=False, keep_multihead_output=False): + super(BertForQuestionAnswering, self).__init__(config) + self.output_attentions = output_attentions + self.bert = BertModel(config, output_attentions=output_attentions, + keep_multihead_output=keep_multihead_output) + self.qa_outputs = nn.Linear(config.hidden_size, 2) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None, + end_positions=None, head_mask=None): + outputs = self.bert(input_ids, token_type_ids, attention_mask, + output_all_encoded_layers=False, + head_mask=head_mask) + if self.output_attentions: + all_attentions, sequence_output, _ = outputs + else: + sequence_output, _ = outputs + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions.clamp_(0, ignored_index) + end_positions.clamp_(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + return total_loss + elif self.output_attentions: + return all_attentions, start_logits, end_logits + return start_logits, end_logits + + +class BertLMHeadModel(BertPreTrainedModel): + def __init__(self, config, 
output_attentions=False): + super(BertLMHeadModel, self).__init__(config) + self.bert = BertModel(config, output_attentions=output_attentions) + self.lm_head = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, input_mask=None, lm_labels=None, token_type_ids=None, position_ids=None): + input_shape = input_ids.size() # (B, C, F) + flat_input_ids = input_ids.view(-1, input_ids.size(-1)) + flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + flat_attention_mask = input_mask.view(-1, input_mask.size(-1)) if input_mask is not None else None + hidden_states, pooled_output = self.bert(flat_input_ids, flat_token_type_ids, flat_attention_mask, + output_all_encoded_layers=False) + + lm_logits = self.lm_head(hidden_states) + + if lm_labels is not None: + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = lm_labels[..., 1:].contiguous() + loss_fct = CrossEntropyLoss(ignore_index=-1) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + return loss + + return lm_logits + + +class BertDoubleHeadsModel(BertPreTrainedModel): + def __init__(self, config, output_attentions=False): + super(BertDoubleHeadsModel, self).__init__(config) + self.bert = BertModel(config, output_attentions=output_attentions) + self.lm_head = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight) + self.multiple_choice_head = BertMultipleChoice(config) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, mc_token_ids, input_mask=None, lm_labels=None, mc_labels=None, token_type_ids=None, + position_ids=None): + input_shape = input_ids.size() # (B, C, F) + flat_input_ids = input_ids.view(-1, input_ids.size(-1)) + flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + flat_attention_mask = input_mask.view(-1, input_mask.size(-1)) if input_mask is not None else None + hidden_states, pooled_output = self.bert(flat_input_ids, flat_token_type_ids, flat_attention_mask, + output_all_encoded_layers=False) + + num_choices = input_shape[1] + output_shape = (input_shape) + (hidden_states.size(-1),) + hidden_states = hidden_states.view(*output_shape) + + lm_logits = self.lm_head(hidden_states) + mc_logits = self.multiple_choice_head(pooled_output, num_choices=num_choices) + losses = [] + if lm_labels is not None: + #bert is not a causal language model so the lm loss can't be defined. 
But I used it + # and for now it works pretty well + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = lm_labels[..., 1:].contiguous() + loss_fct = CrossEntropyLoss(ignore_index=-1) + losses.append(loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))) + if mc_labels is not None: + loss_fct = CrossEntropyLoss() + losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))) + if losses: + return losses + return lm_logits, mc_logits + + +# class BertOpenAIDoubleHeadsModel(BertPreTrainedModel): +# def __init__(self, config, output_attentions=False): +# super(BertOpenAIDoubleHeadsModel, self).__init__(config) +# self.bert = BertModel(config, output_attentions=output_attentions) +# self.transformer = OpenAIGPTModel(config, output_attentions=output_attentions) +# self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config) +# self.multiple_choice_head = BertMultipleChoice(config) +# self.apply(self.init_bert_weights) +# +# def forward(self, input_ids, mc_token_ids, input_mask=None, lm_labels=None, mc_labels=None, token_type_ids=None, +# position_ids=None): +# input_shape = input_ids.size() # (B, C, F) +# flat_input_ids = input_ids.view(-1, input_ids.size(-1)) +# flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None +# flat_attention_mask = input_mask.view(-1, input_mask.size(-1)) if input_mask is not None else None +# bert_hidden_states, pooled_output = self.bert(flat_input_ids, flat_token_type_ids, flat_attention_mask, +# output_all_encoded_layers=False) +# +# transformer_hidden_states = self.transformer(input_ids, position_ids, token_type_ids) +# +# num_choices = input_shape[1] +# # output_shape = (input_shape) + (hidden_states.size(-1),) +# # hidden_states = hidden_states.view(*output_shape) +# +# lm_logits = self.lm_head(transformer_hidden_states) +# mc_logits = self.multiple_choice_head(pooled_output, num_choices=num_choices) +# losses = [] +# if lm_labels is not None: +# shift_logits = lm_logits[..., :-1, :].contiguous() +# shift_labels = lm_labels[..., 1:].contiguous() +# loss_fct = CrossEntropyLoss(ignore_index=-1) +# losses.append(loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))) +# if mc_labels is not None: +# loss_fct = CrossEntropyLoss() +# losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))) +# if losses: +# return losses +# return lm_logits, mc_logits \ No newline at end of file diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py new file mode 100644 index 0000000..3d227a3 --- /dev/null +++ b/pytorch_pretrained_bert/modeling_gpt2.py @@ -0,0 +1,821 @@ +# coding=utf-8 +# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""PyTorch OpenAI GPT-2 model.""" + +from __future__ import absolute_import, division, print_function, unicode_literals + +import collections +import copy +import json +import logging +import math +import os +import shutil +import tarfile +import tempfile +import sys +from io import open + +import torch +import torch.nn as nn +from torch.nn import CrossEntropyLoss +from torch.nn.parameter import Parameter + +from .file_utils import cached_path, CONFIG_NAME, WEIGHTS_NAME +from .modeling import BertLayerNorm as LayerNorm + +logger = logging.getLogger(__name__) + +PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin", + "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin"} +PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json", + "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json"} + +def load_tf_weights_in_gpt2(model, gpt2_checkpoint_path): + """ Load tf checkpoints in a pytorch model + """ + try: + import re + import numpy as np + import tensorflow as tf + except ImportError: + print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions.") + raise + tf_path = os.path.abspath(gpt2_checkpoint_path) + print("Converting TensorFlow checkpoint from {}".format(tf_path)) + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + print("Loading TF weight {} with shape {}".format(name, shape)) + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array.squeeze()) + + for name, array in zip(names, arrays): + name = name[6:] # skip "model/" + name = name.split('/') + pointer = model + for m_name in name: + if re.fullmatch(r'[A-Za-z]+\d+', m_name): + l = re.split(r'(\d+)', m_name) + else: + l = [m_name] + if l[0] == 'w' or l[0] == 'g': + pointer = getattr(pointer, 'weight') + elif l[0] == 'b': + pointer = getattr(pointer, 'bias') + elif l[0] == 'wpe' or l[0] == 'wte': + pointer = getattr(pointer, l[0]) + pointer = getattr(pointer, 'weight') + else: + pointer = getattr(pointer, l[0]) + if len(l) >= 2: + num = int(l[1]) + pointer = pointer[num] + try: + assert pointer.shape == array.shape + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + print("Initialize PyTorch weight {}".format(name)) + pointer.data = torch.from_numpy(array) + return model + + +def gelu(x): + return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + + +class GPT2Config(object): + """Configuration class to store the configuration of a `GPT2Model`. + """ + + def __init__( + self, + vocab_size_or_config_json_file=50257, + n_special=0, + n_positions=1024, + n_ctx=1024, + n_embd=768, + n_layer=12, + n_head=12, + resid_pdrop=0.1, + embd_pdrop=0.1, + attn_pdrop=0.1, + layer_norm_epsilon=1e-5, + initializer_range=0.02, + predict_special_tokens=True + ): + """Constructs GPT2Config. + + Args: + vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file. + n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...) + n_positions: Number of positional embeddings. + n_ctx: Size of the causal mask (usually same as n_positions). 
+ n_embd: Dimensionality of the embeddings and hidden states. + n_layer: Number of hidden layers in the Transformer encoder. + n_head: Number of attention heads for each attention layer in + the Transformer encoder. + layer_norm_epsilon: epsilon to use in the layer norm layers + resid_pdrop: The dropout probabilitiy for all fully connected + layers in the embeddings, encoder, and pooler. + attn_pdrop: The dropout ratio for the attention + probabilities. + embd_pdrop: The dropout ratio for the embeddings. + initializer_range: The sttdev of the truncated_normal_initializer for + initializing all weight matrices. + predict_special_tokens: should we predict special tokens (when the model has a LM head) + """ + if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 + and isinstance(vocab_size_or_config_json_file, unicode)): + with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif isinstance(vocab_size_or_config_json_file, int): + self.vocab_size = vocab_size_or_config_json_file + self.n_special = n_special + self.n_ctx = n_ctx + self.n_positions = n_positions + self.n_embd = n_embd + self.n_layer = n_layer + self.n_head = n_head + self.resid_pdrop = resid_pdrop + self.embd_pdrop = embd_pdrop + self.attn_pdrop = attn_pdrop + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + self.predict_special_tokens = predict_special_tokens + else: + raise ValueError( + "First argument must be either a vocabulary size (int)" + "or the path to a pretrained model config file (str)" + ) + + @property + def total_tokens_embeddings(self): + return self.vocab_size + self.n_special + + @classmethod + def from_dict(cls, json_object): + """Constructs a `GPT2Config` from a Python dictionary of parameters.""" + config = GPT2Config(vocab_size_or_config_json_file=-1) + for key, value in json_object.items(): + config.__dict__[key] = value + return config + + @classmethod + def from_json_file(cls, json_file): + """Constructs a `GPT2Config` from a json file of parameters.""" + with open(json_file, "r", encoding="utf-8") as reader: + text = reader.read() + return cls.from_dict(json.loads(text)) + + def __repr__(self): + return str(self.to_json_string()) + + def to_dict(self): + """Serializes this instance to a Python dictionary.""" + output = copy.deepcopy(self.__dict__) + return output + + def to_json_string(self): + """Serializes this instance to a JSON string.""" + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" + + def to_json_file(self, json_file_path): + """ Save this instance to a json file.""" + with open(json_file_path, "w", encoding='utf-8') as writer: + writer.write(self.to_json_string()) + + +class Conv1D(nn.Module): + def __init__(self, nf, nx): + super(Conv1D, self).__init__() + self.nf = nf + w = torch.empty(nx, nf) + nn.init.normal_(w, std=0.02) + self.weight = Parameter(w) + self.bias = Parameter(torch.zeros(nf)) + + def forward(self, x): + size_out = x.size()[:-1] + (self.nf,) + x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight) + x = x.view(*size_out) + return x + + +class Attention(nn.Module): + def __init__(self, nx, n_ctx, config, scale=False, output_attentions=False): + super(Attention, self).__init__() + n_state = nx # in Attention: n_state=768 (nx=n_embd) + # [switch nx => n_state from Block to Attention to keep identical to TF implem] + assert n_state % config.n_head 
== 0 + self.register_buffer("bias", torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx)) + self.n_head = config.n_head + self.split_size = n_state + self.scale = scale + self.output_attentions = output_attentions + self.c_attn = Conv1D(n_state * 3, nx) + self.c_proj = Conv1D(n_state, nx) + self.attn_dropout = nn.Dropout(config.attn_pdrop) + self.resid_dropout = nn.Dropout(config.resid_pdrop) + + def _attn(self, q, k, v): + w = torch.matmul(q, k) + if self.scale: + w = w / math.sqrt(v.size(-1)) + nd, ns = w.size(-2), w.size(-1) + b = self.bias[:, :, ns-nd:ns, :ns] + w = w * b - 1e4 * (1 - b) + + w = nn.Softmax(dim=-1)(w) + w = self.attn_dropout(w) + if self.output_attentions: + return w, torch.matmul(w, v) + return torch.matmul(w, v) + + def merge_heads(self, x): + x = x.permute(0, 2, 1, 3).contiguous() + new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),) + return x.view(*new_x_shape) # in Tensorflow implem: fct merge_states + + def split_heads(self, x, k=False): + new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head) + x = x.view(*new_x_shape) # in Tensorflow implem: fct split_states + if k: + return x.permute(0, 2, 3, 1) # (batch, head, head_features, seq_length) + else: + return x.permute(0, 2, 1, 3) # (batch, head, seq_length, head_features) + + def forward(self, x, layer_past=None): + x = self.c_attn(x) + query, key, value = x.split(self.split_size, dim=2) + query = self.split_heads(query) + key = self.split_heads(key, k=True) + value = self.split_heads(value) + if layer_past is not None: + past_key, past_value = layer_past[0].transpose(-2, -1), layer_past[1] # transpose back cf below + key = torch.cat((past_key, key), dim=-1) + value = torch.cat((past_value, value), dim=-2) + present = torch.stack((key.transpose(-2, -1), value)) # transpose to have same shapes for stacking + a = self._attn(query, key, value) + if self.output_attentions: + attentions, a = a + a = self.merge_heads(a) + a = self.c_proj(a) + a = self.resid_dropout(a) + if self.output_attentions: + return attentions, a, present + return a, present + + +class MLP(nn.Module): + def __init__(self, n_state, config): # in MLP: n_state=3072 (4 * n_embd) + super(MLP, self).__init__() + nx = config.n_embd + self.c_fc = Conv1D(n_state, nx) + self.c_proj = Conv1D(nx, n_state) + self.act = gelu + self.dropout = nn.Dropout(config.resid_pdrop) + + def forward(self, x): + h = self.act(self.c_fc(x)) + h2 = self.c_proj(h) + return self.dropout(h2) + + +class Block(nn.Module): + def __init__(self, n_ctx, config, scale=False, output_attentions=False): + super(Block, self).__init__() + nx = config.n_embd + self.output_attentions = output_attentions + self.ln_1 = LayerNorm(nx, eps=config.layer_norm_epsilon) + self.attn = Attention(nx, n_ctx, config, scale, output_attentions) + self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon) + self.mlp = MLP(4 * nx, config) + + def forward(self, x, layer_past=None): + output_attn = self.attn(self.ln_1(x), layer_past=layer_past) + if self.output_attentions: + attentions, a, present = output_attn + else: + a, present = output_attn + x = x + a + m = self.mlp(self.ln_2(x)) + x = x + m + if self.output_attentions: + return attentions, x, present + return x, present + + +class GPT2LMHead(nn.Module): + """ Language Model Head for the transformer """ + + def __init__(self, model_embeddings_weights, config): + super(GPT2LMHead, self).__init__() + self.n_embd = config.n_embd + self.vocab_size = config.vocab_size + self.predict_special_tokens = config.predict_special_tokens + 
embed_shape = model_embeddings_weights.shape + self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False) + self.set_embeddings_weights(model_embeddings_weights) + + def set_embeddings_weights(self, model_embeddings_weights, predict_special_tokens=True): + self.predict_special_tokens = predict_special_tokens + self.decoder.weight = model_embeddings_weights # Tied weights + + def forward(self, hidden_state): + lm_logits = self.decoder(hidden_state) + if not self.predict_special_tokens: + lm_logits = lm_logits[..., :self.vocab_size] + return lm_logits + + +class GPT2MultipleChoiceHead(nn.Module): + """ Classifier Head for the transformer """ + + def __init__(self, config): + super(GPT2MultipleChoiceHead, self).__init__() + self.n_embd = config.n_embd + self.dropout = nn.Dropout2d(config.resid_pdrop) # To reproduce the noise_shape parameter of TF implementation + self.linear = nn.Linear(config.n_embd, 1) + + nn.init.normal_(self.linear.weight, std=0.02) + nn.init.normal_(self.linear.bias, 0) + + def forward(self, hidden_states, mc_token_ids): + # Classification logits + # hidden_state (bsz, num_choices, seq_length, hidden_size) + # mc_token_ids (bsz, num_choices) + mc_token_ids = mc_token_ids.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, -1, hidden_states.size(-1)) + # (bsz, num_choices, 1, hidden_size) + multiple_choice_h = hidden_states.gather(2, mc_token_ids).squeeze(2) + # (bsz, num_choices, hidden_size) + multiple_choice_h = self.dropout(multiple_choice_h.transpose(1, 2)).transpose(1, 2) + multiple_choice_logits = self.linear(multiple_choice_h).squeeze(-1) + # (bsz, num_choices) + return multiple_choice_logits + + +class GPT2PreTrainedModel(nn.Module): + """ An abstract class to handle weights initialization and + a simple interface for dowloading and loading pretrained models. + """ + + def __init__(self, config, *inputs, **kwargs): + super(GPT2PreTrainedModel, self).__init__() + if not isinstance(config, GPT2Config): + raise ValueError( + "Parameter config in `{}(config)` should be an instance of class `GPT2Config`. " + "To create a model from a pretrained model use " + "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( + self.__class__.__name__, self.__class__.__name__ + ) + ) + self.config = config + + def init_weights(self, module): + """ Initialize the weights. + """ + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + elif isinstance(module, LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): + """ + Instantiate a GPT2PreTrainedModel from a pre-trained model file or a pytorch state dict. + Download and cache the pre-trained model file if needed. + + Params: + pretrained_model_name_or_path: either: + - a str with the name of a pre-trained model to load selected in the list of: + . `gpt2` + - a path or url to a pretrained model archive containing: + . `gpt2_config.json` a configuration file for the model + . `pytorch_model.bin` a PyTorch dump of a GPT2Model instance + - a path or url to a pretrained model archive containing: + . `gpt2_config.json` a configuration file for the model + . 
a TensorFlow checkpoint with trained weights + from_tf: should we load the weights from a locally saved TensorFlow checkpoint + cache_dir: an optional path to a folder in which the pre-trained models will be cached. + state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models + *inputs, **kwargs: additional input for the specific GPT2 class + """ + state_dict = kwargs.get('state_dict', None) + kwargs.pop('state_dict', None) + cache_dir = kwargs.get('cache_dir', None) + kwargs.pop('cache_dir', None) + from_tf = kwargs.get('from_tf', False) + kwargs.pop('from_tf', None) + num_special_tokens = kwargs.get('num_special_tokens', None) + kwargs.pop('num_special_tokens', None) + + if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP: + archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path] + config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path] + else: + archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) + config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME) + # redirect to the cache, if necessary + try: + resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir) + resolved_config_file = cached_path(config_file, cache_dir=cache_dir) + except EnvironmentError: + logger.error( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url but couldn't find files {} and {} " + "at this path or url.".format( + pretrained_model_name_or_path, ", ".join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), pretrained_model_name_or_path, + archive_file, config_file + ) + ) + return None + if resolved_archive_file == archive_file and resolved_config_file == config_file: + logger.info("loading weights file {}".format(archive_file)) + logger.info("loading configuration file {}".format(config_file)) + else: + logger.info("loading weights file {} from cache at {}".format( + archive_file, resolved_archive_file)) + logger.info("loading configuration file {} from cache at {}".format( + config_file, resolved_config_file)) + # Load config + config = GPT2Config.from_json_file(resolved_config_file) + logger.info("Model config {}".format(config)) + # Instantiate model. 
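+        # What follows: build the model from the config, then either convert a TensorFlow
+        # checkpoint directly or load the PyTorch state dict, renaming legacy parameter
+        # suffixes (".g"/".w" -> ".weight", ".b" -> ".bias") before copying the tensors
+        # into the module tree. If the checkpoint keys are not prefixed with "transformer."
+        # they are loaded into the bare transformer, and special-token embeddings are
+        # (re)allocated at the end so the input and output embeddings stay tied.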
+ model = cls(config, *inputs, **kwargs) + if state_dict is None and not from_tf: + state_dict = torch.load(resolved_archive_file, map_location='cpu') + if from_tf: + # Directly load from a TensorFlow checkpoint (stored as NumPy array) + return load_tf_weights_in_gpt2(model, resolved_archive_file) + + old_keys = [] + new_keys = [] + for key in state_dict.keys(): + new_key = None + if key.endswith(".g"): + new_key = key[:-2] + ".weight" + elif key.endswith(".b"): + new_key = key[:-2] + ".bias" + elif key.endswith(".w"): + new_key = key[:-2] + ".weight" + if new_key: + old_keys.append(key) + new_keys.append(new_key) + for old_key, new_key in zip(old_keys, new_keys): + state_dict[new_key] = state_dict.pop(old_key) + + missing_keys = [] + unexpected_keys = [] + error_msgs = [] + # copy state_dict so _load_from_state_dict can modify it + metadata = getattr(state_dict, "_metadata", None) + state_dict = state_dict.copy() + if metadata is not None: + state_dict._metadata = metadata + + def load(module, prefix=""): + local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) + module._load_from_state_dict( + state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs + ) + for name, child in module._modules.items(): + if child is not None: + load(child, prefix + name + ".") + + start_model = model + if hasattr(model, "transformer") and all(not s.startswith('transformer.') for s in state_dict.keys()): + start_model = model.transformer + load(start_model, prefix="") + + if len(missing_keys) > 0: + logger.info( + "Weights of {} not initialized from pretrained model: {}".format(model.__class__.__name__, missing_keys) + ) + if len(unexpected_keys) > 0: + logger.info( + "Weights from pretrained model not used in {}: {}".format(model.__class__.__name__, unexpected_keys) + ) + if len(error_msgs) > 0: + raise RuntimeError( + "Error(s) in loading state_dict for {}:\n\t{}".format(model.__class__.__name__, "\n\t".join(error_msgs)) + ) + + # Add additional embeddings for special tokens if needed + # This step also make sure we are still sharing the output and input embeddings after loading weights + model.set_num_special_tokens(num_special_tokens if num_special_tokens is not None else config.n_special) + return model + + +class GPT2Model(GPT2PreTrainedModel): + """OpenAI GPT-2 model ("Language Models are Unsupervised Multitask Learners"). + + GPT-2 use a single embedding matrix to store the word and special embeddings. + Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]... + Special tokens need to be trained during the fine-tuning if you use them. + The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function. + + The embeddings are ordered as follow in the token embeddings matrice: + [0, ---------------------- + ... -> word embeddings + config.vocab_size - 1, ______________________ + config.vocab_size, + ... -> special embeddings + config.vocab_size + config.n_special - 1] ______________________ + + where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is: + total_tokens_embeddings = config.vocab_size + config.n_special + You should use the associate indices to index the embeddings. + + Params: + config: a GPT2Config class instance with the configuration to build a new model + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length] + were d_1 ... 
d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[ + `position_ids`: an optional torch.LongTensor with the same shape as input_ids + with the position indices (selected in the range [0, config.n_positions - 1[. + `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids + You can use it to add a third type of embedding to each input token in the sequence + (the previous two being the word and position embeddings). + The input, position and token_type embeddings are summed inside the Transformer before the first + self-attention block. + `past`: an optional list of torch.LongTensor that contains pre-computed hidden-states + (key and values in the attention blocks) to speed up sequential decoding + (this is the presents output of the model, cf. below). + + Outputs a tuple consisting of: + `hidden_states`: the encoded-hidden-states at the top of the model + as a torch.FloatTensor of size [batch_size, sequence_length, hidden_size] + (or more generally [d_1, ..., d_n, hidden_size] were d_1 ... d_n are the dimension of input_ids) + `presents`: a list of pre-computed hidden-states (key and values in each attention blocks) as + torch.FloatTensors. They can be reused to speed up sequential decoding. + + Example usage: + ```python + # Already been converted into BPE token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + + config = modeling_gpt2.GPT2Config() + + model = modeling_gpt2.GPT2Model(config) + hidden_states, presents = model(input_ids) + ``` + """ + + def __init__(self, config, output_attentions=False): + super(GPT2Model, self).__init__(config) + self.output_attentions = output_attentions + self.wte = nn.Embedding(config.total_tokens_embeddings, config.n_embd) + self.wpe = nn.Embedding(config.n_positions, config.n_embd) + self.drop = nn.Dropout(config.embd_pdrop) + block = Block(config.n_ctx, config, scale=True, output_attentions=output_attentions) + self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(config.n_layer)]) + self.ln_f = LayerNorm(config.n_embd, eps=config.layer_norm_epsilon) + + self.apply(self.init_weights) + + def set_num_special_tokens(self, num_special_tokens): + " Update input embeddings with new embedding matrice if needed " + if self.config.n_special == num_special_tokens: + return + # Update config + self.config.n_special = num_special_tokens + # Build new embeddings and initialize all new embeddings (in particular the special tokens) + old_embed = self.wte + self.wte = nn.Embedding(self.config.total_tokens_embeddings, self.config.n_embd) + self.wte.to(old_embed.weight.device) + self.init_weights(self.wte) + # Copy word embeddings from the previous weights + self.wte.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :] + + def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None): + if past is None: + past_length = 0 + past = [None] * len(self.h) + else: + past_length = past[0][0].size(-2) + if position_ids is None: + position_ids = torch.arange(past_length, input_ids.size(-1) + past_length, dtype=torch.long, device=input_ids.device) + position_ids = position_ids.unsqueeze(0).expand_as(input_ids) + + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_ids.size(-1)) + position_ids = position_ids.view(-1, position_ids.size(-1)) + + inputs_embeds = self.wte(input_ids) + position_embeds = self.wpe(position_ids) + if token_type_ids is not None: + token_type_ids = token_type_ids.view(-1, 
token_type_ids.size(-1)) + token_type_embeds = self.wte(token_type_ids) + else: + token_type_embeds = 0 + hidden_states = inputs_embeds + position_embeds + token_type_embeds + hidden_states = self.drop(hidden_states) + + presents = [] + all_attentions = [] + for block, layer_past in zip(self.h, past): + if self.output_attentions: + attentions, hidden_states, present = block(hidden_states, layer_past) + all_attentions.append(attentions) + else: + hidden_states, present = block(hidden_states, layer_past) + presents.append(present) + hidden_states = self.ln_f(hidden_states) + output_shape = input_shape + (hidden_states.size(-1),) + if self.output_attentions: + return all_attentions, hidden_states.view(*output_shape), presents + return hidden_states.view(*output_shape), presents + + +class GPT2LMHeadModel(GPT2PreTrainedModel): + """OpenAI GPT-2 model with a Language Modeling head ("Language Models are Unsupervised Multitask Learners"). + + Params: + config: a GPT2Config class instance with the configuration to build a new model + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length] + were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[ + `position_ids`: an optional torch.LongTensor with the same shape as input_ids + with the position indices (selected in the range [0, config.n_positions - 1[. + `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids + You can use it to add a third type of embedding to each input token in the sequence + (the previous two being the word and position embeddings). + The input, position and token_type embeddings are summed inside the Transformer before the first + self-attention block. + `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] + with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss + is only computed for the labels set in [0, ..., vocab_size] + `past`: an optional list of torch.LongTensor that contains pre-computed hidden-states + (key and values in the attention blocks) to speed up sequential decoding + (this is the presents output of the model, cf. below). + + Outputs: + if `lm_labels` is not `None`: + Outputs the language modeling loss. + else a tuple: + `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, sequence_length, config.vocab_size] + (or more generally [d_1, ..., d_n, config.vocab_size] were d_1 ... d_n are the dimension of input_ids) + `presents`: a list of pre-computed hidden-states (key and values in each attention blocks) as + torch.FloatTensors. They can be reused to speed up sequential decoding. 
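+    Decoding note: the `presents` returned by a forward pass can be fed back as `past` on the
+    next step, so only the newly generated token has to be processed. A minimal greedy decoding
+    sketch, reusing `model` and `input_ids` from the usage example below (the helper variable
+    names are illustrative):
+    ```python
+    lm_logits, presents = model(input_ids)                    # run the full prompt once
+    next_token = lm_logits[:, -1, :].argmax(dim=-1, keepdim=True)
+    lm_logits, presents = model(next_token, past=presents)    # only the new token is processed
+    ```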
+ + Example usage: + ```python + # Already been converted into BPE token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + + config = modeling_gpt2.GPT2Config() + + model = modeling_gpt2.GPT2LMHeadModel(config) + lm_logits, presents = model(input_ids) + ``` + """ + + def __init__(self, config, output_attentions=False): + super(GPT2LMHeadModel, self).__init__(config) + self.transformer = GPT2Model(config, output_attentions=output_attentions) + self.lm_head = GPT2LMHead(self.transformer.wte.weight, config) + self.apply(self.init_weights) + + def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True): + """ Update input and output embeddings with new embedding matrice + Make sure we are sharing the embeddings + """ + self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens + self.transformer.set_num_special_tokens(num_special_tokens) + self.lm_head.set_embeddings_weights(self.transformer.wte.weight, predict_special_tokens=predict_special_tokens) + + def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None): + transformer_output = self.transformer(input_ids, position_ids, token_type_ids, past) + if self.transformer.output_attentions: + all_attentions, hidden_states, presents = transformer_output + else: + hidden_states, presents = transformer_output + lm_logits = self.lm_head(hidden_states) + if lm_labels is not None: + # Shift so that tokens < n predict n + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = lm_labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss(ignore_index=-1) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), + shift_labels.view(-1)) + return loss + if self.transformer.output_attentions: + return all_attentions, lm_logits, presents + return lm_logits, presents + + +class GPT2DoubleHeadsModel(GPT2PreTrainedModel): + """OpenAI GPT-2 model with a Language Modeling and a Multiple Choice head ("Language Models are Unsupervised Multitask Learners"). + + Params: + config: a GPT2Config class instance with the configuration to build a new model + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] with the BPE token + indices selected in the range [0, config.vocab_size[ + `mc_token_ids`: a torch.LongTensor of shape [batch_size, num_choices] with the index of the token from + which we should take the hidden state to feed the multiple choice classifier (usually last token of the sequence) + `position_ids`: an optional torch.LongTensor with the same shape as input_ids + with the position indices (selected in the range [0, config.n_positions - 1[. + `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids + You can use it to add a third type of embedding to each input token in the sequence + (the previous two being the word and position embeddings). + The input, position and token_type embeddings are summed inside the Transformer before the first + self-attention block. + `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, num_choices, sequence_length] + with indices selected in [-1, 0, ..., config.vocab_size]. All labels set to -1 are ignored (masked), the loss + is only computed for the labels set in [0, ..., config.vocab_size] + `multiple_choice_labels`: optional multiple choice labels: torch.LongTensor of shape [batch_size] + with indices selected in [0, ..., num_choices]. 
+ `past`: an optional list of torch.LongTensor that contains pre-computed hidden-states + (key and values in the attention blocks) to speed up sequential decoding + (this is the presents output of the model, cf. below). + + Outputs: + if `lm_labels` and `multiple_choice_labels` are not `None`: + Outputs a tuple of losses with the language modeling loss and the multiple choice loss. + else: a tuple with + `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, num_choices, sequence_length, config.vocab_size] + `multiple_choice_logits`: the multiple choice logits as a torch.FloatTensor of size [batch_size, num_choices] + `presents`: a list of pre-computed hidden-states (key and values in each attention blocks) as + torch.FloatTensors. They can be reused to speed up sequential decoding. + + Example usage: + ```python + # Already been converted into BPE token ids + input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]]]) # (bsz, number of choice, seq length) + mc_token_ids = torch.LongTensor([[2], [1]]) # (bsz, number of choice) + + config = modeling_gpt2.GPT2Config() + + model = modeling_gpt2.GPT2DoubleHeadsModel(config) + lm_logits, multiple_choice_logits, presents = model(input_ids, mc_token_ids) + ``` + """ + + def __init__(self, config, output_attentions=False): + super(GPT2DoubleHeadsModel, self).__init__(config) + self.transformer = GPT2Model(config, output_attentions=output_attentions) + self.lm_head = GPT2LMHead(self.transformer.wte.weight, config) + self.multiple_choice_head = GPT2MultipleChoiceHead(config) + self.apply(self.init_weights) + + def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True): + """ Update input and output embeddings with new embedding matrice + Make sure we are sharing the embeddings + """ + self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens + self.transformer.set_num_special_tokens(num_special_tokens) + self.lm_head.set_embeddings_weights(self.transformer.wte.weight, predict_special_tokens=predict_special_tokens) + + def forward(self, input_ids, mc_token_ids, lm_labels=None, mc_labels=None, token_type_ids=None, position_ids=None, past=None): + transformer_output = self.transformer(input_ids, position_ids, token_type_ids, past) + if self.transformer.output_attentions: + all_attentions, hidden_states, presents = transformer_output + else: + hidden_states, presents = transformer_output + lm_logits = self.lm_head(hidden_states) + mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids) + losses = [] + if lm_labels is not None: + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = lm_labels[..., 1:].contiguous() + loss_fct = CrossEntropyLoss(ignore_index=-1) + losses.append(loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))) + if mc_labels is not None: + loss_fct = CrossEntropyLoss() + losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))) + if losses: + return losses + if self.transformer.output_attentions: + return all_attentions, lm_logits, mc_logits, presents + return lm_logits, mc_logits, presents diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py new file mode 100644 index 0000000..9aa5f77 --- /dev/null +++ b/pytorch_pretrained_bert/modeling_openai.py @@ -0,0 +1,1089 @@ +# coding=utf-8 +# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch OpenAI GPT model.""" + +from __future__ import absolute_import, division, print_function, unicode_literals + +import collections +import copy +import json +import logging +import math +import os +import shutil +import tarfile +import tempfile +import sys +from io import open + +import torch +import torch.nn as nn +from torch.nn import CrossEntropyLoss +from torch.nn.parameter import Parameter + +from .file_utils import cached_path, CONFIG_NAME, WEIGHTS_NAME +from .modeling import BertLayerNorm as LayerNorm + +logger = logging.getLogger(__name__) + +PRETRAINED_MODEL_ARCHIVE_MAP = { + "openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-pytorch_model.bin"} +PRETRAINED_CONFIG_ARCHIVE_MAP = { + "openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-config.json"} + + +def load_tf_weights_in_openai_gpt(model, openai_checkpoint_folder_path): + """ Load tf pre-trained weights in a pytorch model (from NumPy arrays here) + """ + import re + import numpy as np + print("Loading weights...") + names = json.load(open(openai_checkpoint_folder_path + '/parameters_names.json', "r", encoding='utf-8')) + shapes = json.load(open(openai_checkpoint_folder_path + '/params_shapes.json', "r", encoding='utf-8')) + offsets = np.cumsum([np.prod(shape) for shape in shapes]) + init_params = [np.load(openai_checkpoint_folder_path + '/params_{}.npy'.format(n)) for n in range(10)] + init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1] + init_params = [param.reshape(shape) for param, shape in zip(init_params, shapes)] + + # This was used when we had a single embedding matrix for positions and tokens + # init_params[0] = np.concatenate([init_params[1], init_params[0]], 0) + # del init_params[1] + init_params = [arr.squeeze() for arr in init_params] + + try: + assert model.tokens_embed.weight.shape == init_params[1].shape + assert model.positions_embed.weight.shape == init_params[0].shape + except AssertionError as e: + e.args += (model.tokens_embed.weight.shape, init_params[1].shape) + e.args += (model.positions_embed.weight.shape, init_params[0].shape) + raise + + model.tokens_embed.weight.data = torch.from_numpy(init_params[1]) + model.positions_embed.weight.data = torch.from_numpy(init_params[0]) + names.pop(0) + # Pop position and token embedding arrays + init_params.pop(0) + init_params.pop(0) + + for name, array in zip(names, init_params): # names[1:n_transfer], init_params[1:n_transfer]): + name = name[6:] # skip "model/" + assert name[-2:] == ":0" + name = name[:-2] + name = name.split('/') + pointer = model + for m_name in name: + if re.fullmatch(r'[A-Za-z]+\d+', m_name): + l = re.split(r'(\d+)', m_name) + else: + l = [m_name] + if l[0] == 'g': + pointer = getattr(pointer, 'weight') + elif l[0] == 'b': + pointer = getattr(pointer, 'bias') + elif l[0] == 'w': + pointer = getattr(pointer, 'weight') + else: + pointer = getattr(pointer, l[0]) + if len(l) >= 2: + num = int(l[1]) + pointer = 
pointer[num] + try: + assert pointer.shape == array.shape + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + print("Initialize PyTorch weight {}".format(name)) + pointer.data = torch.from_numpy(array) + return model + + + def gelu(x): + return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + + + def swish(x): + return x * torch.sigmoid(x) + + + ACT_FNS = {"relu": nn.ReLU, "swish": swish, "gelu": gelu} + + + class OpenAIGPTConfig(object): + """Configuration class to store the configuration of a `OpenAIGPTModel`. + """ + + def __init__( + self, + vocab_size_or_config_json_file=40478, + n_special=0, + n_positions=512, + n_ctx=512, + n_embd=768, + n_layer=12, + n_head=12, + afn="gelu", + resid_pdrop=0.1, + embd_pdrop=0.1, + attn_pdrop=0.1, + layer_norm_epsilon=1e-5, + initializer_range=0.02, + predict_special_tokens=True + ): + """Constructs OpenAIGPTConfig. + + Args: + vocab_size_or_config_json_file: Vocabulary size of `input_ids` in `OpenAIGPTModel` or a configuration json file. + n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...) + n_positions: Number of positional embeddings. + n_ctx: Size of the causal mask (usually same as n_positions). + n_embd: Dimensionality of the embeddings and hidden states. + n_layer: Number of hidden layers in the Transformer encoder. + n_head: Number of attention heads for each attention layer in + the Transformer encoder. + afn: The non-linear activation function (function or string) in the + encoder and pooler. If string, "gelu", "relu" and "swish" are supported. + resid_pdrop: The dropout probability for all fully connected + layers in the embeddings, encoder, and pooler. + attn_pdrop: The dropout ratio for the attention + probabilities. + embd_pdrop: The dropout ratio for the embeddings. + layer_norm_epsilon: epsilon to use in the layer norm layers + initializer_range: The stddev of the truncated_normal_initializer for + initializing all weight matrices.
+ predict_special_tokens: should we predict special tokens (when the model has a LM head) + """ + if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 + and isinstance(vocab_size_or_config_json_file, unicode)): + with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif isinstance(vocab_size_or_config_json_file, int): + self.vocab_size = vocab_size_or_config_json_file + self.n_special = n_special + self.n_ctx = n_ctx + self.n_positions = n_positions + self.n_embd = n_embd + self.n_layer = n_layer + self.n_head = n_head + self.afn = afn + self.resid_pdrop = resid_pdrop + self.embd_pdrop = embd_pdrop + self.attn_pdrop = attn_pdrop + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + self.predict_special_tokens = predict_special_tokens + else: + raise ValueError( + "First argument must be either a vocabulary size (int)" + "or the path to a pretrained model config file (str)" + ) + + @property + def total_tokens_embeddings(self): + return self.vocab_size + self.n_special + + @classmethod + def from_dict(cls, json_object): + """Constructs a `OpenAIGPTConfig` from a Python dictionary of parameters.""" + config = OpenAIGPTConfig(vocab_size_or_config_json_file=-1) + for key, value in json_object.items(): + config.__dict__[key] = value + return config + + @classmethod + def from_json_file(cls, json_file): + """Constructs a `OpenAIGPTConfig` from a json file of parameters.""" + with open(json_file, "r", encoding="utf-8") as reader: + text = reader.read() + return cls.from_dict(json.loads(text)) + + def __repr__(self): + return str(self.to_json_string()) + + def to_dict(self): + """Serializes this instance to a Python dictionary.""" + output = copy.deepcopy(self.__dict__) + return output + + def to_json_string(self): + """Serializes this instance to a JSON string.""" + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" + + def to_json_file(self, json_file_path): + """ Save this instance to a json file.""" + with open(json_file_path, "w", encoding='utf-8') as writer: + writer.write(self.to_json_string()) + + +class Conv1D(nn.Module): + def __init__(self, nf, rf, nx): + super(Conv1D, self).__init__() + self.rf = rf + self.nf = nf + if rf == 1: # faster 1x1 conv + w = torch.empty(nx, nf) + nn.init.normal_(w, std=0.02) + self.weight = Parameter(w) + self.bias = Parameter(torch.zeros(nf)) + else: # was used to train LM + raise NotImplementedError + + def forward(self, x): + if self.rf == 1: + size_out = x.size()[:-1] + (self.nf,) + x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight) + x = x.view(*size_out) + else: + raise NotImplementedError + return x + + +class Attention(nn.Module): + def __init__(self, nx, n_ctx, config, scale=False, output_attentions=False): + super(Attention, self).__init__() + n_state = nx # in Attention: n_state=768 (nx=n_embd) + # [switch nx => n_state from Block to Attention to keep identical to TF implem] + assert n_state % config.n_head == 0 + self.register_buffer("bias", torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx)) + self.n_head = config.n_head + self.split_size = n_state + self.scale = scale + self.output_attentions = output_attentions + self.c_attn = Conv1D(n_state * 3, 1, nx) # (out_channels, size_conv, in_channels) + self.c_proj = Conv1D(n_state, 1, nx) + self.attn_dropout = nn.Dropout(config.attn_pdrop) + self.resid_dropout = 
nn.Dropout(config.resid_pdrop) + + def _attn(self, q, k, v): + w = torch.matmul(q, k) + if self.scale: + w = w / math.sqrt(v.size(-1)) + # w = w * self.bias + -1e9 * (1 - self.bias) # TF implem method: mask_attn_weights + # XD: self.b may be larger than w, so we need to crop it + b = self.bias[:, :, : w.size(-2), : w.size(-1)] + w = w * b + -1e9 * (1 - b) + + w = nn.Softmax(dim=-1)(w) + w = self.attn_dropout(w) + if self.output_attentions: + return w, torch.matmul(w, v) + return torch.matmul(w, v) + + def merge_heads(self, x): + x = x.permute(0, 2, 1, 3).contiguous() + new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),) + return x.view(*new_x_shape) # in Tensorflow implem: fct merge_states + + def split_heads(self, x, k=False): + new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head) + x = x.view(*new_x_shape) # in Tensorflow implem: fct split_states + if k: + return x.permute(0, 2, 3, 1) + else: + return x.permute(0, 2, 1, 3) + + def forward(self, x): + x = self.c_attn(x) + query, key, value = x.split(self.split_size, dim=2) + query = self.split_heads(query) + key = self.split_heads(key, k=True) + value = self.split_heads(value) + a = self._attn(query, key, value) + if self.output_attentions: + attentions, a = a + a = self.merge_heads(a) + a = self.c_proj(a) + a = self.resid_dropout(a) + if self.output_attentions: + return attentions, a + return a + + +class MLP(nn.Module): + def __init__(self, n_state, config): # in MLP: n_state=3072 (4 * n_embd) + super(MLP, self).__init__() + nx = config.n_embd + self.c_fc = Conv1D(n_state, 1, nx) + self.c_proj = Conv1D(nx, 1, n_state) + self.act = ACT_FNS[config.afn] + self.dropout = nn.Dropout(config.resid_pdrop) + + def forward(self, x): + h = self.act(self.c_fc(x)) + h2 = self.c_proj(h) + return self.dropout(h2) + + +class Block(nn.Module): + def __init__(self, n_ctx, config, scale=False, output_attentions=False): + super(Block, self).__init__() + nx = config.n_embd + self.output_attentions = output_attentions + self.attn = Attention(nx, n_ctx, config, scale, output_attentions) + self.ln_1 = LayerNorm(nx, eps=config.layer_norm_epsilon) + self.mlp = MLP(4 * nx, config) + self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon) + + def forward(self, x): + a = self.attn(x) + if self.output_attentions: + attentions, a = a + n = self.ln_1(x + a) + m = self.mlp(n) + h = self.ln_2(n + m) + if self.output_attentions: + return attentions, h + return h + + +class OpenAIGPTLMHead(nn.Module): + """ Language Model Head for the transformer """ + + def __init__(self, model_embeddings_weights, config): + super(OpenAIGPTLMHead, self).__init__() + self.n_embd = config.n_embd + self.vocab_size = config.vocab_size + self.predict_special_tokens = config.predict_special_tokens + embed_shape = model_embeddings_weights.shape + self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False) + self.set_embeddings_weights(model_embeddings_weights) + + def set_embeddings_weights(self, model_embeddings_weights, predict_special_tokens=True): + self.predict_special_tokens = predict_special_tokens + embed_shape = model_embeddings_weights.shape + self.decoder.weight = model_embeddings_weights # Tied weights + + def forward(self, hidden_state): + lm_logits = self.decoder(hidden_state) + if not self.predict_special_tokens: + lm_logits = lm_logits[..., :self.vocab_size] + return lm_logits + + +class OpenAIGPTMultipleChoiceHead(nn.Module): + """ Classifier Head for the transformer """ + + def __init__(self, config): + super(OpenAIGPTMultipleChoiceHead, 
self).__init__() + self.n_embd = config.n_embd + self.dropout = nn.Dropout2d(config.resid_pdrop) # To reproduce the noise_shape parameter of TF implementation + self.linear = nn.Linear(config.n_embd, 1) + + nn.init.normal_(self.linear.weight, std=0.02) + nn.init.normal_(self.linear.bias, 0) + + def forward(self, hidden_states, mc_token_ids): + # Classification logits + # hidden_state (bsz, num_choices, seq_length, hidden_size) + # mc_token_ids (bsz, num_choices) + mc_token_ids = mc_token_ids.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, -1, hidden_states.size(-1)) + # mc_token_ids (bsz, num_choices, 1, hidden_size) + multiple_choice_h = hidden_states.gather(2, mc_token_ids).squeeze(2) + # multiple_choice_h (bsz, num_choices, hidden_size) + multiple_choice_h = self.dropout(multiple_choice_h.transpose(1, 2)).transpose(1, 2) + multiple_choice_logits = self.linear(multiple_choice_h).squeeze(-1) + # (bsz, num_choices) + return multiple_choice_logits + + +class OpenAIGPTPreTrainedModel(nn.Module): + """ An abstract class to handle weights initialization and + a simple interface for dowloading and loading pretrained models. + """ + + def __init__(self, config, *inputs, **kwargs): + super(OpenAIGPTPreTrainedModel, self).__init__() + if not isinstance(config, OpenAIGPTConfig): + raise ValueError( + "Parameter config in `{}(config)` should be an instance of class `OpenAIGPTConfig`. " + "To create a model from a pretrained model use " + "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( + self.__class__.__name__, self.__class__.__name__ + ) + ) + self.config = config + + def init_weights(self, module): + """ Initialize the weights. + """ + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + elif isinstance(module, LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, num_special_tokens=None, *inputs, **kwargs): + """ + Instantiate a OpenAIGPTPreTrainedModel from a pre-trained model file or a pytorch state dict. + Download and cache the pre-trained model file if needed. + + Params: + pretrained_model_name_or_path: either: + - a str with the name of a pre-trained model to load selected in the list of: + . `openai-gpt` + - a path or url to a pretrained model archive containing: + . `openai_gpt_config.json` a configuration file for the model + . `pytorch_model.bin` a PyTorch dump of a OpenAIGPTModel instance + - a path or url to a pretrained model archive containing: + . `openai-gpt-config.json` a configuration file for the model + . a series of NumPy files containing OpenAI TensorFlow trained weights + from_tf: should we load the weights from a locally saved TensorFlow checkpoint + cache_dir: an optional path to a folder in which the pre-trained models will be cached. 
+ state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models + *inputs, **kwargs: additional inputs for the specific OpenAI-GPT class + """ + state_dict = kwargs.get('state_dict', None) + kwargs.pop('state_dict', None) + cache_dir = kwargs.get('cache_dir', None) + kwargs.pop('cache_dir', None) + from_tf = kwargs.get('from_tf', False) + kwargs.pop('from_tf', None) + + if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP: + archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path] + config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path] + else: + archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) + config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME) + # redirect to the cache, if necessary + try: + resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir) + resolved_config_file = cached_path(config_file, cache_dir=cache_dir) + except EnvironmentError: + logger.error( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url but couldn't find files {} and {} " + "at this path or url.".format( + pretrained_model_name_or_path, ", ".join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), + pretrained_model_name_or_path, + archive_file, config_file + ) + ) + return None + if resolved_archive_file == archive_file and resolved_config_file == config_file: + logger.info("loading weights file {}".format(archive_file)) + logger.info("loading configuration file {}".format(config_file)) + else: + logger.info("loading weights file {} from cache at {}".format( + archive_file, resolved_archive_file)) + logger.info("loading configuration file {} from cache at {}".format( + config_file, resolved_config_file)) + # Load config + config = OpenAIGPTConfig.from_json_file(resolved_config_file) + logger.info("Model config {}".format(config)) + # Instantiate model.
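+ # For illustration, a typical call that ends up here might be (the 'openai-gpt' shortcut comes from + # PRETRAINED_MODEL_ARCHIVE_MAP above; the special-token count is just an example value): + # model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt', num_special_tokens=5)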
+ model = cls(config, *inputs, **kwargs) + if state_dict is None and not from_tf: + state_dict = torch.load(resolved_archive_file, map_location='cpu') + if from_tf: + # Directly load from a TensorFlow checkpoint (stored as NumPy array) + return load_tf_weights_in_openai_gpt(model, resolved_archive_file) + + old_keys = [] + new_keys = [] + for key in state_dict.keys(): + new_key = None + if key.endswith(".g"): + new_key = key[:-2] + ".weight" + elif key.endswith(".b"): + new_key = key[:-2] + ".bias" + elif key.endswith(".w"): + new_key = key[:-2] + ".weight" + if new_key: + old_keys.append(key) + new_keys.append(new_key) + for old_key, new_key in zip(old_keys, new_keys): + state_dict[new_key] = state_dict.pop(old_key) + + missing_keys = [] + unexpected_keys = [] + error_msgs = [] + # copy state_dict so _load_from_state_dict can modify it + metadata = getattr(state_dict, "_metadata", None) + state_dict = state_dict.copy() + if metadata is not None: + state_dict._metadata = metadata + + def load(module, prefix=""): + local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) + module._load_from_state_dict( + state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs + ) + for name, child in module._modules.items(): + if child is not None: + load(child, prefix + name + ".") + + start_model = model + if hasattr(model, "transformer") and all(not s.startswith('transformer.') for s in state_dict.keys()): + start_model = model.transformer + load(start_model, prefix="") + + if len(missing_keys) > 0: + logger.info( + "Weights of {} not initialized from pretrained model: {}".format(model.__class__.__name__, missing_keys) + ) + if len(unexpected_keys) > 0: + logger.info( + "Weights from pretrained model not used in {}: {}".format(model.__class__.__name__, unexpected_keys) + ) + if len(error_msgs) > 0: + raise RuntimeError( + "Error(s) in loading state_dict for {}:\n\t{}".format(model.__class__.__name__, "\n\t".join(error_msgs)) + ) + + # Add additional embeddings for special tokens if needed + # This step also makes sure we are still sharing the output and input embeddings after loading weights + model.set_num_special_tokens(num_special_tokens if num_special_tokens is not None else config.n_special) + return model + + + class OpenAIGPTModel(OpenAIGPTPreTrainedModel): + """OpenAI GPT model ("Improving Language Understanding by Generative Pre-Training"). + + OpenAI GPT uses a single embedding matrix to store the word and special embeddings. + Special token embeddings are embeddings for additional tokens that are not pre-trained: [SEP], [CLS]... + Special tokens need to be trained during the fine-tuning if you use them. + The number of special embeddings can be controlled using the `set_num_special_tokens(num_special_tokens)` function. + + The embeddings are ordered as follows in the token embedding matrix: + [0, ---------------------- + ... -> word embeddings + config.vocab_size - 1, ______________________ + config.vocab_size, + ... -> special embeddings + config.vocab_size + config.n_special - 1] ______________________ + + where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is: + total_tokens_embeddings = config.vocab_size + config.n_special + You should use the associated indices to index the embeddings.
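+ For example, with the default vocab_size of 40478 and a hypothetical n_special = 2, the word + embeddings occupy indices 0 to 40477, the two special tokens occupy indices 40478 and 40479, and + total_tokens_embeddings is 40480.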
+ + Params: + config: a OpenAIGPTConfig class instance with the configuration to build a new model + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length] + were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, total_tokens_embeddings[ + `position_ids`: an optional torch.LongTensor with the same shape as input_ids + with the position indices (selected in the range [0, config.n_positions - 1[. + `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids + You can use it to add a third type of embedding to each input token in the sequence + (the previous two being the word and position embeddings). + The input, position and token_type embeddings are summed inside the Transformer before the first + self-attention block. + + Outputs: + `hidden_states`: the encoded-hidden-states at the top of the model + as a torch.FloatTensor of size [batch_size, sequence_length, hidden_size] + (or more generally [d_1, ..., d_n, hidden_size] were d_1 ... d_n are the dimension of input_ids) + + Example usage: + ```python + # Already been converted into BPE token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + + config = modeling_openai.OpenAIGPTConfig() + + model = modeling_openai.OpenAIGPTModel(config) + hidden_states = model(input_ids) + ``` + """ + + def __init__(self, config, output_attentions=False): + super(OpenAIGPTModel, self).__init__(config) + self.output_attentions = output_attentions + self.tokens_embed = nn.Embedding(config.total_tokens_embeddings, config.n_embd) + self.positions_embed = nn.Embedding(config.n_positions, config.n_embd) + self.drop = nn.Dropout(config.embd_pdrop) + block = Block(config.n_ctx, config, scale=True, output_attentions=output_attentions) + self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(config.n_layer)]) + + self.apply(self.init_weights) + + def set_num_special_tokens(self, num_special_tokens): + " Update input embeddings with new embedding matrice if needed " + if self.config.n_special == num_special_tokens: + return + # Update config + self.config.n_special = num_special_tokens + # Build new embeddings and initialize all new embeddings (in particular the special tokens) + old_embed = self.tokens_embed + self.tokens_embed = nn.Embedding(self.config.total_tokens_embeddings, self.config.n_embd) + self.tokens_embed.to(old_embed.weight.device) + self.init_weights(self.tokens_embed) + # Copy word embeddings from the previous weights + self.tokens_embed.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :] + + def forward(self, input_ids, position_ids=None, token_type_ids=None, token_emotion_ids=None, token_action_ids=None): + if position_ids is None: + # This was used when we had a single embedding matrice from position and token embeddings + # start = self.config.vocab_size + self.config.n_special + # end = start + input_ids.size(-1) + # position_ids = torch.arange(start, end, dtype=torch.long, device=input_ids.device) + position_ids = torch.arange(input_ids.size(-1), dtype=torch.long, device=input_ids.device) + position_ids = position_ids.unsqueeze(0).expand_as(input_ids) + + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_ids.size(-1)) + position_ids = position_ids.view(-1, position_ids.size(-1)) + + inputs_embeds = self.tokens_embed(input_ids) + position_embeds = self.positions_embed(position_ids) + if token_type_ids is not None: + 
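# token_type_ids, token_emotion_ids and token_action_ids are all looked up in the same tokens_embed + # matrix as the word ids and simply summed into the hidden states below +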
token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) + token_type_embeds = self.tokens_embed(token_type_ids) + else: + token_type_embeds = 0 + + if token_emotion_ids is not None: + token_emotion_ids = token_emotion_ids.view(-1, token_emotion_ids.size(-1)) + token_emotion_embeds = self.tokens_embed(token_emotion_ids) + else: + token_emotion_embeds = 0 + + if token_action_ids is not None: + token_action_ids = token_action_ids.view(-1, token_action_ids.size(-1)) + token_action_embeds = self.tokens_embed(token_action_ids) + else: + token_action_embeds = 0 + + hidden_states = inputs_embeds + position_embeds + token_type_embeds + token_emotion_embeds + token_action_embeds + hidden_states = self.drop(hidden_states) + + all_attentions = [] + for block in self.h: + if self.output_attentions: + attentions, hidden_states = block(hidden_states) + all_attentions.append(attentions) + else: + hidden_states = block(hidden_states) + output_shape = input_shape + (hidden_states.size(-1),) + if self.output_attentions: + return all_attentions, hidden_states.view(*output_shape) + return hidden_states.view(*output_shape) + + +class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): + """OpenAI GPT model with a Language Modeling head ("Improving Language Understanding by Generative Pre-Training"). + + OpenAI GPT use a single embedding matrix to store the word and special embeddings. + Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]... + Special tokens need to be trained during the fine-tuning if you use them. + The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function. + + The embeddings are ordered as follow in the token embeddings matrice: + [0, ---------------------- + ... -> word embeddings + config.vocab_size - 1, ______________________ + config.vocab_size, + ... -> special embeddings + config.vocab_size + config.n_special - 1] ______________________ + + where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is: + total_tokens_embeddings = config.vocab_size + config.n_special + You should use the associate indices to index the embeddings. + + Params: + config: a OpenAIGPTConfig class instance with the configuration to build a new model + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length] + were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, total_tokens_embeddings[ + `position_ids`: an optional torch.LongTensor with the same shape as input_ids + with the position indices (selected in the range [0, config.n_positions - 1[. + `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids + You can use it to add a third type of embedding to each input token in the sequence + (the previous two being the word and position embeddings). + The input, position and token_type embeddings are summed inside the Transformer before the first + self-attention block. + `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] + with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss + is only computed for the labels set in [0, ..., vocab_size] + + Outputs: + if `lm_labels` is not `None`: + Outputs the language modeling loss. 
+ else: + `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, sequence_length, total_tokens_embeddings] + (or more generally [d_1, ..., d_n, total_tokens_embeddings] were d_1 ... d_n are the dimension of input_ids) + + Example usage: + ```python + # Already been converted into BPE token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + + config = modeling_openai.OpenAIGPTConfig() + + model = modeling_openai.OpenAIGPTLMHeadModel(config) + lm_logits = model(input_ids) + ``` + """ + + def __init__(self, config, output_attentions=False): + super(OpenAIGPTLMHeadModel, self).__init__(config) + self.transformer = OpenAIGPTModel(config, output_attentions=output_attentions) + self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config) + self.apply(self.init_weights) + + def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True): + """ Update input and output embeddings with new embedding matrice + Make sure we are sharing the embeddings + """ + self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens + self.transformer.set_num_special_tokens(num_special_tokens) + self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight, + predict_special_tokens=predict_special_tokens) + + def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None): + hidden_states = self.transformer(input_ids, position_ids, token_type_ids) + if self.transformer.output_attentions: + all_attentions, hidden_states = hidden_states + lm_logits = self.lm_head(hidden_states) + if lm_labels is not None: + # Shift so that tokens < n predict n + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = lm_labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss(ignore_index=-1) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), + shift_labels.view(-1)) + return loss + if self.transformer.output_attentions: + return all_attentions, lm_logits + return lm_logits + + +class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): + """OpenAI GPT model with a Language Modeling and a Multiple Choice head ("Improving Language Understanding by Generative Pre-Training"). + + OpenAI GPT use a single embedding matrix to store the word and special embeddings. + Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]... + Special tokens need to be trained during the fine-tuning if you use them. + The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function. + + The embeddings are ordered as follow in the token embeddings matrice: + [0, ---------------------- + ... -> word embeddings + config.vocab_size - 1, ______________________ + config.vocab_size, + ... -> special embeddings + config.vocab_size + config.n_special - 1] ______________________ + + where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is: + total_tokens_embeddings = config.vocab_size + config.n_special + You should use the associate indices to index the embeddings. 
+ + Params: + config: a OpenAIGPTConfig class instance with the configuration to build a new model + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] with the BPE token + indices selected in the range [0, total_tokens_embeddings[ + `mc_token_ids`: a torch.LongTensor of shape [batch_size, num_choices] with the index of the token from + which we should take the hidden state to feed the multiple choice classifier (usually last token of the sequence) + `position_ids`: an optional torch.LongTensor with the same shape as input_ids + with the position indices (selected in the range [0, config.n_positions - 1[. + `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids + You can use it to add a third type of embedding to each input token in the sequence + (the previous two being the word and position embeddings). + The input, position and token_type embeddings are summed inside the Transformer before the first + self-attention block. + `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, num_choices, sequence_length] + with indices selected in [-1, 0, ..., total_tokens_embeddings]. All labels set to -1 are ignored (masked), the loss + is only computed for the labels set in [0, ..., total_tokens_embeddings] + `multiple_choice_labels`: optional multiple choice labels: torch.LongTensor of shape [batch_size] + with indices selected in [0, ..., num_choices]. + + Outputs: + if `lm_labels` and `multiple_choice_labels` are not `None`: + Outputs a tuple of losses with the language modeling loss and the multiple choice loss. + else: a tuple with + `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, num_choices, sequence_length, total_tokens_embeddings] + `multiple_choice_logits`: the multiple choice logits as a torch.FloatTensor of size [batch_size, num_choices] + + Example usage: + ```python + # Already been converted into BPE token ids + input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]]]) # (bsz, number of choice, seq length) + mc_token_ids = torch.LongTensor([[2], [1]]) # (bsz, number of choice) + + config = modeling_openai.OpenAIGPTConfig() + + model = modeling_openai.OpenAIGPTDoubleHeadsModel(config) + lm_logits, multiple_choice_logits = model(input_ids, mc_token_ids) + ``` + """ + + def __init__(self, config, output_attentions=False): + super(OpenAIGPTDoubleHeadsModel, self).__init__(config) + self.transformer = OpenAIGPTModel(config, output_attentions=output_attentions) + self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config) + self.multiple_choice_head = OpenAIGPTMultipleChoiceHead(config) + self.apply(self.init_weights) + + def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True): + """ Update input and output embeddings with new embedding matrice + Make sure we are sharing the embeddings + """ + self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens + self.transformer.set_num_special_tokens(num_special_tokens) + self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight, + predict_special_tokens=predict_special_tokens) + + def forward(self, input_ids, mc_token_ids, lm_labels=None, mc_labels=None, + token_type_ids=None, token_emotion_ids=None, token_action_ids=None, position_ids=None): + hidden_states = self.transformer(input_ids, position_ids, token_type_ids, token_emotion_ids, token_action_ids) + if self.transformer.output_attentions: + 
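# with output_attentions=True the transformer returns an (all_attentions, hidden_states) pair instead of the hidden states alone +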
all_attentions, hidden_states = hidden_states + lm_logits = self.lm_head(hidden_states) + mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids) + losses = [] + if lm_labels is not None: # when lm_labels is all -1 it means it's not the correct candidate which in turn means it's a negative example and we ignore it because ignore_index=-1 + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = lm_labels[..., 1:].contiguous() + loss_fct = CrossEntropyLoss(ignore_index=-1) + losses.append(loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))) + if mc_labels is not None: + loss_fct = CrossEntropyLoss() + losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))) + if losses: + return losses + if self.transformer.output_attentions: + return all_attentions, lm_logits, mc_logits + return lm_logits, mc_logits + + +############################################################################### + + +class OpenAIGPTEmotionChoiceHead(nn.Module): + """ Classifier Head for the transformer """ + + def __init__(self, config): + super(OpenAIGPTEmotionChoiceHead, self).__init__() + self.n_embd = config.n_embd + self.dropout = nn.Dropout2d(config.resid_pdrop) # To reproduce the noise_shape parameter of TF implementation + num_emotions = 7 + self.linear = nn.Linear(config.n_embd, num_emotions) + + nn.init.normal_(self.linear.weight, std=0.02) + nn.init.normal_(self.linear.bias, 0) + + def forward(self, hidden_states, mc_token_ids): + # Classification logits + # hidden_state (bsz, seq_length, hidden_size) + # mc_token_ids (bsz,) + mc_token_ids = mc_token_ids.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, hidden_states.size(-1)) + # mc_token_ids (bsz, 1, hidden_size) + multiple_choice_h = hidden_states.gather(1, mc_token_ids).squeeze(1) + # multiple_choice_h (bsz, hidden_size) + multiple_choice_h = self.dropout(multiple_choice_h) + multiple_choice_logits = self.linear(multiple_choice_h) + # (bsz, num_choices) + return multiple_choice_logits + + +class OpenAIGPTBatchedEmotionChoiceHead(nn.Module): + + def __init__(self, config): + super(OpenAIGPTBatchedEmotionChoiceHead, self).__init__() + self.n_embd = config.n_embd + self.dropout = nn.Dropout2d(config.resid_pdrop) # To reproduce the noise_shape parameter of TF implementation + num_emotions = 7 + self.linear = nn.Linear(config.n_embd, num_emotions) + + nn.init.normal_(self.linear.weight, std=0.02) + nn.init.normal_(self.linear.bias, 0) + + def forward(self, hidden_states, mc_token_ids): + # Classification logits + # hidden_state (bsz, num_choices, seq_length, hidden_size) + # mc_token_ids (bsz, num_choices) + mc_token_ids = mc_token_ids.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, -1, hidden_states.size(-1)) + # mc_token_ids (bsz, num_choices, 1, hidden_size) + multiple_choice_h = hidden_states.gather(2, mc_token_ids).squeeze(2) + # multiple_choice_h (bsz, num_choices, hidden_size) + multiple_choice_h = self.dropout(multiple_choice_h.transpose(1, 2)).transpose(1, 2) + multiple_choice_logits = self.linear(multiple_choice_h).squeeze(-1) + # (bsz, num_choices) + return multiple_choice_logits + + +class OpenAIGPTEmotionHead(nn.Module): + """ Classifier Head for the transformer """ + + def __init__(self, config): + super(OpenAIGPTEmotionHead, self).__init__() + self.n_embd = config.n_embd + self.dropout = nn.Dropout2d(config.resid_pdrop) # To reproduce the noise_shape parameter of TF implementation + num_classes = 2 # this probably need to be 1 + self.linear = nn.Linear(config.n_embd, num_classes) + 
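+ # two-way classification over the selected token's hidden state; OpenAIGPTForEmotionDetection below + # trains it with a standard CrossEntropyLoss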
+ nn.init.normal_(self.linear.weight, std=0.02) + nn.init.normal_(self.linear.bias, 0) + + def forward(self, hidden_states, mc_token_ids): + # Classification logits + # hidden_state (bsz, seq_length, hidden_size) + # mc_token_ids (bsz,) + mc_token_ids = mc_token_ids.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, hidden_states.size(-1)) + # mc_token_ids (bsz, 1, hidden_size) + multiple_choice_h = hidden_states.gather(1, mc_token_ids).squeeze(1) + # multiple_choice_h (bsz, hidden_size) + multiple_choice_h = self.dropout(multiple_choice_h) + multiple_choice_logits = self.linear(multiple_choice_h) + # (bsz, num_choices) + return multiple_choice_logits + + +class OpenAIGPTDoubleHeadLMEmotionRecognitionModel(OpenAIGPTPreTrainedModel): + def __init__(self, config, output_attentions=False): + super(OpenAIGPTDoubleHeadLMEmotionRecognitionModel, self).__init__(config) + self.transformer = OpenAIGPTModel(config, output_attentions=output_attentions) + self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config) + self.emotion_choice_head = OpenAIGPTEmotionChoiceHead(config) + self.apply(self.init_weights) + + def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True): + """ Update input and output embeddings with new embedding matrice + Make sure we are sharing the embeddings + """ + self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens + self.transformer.set_num_special_tokens(num_special_tokens) + self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight, + predict_special_tokens=predict_special_tokens) + + def forward(self, input_ids, mc_token_ids, lm_labels=None, mc_labels=None, token_type_ids=None, + token_emotion_ids=None, position_ids=None): + hidden_states = self.transformer(input_ids, position_ids, token_type_ids, token_emotion_ids) + if self.transformer.output_attentions: + all_attentions, hidden_states = hidden_states + lm_logits = self.lm_head(hidden_states) + mc_logits = self.emotion_choice_head(hidden_states, mc_token_ids) + losses = [] + if lm_labels is not None: # when lm_labels is all -1 it means it's not the correct candidate which in turn means it's a negative example and we ignore it because ignore_index=-1 + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = lm_labels[..., 1:].contiguous() + loss_fct = CrossEntropyLoss(ignore_index=-1) + losses.append(loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))) + if mc_labels is not None: + loss_fct = CrossEntropyLoss(ignore_index=-1) + # loss_fct = CrossEntropyLoss() + losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))) + if losses: + return losses + if self.transformer.output_attentions: + return all_attentions, lm_logits, mc_logits + return lm_logits, mc_logits + + +class OpenAIGPTForEmotionDetection(OpenAIGPTPreTrainedModel): + def __init__(self, config, output_attentions=False): + super(OpenAIGPTForEmotionDetection, self).__init__(config) + self.transformer = OpenAIGPTModel(config, output_attentions=output_attentions) + self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config) + self.emotion_classification_head = OpenAIGPTEmotionHead(config) + self.apply(self.init_weights) + + def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True): + """ Update input and output embeddings with new embedding matrice + Make sure we are sharing the embeddings + """ + self.config.predict_special_tokens = 
self.transformer.config.predict_special_tokens = predict_special_tokens + self.transformer.set_num_special_tokens(num_special_tokens) + self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight, + predict_special_tokens=predict_special_tokens) + + def forward(self, input_ids, mc_token_ids, lm_labels=None, + mc_labels=None, token_type_ids=None, position_ids=None, token_emotion_ids=None): + hidden_states = self.transformer(input_ids, position_ids, token_type_ids, token_emotion_ids=token_emotion_ids) + if self.transformer.output_attentions: + all_attentions, hidden_states = hidden_states + lm_logits = self.lm_head(hidden_states) + mc_logits = self.emotion_classification_head(hidden_states, mc_token_ids) + losses = [] + if lm_labels is not None: # when lm_labels is all -1 it means it's not the correct candidate which in turn means it's a negative example and we ignore it because ignore_index=-1 + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = lm_labels[..., 1:].contiguous() + loss_fct = CrossEntropyLoss(ignore_index=-1) + losses.append(loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))) + if mc_labels is not None: + # loss_fct = CrossEntropyLoss(ignore_index=-1) + loss_fct = CrossEntropyLoss() + losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))) + if losses: + return losses + if self.transformer.output_attentions: + return all_attentions, lm_logits, mc_logits + return lm_logits, mc_logits + + +class OpenAIGPTMultiHeadModel(OpenAIGPTPreTrainedModel): + def __init__(self, config, output_attentions=False): + super(OpenAIGPTMultiHeadModel, self).__init__(config) + self.transformer = OpenAIGPTModel(config, output_attentions=output_attentions) + self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config) + self.emotion_choice_head = OpenAIGPTBatchedEmotionChoiceHead(config) + self.sentence_choice_head = OpenAIGPTMultipleChoiceHead(config) + self.apply(self.init_weights) + + def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True): + """ Update input and output embeddings with new embedding matrice + Make sure we are sharing the embeddings + """ + self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens + self.transformer.set_num_special_tokens(num_special_tokens) + self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight, + predict_special_tokens=predict_special_tokens) + + def forward(self, input_ids, ec_token_ids, sc_token_ids, lm_labels=None, + ec_labels=None, sc_labels=None, token_type_ids=None, + token_emotion_ids=None, token_action_ids=None, + position_ids=None): + + hidden_states = self.transformer(input_ids, position_ids, token_type_ids, token_emotion_ids) + if self.transformer.output_attentions: + all_attentions, hidden_states = hidden_states + lm_logits = self.lm_head(hidden_states) + emotion_logits = self.emotion_choice_head(hidden_states, ec_token_ids) + sentence_logits = self.sentence_choice_head(hidden_states, sc_token_ids) + losses = [] + if lm_labels is not None: # when lm_labels is all -1 it means it's not the correct candidate which in turn means it's a negative example and we ignore it because ignore_index=-1 + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = lm_labels[..., 1:].contiguous() + loss_fct = CrossEntropyLoss(ignore_index=-1) + losses.append(loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))) + if ec_labels is not None: + 
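# emotion classification loss over the num_emotions classes of the batched emotion head; ec_labels entries set to -1 are skipped via ignore_index=-1 +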
loss_fct = CrossEntropyLoss(ignore_index=-1) + losses.append(loss_fct(emotion_logits.view(-1, emotion_logits.size(-1)), ec_labels.view(-1))) + if sc_labels is not None: + loss_fct = CrossEntropyLoss(ignore_index=-1) + losses.append(loss_fct(sentence_logits.view(-1, sentence_logits.size(-1)), sc_labels.view(-1))) + if losses: + return losses + if self.transformer.output_attentions: + return all_attentions, lm_logits, emotion_logits, sentence_logits + return lm_logits, emotion_logits, sentence_logits diff --git a/pytorch_pretrained_bert/modeling_transfo_xl.py b/pytorch_pretrained_bert/modeling_transfo_xl.py new file mode 100644 index 0000000..e70a29a --- /dev/null +++ b/pytorch_pretrained_bert/modeling_transfo_xl.py @@ -0,0 +1,1392 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Transformer XL model. + Adapted from https://github.com/kimiyoung/transformer-xl. + In particular https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/mem_transformer.py +""" + +from __future__ import absolute_import, division, print_function, unicode_literals + +import os +import copy +import json +import math +import logging +import tarfile +import tempfile +import shutil +import collections +import sys +from io import open + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import CrossEntropyLoss +from torch.nn.parameter import Parameter + +from .modeling import BertLayerNorm as LayerNorm +from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax, sample_logits +from .file_utils import cached_path, CONFIG_NAME, WEIGHTS_NAME + +logger = logging.getLogger(__name__) + +PRETRAINED_MODEL_ARCHIVE_MAP = { + 'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-pytorch_model.bin", +} +PRETRAINED_CONFIG_ARCHIVE_MAP = { + 'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-config.json", +} + +TF_WEIGHTS_NAME = 'model.ckpt' + +def build_tf_to_pytorch_map(model, config): + """ A map of modules from TF to PyTorch. + This time I use a map to keep the PyTorch model as identical to the original PyTorch model as possible. 
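+ For example, the loop over the Transformer blocks below maps the TF variable + "transformer/layer_0/rel_attn/qkv/kernel" to the first block's dec_attn.qkv_net.weight parameter.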
+ """ + tf_to_pt_map = {} + + if hasattr(model, 'transformer'): + # We are loading in a TransfoXLLMHeadModel => we will load also the Adaptive Softmax + tf_to_pt_map.update({ + "transformer/adaptive_softmax/cutoff_0/cluster_W": model.crit.cluster_weight, + "transformer/adaptive_softmax/cutoff_0/cluster_b": model.crit.cluster_bias}) + for i, (out_l, proj_l, tie_proj) in enumerate(zip( + model.crit.out_layers, + model.crit.out_projs, + config.tie_projs)): + layer_str = "transformer/adaptive_softmax/cutoff_%d/" % i + if config.tie_weight: + tf_to_pt_map.update({ + layer_str + 'b': out_l.bias}) + else: + raise NotImplementedError + # I don't think this is implemented in the TF code + tf_to_pt_map.update({ + layer_str + 'lookup_table': out_l.weight, + layer_str + 'b': out_l.bias}) + if not tie_proj: + tf_to_pt_map.update({ + layer_str + 'proj': proj_l + }) + # Now load the rest of the transformer + model = model.transformer + + # Embeddings + for i, (embed_l, proj_l) in enumerate(zip(model.word_emb.emb_layers, model.word_emb.emb_projs)): + layer_str = "transformer/adaptive_embed/cutoff_%d/" % i + tf_to_pt_map.update({ + layer_str + 'lookup_table': embed_l.weight, + layer_str + 'proj_W': proj_l + }) + + # Transformer blocks + for i, b in enumerate(model.layers): + layer_str = "transformer/layer_%d/" % i + tf_to_pt_map.update({ + layer_str + "rel_attn/LayerNorm/gamma": b.dec_attn.layer_norm.weight, + layer_str + "rel_attn/LayerNorm/beta": b.dec_attn.layer_norm.bias, + layer_str + "rel_attn/o/kernel": b.dec_attn.o_net.weight, + layer_str + "rel_attn/qkv/kernel": b.dec_attn.qkv_net.weight, + layer_str + "rel_attn/r/kernel": b.dec_attn.r_net.weight, + layer_str + "ff/LayerNorm/gamma": b.pos_ff.layer_norm.weight, + layer_str + "ff/LayerNorm/beta": b.pos_ff.layer_norm.bias, + layer_str + "ff/layer_1/kernel": b.pos_ff.CoreNet[0].weight, + layer_str + "ff/layer_1/bias": b.pos_ff.CoreNet[0].bias, + layer_str + "ff/layer_2/kernel": b.pos_ff.CoreNet[3].weight, + layer_str + "ff/layer_2/bias": b.pos_ff.CoreNet[3].bias, + }) + + # Relative positioning biases + if config.untie_r: + r_r_list = [] + r_w_list = [] + for b in model.layers: + r_r_list.append(b.dec_attn.r_r_bias) + r_w_list.append(b.dec_attn.r_w_bias) + else: + r_r_list = [model.r_r_bias] + r_w_list = [model.r_w_bias] + tf_to_pt_map.update({ + 'transformer/r_r_bias': r_r_list, + 'transformer/r_w_bias': r_w_list}) + return tf_to_pt_map + +def load_tf_weights_in_transfo_xl(model, config, tf_path): + """ Load tf checkpoints in a pytorch model + """ + try: + import numpy as np + import tensorflow as tf + except ImportError: + print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. 
Please see " + "https://www.tensorflow.org/install/ for installation instructions.") + raise + # Build TF to PyTorch weights loading map + tf_to_pt_map = build_tf_to_pytorch_map(model, config) + + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + tf_weights = {} + for name, shape in init_vars: + print("Loading TF weight {} with shape {}".format(name, shape)) + array = tf.train.load_variable(tf_path, name) + tf_weights[name] = array + + for name, pointer in tf_to_pt_map.items(): + assert name in tf_weights + array = tf_weights[name] + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if 'kernel' in name or 'proj' in name: + array = np.transpose(array) + if ('r_r_bias' in name or 'r_w_bias' in name) and len(pointer) > 1: + # Here we will split the TF weigths + assert len(pointer) == array.shape[0] + for i, p_i in enumerate(pointer): + arr_i = array[i, ...] + try: + assert p_i.shape == arr_i.shape + except AssertionError as e: + e.args += (p_i.shape, arr_i.shape) + raise + print("Initialize PyTorch weight {} for layer {}".format(name, i)) + p_i.data = torch.from_numpy(arr_i) + else: + try: + assert pointer.shape == array.shape + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + print("Initialize PyTorch weight {}".format(name)) + pointer.data = torch.from_numpy(array) + tf_weights.pop(name, None) + tf_weights.pop(name + '/Adam', None) + tf_weights.pop(name + '/Adam_1', None) + + print("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys()))) + return model + + +class TransfoXLConfig(object): + """Configuration class to store the configuration of a `TransfoXLModel`. + """ + def __init__(self, + vocab_size_or_config_json_file=267735, + cutoffs=[20000, 40000, 200000], + d_model=1024, + d_embed=1024, + n_head=16, + d_head=64, + d_inner=4096, + div_val=4, + pre_lnorm=False, + n_layer=18, + tgt_len=128, + ext_len=0, + mem_len=1600, + clamp_len=1000, + same_length=True, + proj_share_all_but_first=True, + attn_type=0, + sample_softmax=-1, + adaptive=True, + tie_weight=True, + dropout=0.1, + dropatt=0.0, + untie_r=True, + init="normal", + init_range=0.01, + proj_init_std=0.01, + init_std=0.02): + """Constructs TransfoXLConfig. + + Args: + vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file. + cutoffs: cutoffs for the adaptive softmax + d_model: Dimensionality of the model's hidden states. + d_embed: Dimensionality of the embeddings + d_head: Dimensionality of the model's heads. + div_val: divident value for adapative input and softmax + pre_lnorm: apply LayerNorm to the input instead of the output + d_inner: Inner dimension in FF + n_layer: Number of hidden layers in the Transformer encoder. + n_head: Number of attention heads for each attention layer in + the Transformer encoder. + tgt_len: number of tokens to predict + ext_len: length of the extended context + mem_len: length of the retained previous heads + same_length: use the same attn length for all tokens + proj_share_all_but_first: True to share all but first projs, False not to share. + attn_type: attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al. 
+ clamp_len: use the same pos embeddings after clamp_len + sample_softmax: number of samples in sampled softmax + adaptive: use adaptive softmax + tie_weight: tie the word embedding and softmax weights + dropout: The dropout probabilitiy for all fully connected + layers in the embeddings, encoder, and pooler. + dropatt: The dropout ratio for the attention probabilities. + untie_r: untie relative position biases + embd_pdrop: The dropout ratio for the embeddings. + init: parameter initializer to use + init_range: parameters initialized by U(-init_range, init_range). + proj_init_std: parameters initialized by N(0, init_std) + init_std: parameters initialized by N(0, init_std) + """ + if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 + and isinstance(vocab_size_or_config_json_file, unicode)): + with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif isinstance(vocab_size_or_config_json_file, int): + self.n_token = vocab_size_or_config_json_file + self.cutoffs = [] + self.cutoffs.extend(cutoffs) + self.tie_weight = tie_weight + if proj_share_all_but_first: + self.tie_projs = [False] + [True] * len(self.cutoffs) + else: + self.tie_projs = [False] + [False] * len(self.cutoffs) + self.d_model = d_model + self.d_embed = d_embed + self.d_head = d_head + self.d_inner = d_inner + self.div_val = div_val + self.pre_lnorm = pre_lnorm + self.n_layer = n_layer + self.n_head = n_head + self.tgt_len = tgt_len + self.ext_len = ext_len + self.mem_len = mem_len + self.same_length = same_length + self.attn_type = attn_type + self.clamp_len = clamp_len + self.sample_softmax = sample_softmax + self.adaptive = adaptive + self.dropout = dropout + self.dropatt = dropatt + self.untie_r = untie_r + self.init = init + self.init_range = init_range + self.proj_init_std = proj_init_std + self.init_std = init_std + else: + raise ValueError("First argument must be either a vocabulary size (int)" + "or the path to a pretrained model config file (str)") + + @classmethod + def from_dict(cls, json_object): + """Constructs a `TransfoXLConfig` from a Python dictionary of parameters.""" + config = TransfoXLConfig(vocab_size_or_config_json_file=-1) + for key, value in json_object.items(): + config.__dict__[key] = value + return config + + @classmethod + def from_json_file(cls, json_file): + """Constructs a `TransfoXLConfig` from a json file of parameters.""" + with open(json_file, "r", encoding='utf-8') as reader: + text = reader.read() + return cls.from_dict(json.loads(text)) + + def __repr__(self): + return str(self.to_json_string()) + + def to_dict(self): + """Serializes this instance to a Python dictionary.""" + output = copy.deepcopy(self.__dict__) + return output + + def to_json_string(self): + """Serializes this instance to a JSON string.""" + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" + + def to_json_file(self, json_file_path): + """ Save this instance to a json file.""" + with open(json_file_path, "w", encoding='utf-8') as writer: + writer.write(self.to_json_string()) + + +class PositionalEmbedding(nn.Module): + def __init__(self, demb): + super(PositionalEmbedding, self).__init__() + + self.demb = demb + + inv_freq = 1 / (10000 ** (torch.arange(0.0, demb, 2.0) / demb)) + self.register_buffer('inv_freq', inv_freq) + + def forward(self, pos_seq, bsz=None): + sinusoid_inp = torch.ger(pos_seq, self.inv_freq) + pos_emb = 
torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1) + + if bsz is not None: + return pos_emb[:,None,:].expand(-1, bsz, -1) + else: + return pos_emb[:,None,:] + + +class PositionwiseFF(nn.Module): + def __init__(self, d_model, d_inner, dropout, pre_lnorm=False): + super(PositionwiseFF, self).__init__() + + self.d_model = d_model + self.d_inner = d_inner + self.dropout = dropout + + self.CoreNet = nn.Sequential( + nn.Linear(d_model, d_inner), nn.ReLU(inplace=True), + nn.Dropout(dropout), + nn.Linear(d_inner, d_model), + nn.Dropout(dropout), + ) + + self.layer_norm = LayerNorm(d_model) + + self.pre_lnorm = pre_lnorm + + def forward(self, inp): + if self.pre_lnorm: + ##### layer normalization + positionwise feed-forward + core_out = self.CoreNet(self.layer_norm(inp)) + + ##### residual connection + output = core_out + inp + else: + ##### positionwise feed-forward + core_out = self.CoreNet(inp) + + ##### residual connection + layer normalization + output = self.layer_norm(inp + core_out) + + return output + +class MultiHeadAttn(nn.Module): + def __init__(self, n_head, d_model, d_head, dropout, dropatt=0, + pre_lnorm=False, r_r_bias=None, r_w_bias=None): + super(MultiHeadAttn, self).__init__() + + self.n_head = n_head + self.d_model = d_model + self.d_head = d_head + self.dropout = dropout + + self.q_net = nn.Linear(d_model, n_head * d_head, bias=False) + self.kv_net = nn.Linear(d_model, 2 * n_head * d_head, bias=False) + + self.drop = nn.Dropout(dropout) + self.dropatt = nn.Dropout(dropatt) + self.o_net = nn.Linear(n_head * d_head, d_model, bias=False) + + self.layer_norm = LayerNorm(d_model) + + self.scale = 1 / (d_head ** 0.5) + + self.pre_lnorm = pre_lnorm + + if r_r_bias is None or r_w_bias is None: # Biases are not shared + self.r_r_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head)) + self.r_w_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head)) + else: + self.r_r_bias = r_r_bias + self.r_w_bias = r_w_bias + + def forward(self, h, attn_mask=None, mems=None): + ##### multihead attention + # [hlen x bsz x n_head x d_head] + + if mems is not None: + c = torch.cat([mems, h], 0) + else: + c = h + + if self.pre_lnorm: + ##### layer normalization + c = self.layer_norm(c) + + head_q = self.q_net(h) + head_k, head_v = torch.chunk(self.kv_net(c), 2, -1) + + head_q = head_q.view(h.size(0), h.size(1), self.n_head, self.d_head) + head_k = head_k.view(c.size(0), c.size(1), self.n_head, self.d_head) + head_v = head_v.view(c.size(0), c.size(1), self.n_head, self.d_head) + + # [qlen x klen x bsz x n_head] + attn_score = torch.einsum('ibnd,jbnd->ijbn', (head_q, head_k)) + attn_score.mul_(self.scale) + if attn_mask is not None and attn_mask.any().item(): + if attn_mask.dim() == 2: + attn_score.masked_fill_(attn_mask[None,:,:,None], -float('inf')) + elif attn_mask.dim() == 3: + attn_score.masked_fill_(attn_mask[:,:,:,None], -float('inf')) + + # [qlen x klen x bsz x n_head] + attn_prob = F.softmax(attn_score, dim=1) + attn_prob = self.dropatt(attn_prob) + + # [qlen x klen x bsz x n_head] + [klen x bsz x n_head x d_head] -> [qlen x bsz x n_head x d_head] + attn_vec = torch.einsum('ijbn,jbnd->ibnd', (attn_prob, head_v)) + attn_vec = attn_vec.contiguous().view( + attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head) + + ##### linear projection + attn_out = self.o_net(attn_vec) + attn_out = self.drop(attn_out) + + if self.pre_lnorm: + ##### residual connection + output = h + attn_out + else: + ##### residual connection + layer normalization + output = self.layer_norm(h + attn_out) 
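+        # Note: output keeps the shape of the input h, i.e. [hlen x bsz x d_model];
+        # the memory `mems` only extends the keys/values, not the returned sequence.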
+ + return output + +class RelMultiHeadAttn(nn.Module): + def __init__(self, n_head, d_model, d_head, dropout, dropatt=0, + tgt_len=None, ext_len=None, mem_len=None, pre_lnorm=False, + r_r_bias=None, r_w_bias=None): + super(RelMultiHeadAttn, self).__init__() + + self.n_head = n_head + self.d_model = d_model + self.d_head = d_head + self.dropout = dropout + + self.qkv_net = nn.Linear(d_model, 3 * n_head * d_head, bias=False) + + self.drop = nn.Dropout(dropout) + self.dropatt = nn.Dropout(dropatt) + self.o_net = nn.Linear(n_head * d_head, d_model, bias=False) + + self.layer_norm = LayerNorm(d_model) + + self.scale = 1 / (d_head ** 0.5) + + self.pre_lnorm = pre_lnorm + + if r_r_bias is None or r_w_bias is None: # Biases are not shared + self.r_r_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head)) + self.r_w_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head)) + else: + self.r_r_bias = r_r_bias + self.r_w_bias = r_w_bias + + def _parallelogram_mask(self, h, w, left=False): + mask = torch.ones((h, w)).byte() + m = min(h, w) + mask[:m,:m] = torch.triu(mask[:m,:m]) + mask[-m:,-m:] = torch.tril(mask[-m:,-m:]) + + if left: + return mask + else: + return mask.flip(0) + + def _shift(self, x, qlen, klen, mask, left=False): + if qlen > 1: + zero_pad = torch.zeros((x.size(0), qlen-1, x.size(2), x.size(3)), + device=x.device, dtype=x.dtype) + else: + zero_pad = torch.zeros(0, device=x.device, dtype=x.dtype) + + if left: + mask = mask.flip(1) + x_padded = torch.cat([zero_pad, x], dim=1).expand(qlen, -1, -1, -1) + else: + x_padded = torch.cat([x, zero_pad], dim=1).expand(qlen, -1, -1, -1) + + x = x_padded.masked_select(mask[:,:,None,None]) \ + .view(qlen, klen, x.size(2), x.size(3)) + + return x + + def _rel_shift(self, x, zero_triu=False): + zero_pad_shape = (x.size(0), 1) + x.size()[2:] + zero_pad = torch.zeros(zero_pad_shape, device=x.device, dtype=x.dtype) + x_padded = torch.cat([zero_pad, x], dim=1) + + x_padded_shape = (x.size(1) + 1, x.size(0)) + x.size()[2:] + x_padded = x_padded.view(*x_padded_shape) + + x = x_padded[1:].view_as(x) + + if zero_triu: + ones = torch.ones((x.size(0), x.size(1))) + x = x * torch.tril(ones, x.size(1) - x.size(0))[:,:,None,None] + + return x + + def forward(self, w, r, attn_mask=None, mems=None): + raise NotImplementedError + +class RelPartialLearnableMultiHeadAttn(RelMultiHeadAttn): + def __init__(self, *args, **kwargs): + super(RelPartialLearnableMultiHeadAttn, self).__init__(*args, **kwargs) + + self.r_net = nn.Linear(self.d_model, self.n_head * self.d_head, bias=False) + + def forward(self, w, r, attn_mask=None, mems=None): + qlen, rlen, bsz = w.size(0), r.size(0), w.size(1) + + if mems is not None: + cat = torch.cat([mems, w], 0) + if self.pre_lnorm: + w_heads = self.qkv_net(self.layer_norm(cat)) + else: + w_heads = self.qkv_net(cat) + r_head_k = self.r_net(r) + + w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1) + w_head_q = w_head_q[-qlen:] + else: + if self.pre_lnorm: + w_heads = self.qkv_net(self.layer_norm(w)) + else: + w_heads = self.qkv_net(w) + r_head_k = self.r_net(r) + + w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1) + + klen = w_head_k.size(0) + + w_head_q = w_head_q.view(qlen, bsz, self.n_head, self.d_head) # qlen x bsz x n_head x d_head + w_head_k = w_head_k.view(klen, bsz, self.n_head, self.d_head) # qlen x bsz x n_head x d_head + w_head_v = w_head_v.view(klen, bsz, self.n_head, self.d_head) # qlen x bsz x n_head x d_head + + r_head_k = r_head_k.view(rlen, self.n_head, self.d_head) # qlen x n_head x d_head + 
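+        # The score below follows the Transformer-XL relative-attention decomposition:
+        # AC gathers the content-based terms (query plus r_w_bias matched against the keys),
+        # BD gathers the position-based terms (query plus r_r_bias matched against the
+        # relative position embeddings r); _rel_shift then realigns BD so that entry
+        # (i, j) corresponds to the relative distance between query i and key j.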
+ #### compute attention score + rw_head_q = w_head_q + self.r_w_bias # qlen x bsz x n_head x d_head + AC = torch.einsum('ibnd,jbnd->ijbn', (rw_head_q, w_head_k)) # qlen x klen x bsz x n_head + + rr_head_q = w_head_q + self.r_r_bias + BD = torch.einsum('ibnd,jnd->ijbn', (rr_head_q, r_head_k)) # qlen x klen x bsz x n_head + BD = self._rel_shift(BD) + + # [qlen x klen x bsz x n_head] + attn_score = AC + BD + attn_score.mul_(self.scale) + + #### compute attention probability + if attn_mask is not None and attn_mask.any().item(): + if attn_mask.dim() == 2: + attn_score = attn_score.float().masked_fill( + attn_mask[None,:,:,None], -1e30).type_as(attn_score) + elif attn_mask.dim() == 3: + attn_score = attn_score.float().masked_fill( + attn_mask[:,:,:,None], -1e30).type_as(attn_score) + + # [qlen x klen x bsz x n_head] + attn_prob = F.softmax(attn_score, dim=1) + attn_prob = self.dropatt(attn_prob) + + #### compute attention vector + attn_vec = torch.einsum('ijbn,jbnd->ibnd', (attn_prob, w_head_v)) + + # [qlen x bsz x n_head x d_head] + attn_vec = attn_vec.contiguous().view( + attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head) + + ##### linear projection + attn_out = self.o_net(attn_vec) + attn_out = self.drop(attn_out) + + if self.pre_lnorm: + ##### residual connection + output = w + attn_out + else: + ##### residual connection + layer normalization + output = self.layer_norm(w + attn_out) + + return output + +class RelLearnableMultiHeadAttn(RelMultiHeadAttn): + def __init__(self, *args, **kwargs): + super(RelLearnableMultiHeadAttn, self).__init__(*args, **kwargs) + + def forward(self, w, r_emb, r_w_bias, r_bias, attn_mask=None, mems=None): + # r_emb: [klen, n_head, d_head], used for term B + # r_w_bias: [n_head, d_head], used for term C + # r_bias: [klen, n_head], used for term D + + qlen, bsz = w.size(0), w.size(1) + + if mems is not None: + cat = torch.cat([mems, w], 0) + if self.pre_lnorm: + w_heads = self.qkv_net(self.layer_norm(cat)) + else: + w_heads = self.qkv_net(cat) + w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1) + + w_head_q = w_head_q[-qlen:] + else: + if self.pre_lnorm: + w_heads = self.qkv_net(self.layer_norm(w)) + else: + w_heads = self.qkv_net(w) + w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1) + + klen = w_head_k.size(0) + + w_head_q = w_head_q.view(qlen, bsz, self.n_head, self.d_head) + w_head_k = w_head_k.view(klen, bsz, self.n_head, self.d_head) + w_head_v = w_head_v.view(klen, bsz, self.n_head, self.d_head) + + if klen > r_emb.size(0): + r_emb_pad = r_emb[0:1].expand(klen-r_emb.size(0), -1, -1) + r_emb = torch.cat([r_emb_pad, r_emb], 0) + r_bias_pad = r_bias[0:1].expand(klen-r_bias.size(0), -1) + r_bias = torch.cat([r_bias_pad, r_bias], 0) + else: + r_emb = r_emb[-klen:] + r_bias = r_bias[-klen:] + + #### compute attention score + rw_head_q = w_head_q + r_w_bias[None] # qlen x bsz x n_head x d_head + + AC = torch.einsum('ibnd,jbnd->ijbn', (rw_head_q, w_head_k)) # qlen x klen x bsz x n_head + B_ = torch.einsum('ibnd,jnd->ijbn', (w_head_q, r_emb)) # qlen x klen x bsz x n_head + D_ = r_bias[None, :, None] # 1 x klen x 1 x n_head + BD = self._rel_shift(B_ + D_) + + # [qlen x klen x bsz x n_head] + attn_score = AC + BD + attn_score.mul_(self.scale) + + #### compute attention probability + if attn_mask is not None and attn_mask.any().item(): + if attn_mask.dim() == 2: + attn_score.masked_fill_(attn_mask[None,:,:,None], -float('inf')) + elif attn_mask.dim() == 3: + attn_score.masked_fill_(attn_mask[:,:,:,None], -float('inf')) + + # 
[qlen x klen x bsz x n_head] + attn_prob = F.softmax(attn_score, dim=1) + attn_prob = self.dropatt(attn_prob) + + #### compute attention vector + attn_vec = torch.einsum('ijbn,jbnd->ibnd', (attn_prob, w_head_v)) + + # [qlen x bsz x n_head x d_head] + attn_vec = attn_vec.contiguous().view( + attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head) + + ##### linear projection + attn_out = self.o_net(attn_vec) + attn_out = self.drop(attn_out) + + if self.pre_lnorm: + ##### residual connection + output = w + attn_out + else: + ##### residual connection + layer normalization + output = self.layer_norm(w + attn_out) + + return output + +class DecoderLayer(nn.Module): + def __init__(self, n_head, d_model, d_head, d_inner, dropout, **kwargs): + super(DecoderLayer, self).__init__() + + self.dec_attn = MultiHeadAttn(n_head, d_model, d_head, dropout, **kwargs) + self.pos_ff = PositionwiseFF(d_model, d_inner, dropout, + pre_lnorm=kwargs.get('pre_lnorm')) + + def forward(self, dec_inp, dec_attn_mask=None, mems=None): + + output = self.dec_attn(dec_inp, attn_mask=dec_attn_mask, + mems=mems) + output = self.pos_ff(output) + + return output + +class RelLearnableDecoderLayer(nn.Module): + def __init__(self, n_head, d_model, d_head, d_inner, dropout, + **kwargs): + super(RelLearnableDecoderLayer, self).__init__() + + self.dec_attn = RelLearnableMultiHeadAttn(n_head, d_model, d_head, dropout, + **kwargs) + self.pos_ff = PositionwiseFF(d_model, d_inner, dropout, + pre_lnorm=kwargs.get('pre_lnorm')) + + def forward(self, dec_inp, r_emb, r_w_bias, r_bias, dec_attn_mask=None, mems=None): + + output = self.dec_attn(dec_inp, r_emb, r_w_bias, r_bias, + attn_mask=dec_attn_mask, + mems=mems) + output = self.pos_ff(output) + + return output + +class RelPartialLearnableDecoderLayer(nn.Module): + def __init__(self, n_head, d_model, d_head, d_inner, dropout, + **kwargs): + super(RelPartialLearnableDecoderLayer, self).__init__() + + self.dec_attn = RelPartialLearnableMultiHeadAttn(n_head, d_model, + d_head, dropout, **kwargs) + self.pos_ff = PositionwiseFF(d_model, d_inner, dropout, + pre_lnorm=kwargs.get('pre_lnorm')) + + def forward(self, dec_inp, r, dec_attn_mask=None, mems=None): + + output = self.dec_attn(dec_inp, r, + attn_mask=dec_attn_mask, + mems=mems) + output = self.pos_ff(output) + + return output + + +class AdaptiveEmbedding(nn.Module): + def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, + sample_softmax=False): + super(AdaptiveEmbedding, self).__init__() + + self.n_token = n_token + self.d_embed = d_embed + + self.cutoffs = cutoffs + [n_token] + self.div_val = div_val + self.d_proj = d_proj + + self.emb_scale = d_proj ** 0.5 + + self.cutoff_ends = [0] + self.cutoffs + + self.emb_layers = nn.ModuleList() + self.emb_projs = nn.ParameterList() + if div_val == 1: + self.emb_layers.append( + nn.Embedding(n_token, d_embed, sparse=sample_softmax>0) + ) + if d_proj != d_embed: + self.emb_projs.append(nn.Parameter(torch.Tensor(d_proj, d_embed))) + else: + for i in range(len(self.cutoffs)): + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1] + d_emb_i = d_embed // (div_val ** i) + self.emb_layers.append(nn.Embedding(r_idx-l_idx, d_emb_i)) + self.emb_projs.append(nn.Parameter(torch.Tensor(d_proj, d_emb_i))) + + def forward(self, inp): + if self.div_val == 1: + embed = self.emb_layers[0](inp) + if self.d_proj != self.d_embed: + embed = F.linear(embed, self.emb_projs[0]) + else: + param = next(self.parameters()) + inp_flat = inp.view(-1) + emb_flat = torch.zeros([inp_flat.size(0), 
self.d_proj], + dtype=param.dtype, device=param.device) + for i in range(len(self.cutoffs)): + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] + + mask_i = (inp_flat >= l_idx) & (inp_flat < r_idx) + indices_i = mask_i.nonzero().squeeze() + + if indices_i.numel() == 0: + continue + + inp_i = inp_flat.index_select(0, indices_i) - l_idx + emb_i = self.emb_layers[i](inp_i) + emb_i = F.linear(emb_i, self.emb_projs[i]) + + emb_flat.index_copy_(0, indices_i, emb_i) + + embed_shape = inp.size() + (self.d_proj,) + embed = emb_flat.view(embed_shape) + + embed.mul_(self.emb_scale) + + return embed + + +class TransfoXLPreTrainedModel(nn.Module): + """ An abstract class to handle weights initialization and + a simple interface for dowloading and loading pretrained models. + """ + def __init__(self, config, *inputs, **kwargs): + super(TransfoXLPreTrainedModel, self).__init__() + if not isinstance(config, TransfoXLConfig): + raise ValueError( + "Parameter config in `{}(config)` should be an instance of class `TransfoXLConfig`. " + "To create a model from a pretrained model use " + "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( + self.__class__.__name__, self.__class__.__name__ + )) + self.config = config + + def init_weight(self, weight): + if self.config.init == 'uniform': + nn.init.uniform_(weight, -self.config.init_range, self.config.init_range) + elif self.config.init == 'normal': + nn.init.normal_(weight, 0.0, self.config.init_std) + + def init_bias(self, bias): + nn.init.constant_(bias, 0.0) + + def init_weights(self, m): + """ Initialize the weights. + """ + classname = m.__class__.__name__ + if classname.find('Linear') != -1: + if hasattr(m, 'weight') and m.weight is not None: + self.init_weight(m.weight) + if hasattr(m, 'bias') and m.bias is not None: + self.init_bias(m.bias) + elif classname.find('AdaptiveEmbedding') != -1: + if hasattr(m, 'emb_projs'): + for i in range(len(m.emb_projs)): + if m.emb_projs[i] is not None: + nn.init.normal_(m.emb_projs[i], 0.0, self.config.proj_init_std) + elif classname.find('Embedding') != -1: + if hasattr(m, 'weight'): + self.init_weight(m.weight) + elif classname.find('ProjectedAdaptiveLogSoftmax') != -1: + if hasattr(m, 'cluster_weight') and m.cluster_weight is not None: + self.init_weight(m.cluster_weight) + if hasattr(m, 'cluster_bias') and m.cluster_bias is not None: + self.init_bias(m.cluster_bias) + if hasattr(m, 'out_projs'): + for i in range(len(m.out_projs)): + if m.out_projs[i] is not None: + nn.init.normal_(m.out_projs[i], 0.0, self.config.proj_init_std) + elif classname.find('LayerNorm') != -1: + if hasattr(m, 'weight'): + nn.init.normal_(m.weight, 1.0, self.config.init_std) + if hasattr(m, 'bias') and m.bias is not None: + self.init_bias(m.bias) + elif classname.find('TransformerLM') != -1: + if hasattr(m, 'r_emb'): + self.init_weight(m.r_emb) + if hasattr(m, 'r_w_bias'): + self.init_weight(m.r_w_bias) + if hasattr(m, 'r_r_bias'): + self.init_weight(m.r_r_bias) + if hasattr(m, 'r_bias'): + self.init_bias(m.r_bias) + + def set_num_special_tokens(self, num_special_tokens): + pass + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): + """ + Instantiate a TransfoXLPreTrainedModel from a pre-trained model file or a pytorch state dict. + Download and cache the pre-trained model file if needed. + + Params: + pretrained_model_name_or_path: either: + - a str with the name of a pre-trained model to load selected in the list of: + . 
`transfo-xl-wt103` + - a path or url to a pretrained model archive containing: + . `transfo_xl_config.json` a configuration file for the model + . `pytorch_model.bin` a PyTorch dump of a TransfoXLModel instance + - a path or url to a pretrained model archive containing: + . `transfo_xl_config.json` a configuration file for the model + . `model.chkpt` a TensorFlow checkpoint + from_tf: should we load the weights from a locally saved TensorFlow checkpoint + cache_dir: an optional path to a folder in which the pre-trained models will be cached. + state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of pre-trained models + *inputs, **kwargs: additional input for the specific TransformerXL class + """ + state_dict = kwargs.get('state_dict', None) + kwargs.pop('state_dict', None) + cache_dir = kwargs.get('cache_dir', None) + kwargs.pop('cache_dir', None) + from_tf = kwargs.get('from_tf', False) + kwargs.pop('from_tf', None) + + if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP: + archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path] + config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path] + else: + archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) + config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME) + # redirect to the cache, if necessary + try: + resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir) + resolved_config_file = cached_path(config_file, cache_dir=cache_dir) + except EnvironmentError: + logger.error( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url but couldn't find files {} and {} " + "at this path or url.".format( + pretrained_model_name_or_path, + ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), + pretrained_model_name_or_path, + archive_file, config_file)) + return None + if resolved_archive_file == archive_file and resolved_config_file == config_file: + logger.info("loading weights file {}".format(archive_file)) + logger.info("loading configuration file {}".format(config_file)) + else: + logger.info("loading weights file {} from cache at {}".format( + archive_file, resolved_archive_file)) + logger.info("loading configuration file {} from cache at {}".format( + config_file, resolved_config_file)) + # Load config + config = TransfoXLConfig.from_json_file(resolved_config_file) + logger.info("Model config {}".format(config)) + # Instantiate model. + model = cls(config, *inputs, **kwargs) + if state_dict is None and not from_tf: + state_dict = torch.load(resolved_archive_file, map_location='cpu') + if from_tf: + # Directly load from a TensorFlow checkpoint + return load_tf_weights_in_transfo_xl(model, config, pretrained_model_name_or_path) + + missing_keys = [] + unexpected_keys = [] + error_msgs = [] + # copy state_dict so _load_from_state_dict can modify it + metadata = getattr(state_dict, '_metadata', None) + state_dict = state_dict.copy() + if metadata is not None: + state_dict._metadata = metadata + + def load(module, prefix=''): + local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) + module._load_from_state_dict( + state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs) + for name, child in module._modules.items(): + if child is not None: + load(child, prefix + name + '.') + + start_prefix = '' + if not hasattr(model, 'transformer') and any(s.startswith('transformer.') for s in state_dict.keys()): + start_prefix = 'transformer.' 
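+        # The checkpoint may have been saved from a model that wraps the transformer under a
+        # 'transformer.' attribute (e.g. the LM head model); when loading into a bare
+        # TransfoXLModel, look the weights up under that prefix instead.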
+ load(model, prefix=start_prefix) + + if len(missing_keys) > 0: + logger.info("Weights of {} not initialized from pretrained model: {}".format( + model.__class__.__name__, missing_keys)) + if len(unexpected_keys) > 0: + logger.info("Weights from pretrained model not used in {}: {}".format( + model.__class__.__name__, unexpected_keys)) + if len(error_msgs) > 0: + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + model.__class__.__name__, "\n\t".join(error_msgs))) + # Make sure we are still sharing the input and output embeddings + if hasattr(model, 'tie_weights'): + model.tie_weights() + return model + + +class TransfoXLModel(TransfoXLPreTrainedModel): + """Transformer XL model ("Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context"). + + Transformer XL use a relative positioning (with sinusiodal patterns) and adaptive softmax inputs which means that: + - you don't need to specify positioning embeddings indices + - the tokens in the vocabulary have to be sorted to decreasing frequency. + + Params: + config: a TransfoXLConfig class instance with the configuration to build a new model + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the token indices selected in the range [0, self.config.n_token[ + `mems`: optional memomry of hidden states from previous forward passes + as a list (num layers) of hidden states at the entry of each layer + each hidden states has shape [self.config.mem_len, bsz, self.config.d_model] + Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `target` + Outputs: + A tuple of (last_hidden_state, new_mems) + `last_hidden_state`: the encoded-hidden-states at the top of the model + as a torch.FloatTensor of size [batch_size, sequence_length, self.config.d_model] + `new_mems`: list (num layers) of updated mem states at the entry of each layer + each mem state is a torch.FloatTensor of size [self.config.mem_len, batch_size, self.config.d_model] + Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `target` + + Example usage: + ```python + # Already been converted into BPE token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_ids_next = torch.LongTensor([[53, 21, 1], [64, 23, 100]]) + + config = TransfoXLConfig() + + model = TransfoXLModel(config) + last_hidden_state, new_mems = model(input_ids) + + # Another time on input_ids_next using the memory: + last_hidden_state, new_mems = model(input_ids_next, new_mems) + ``` + """ + def __init__(self, config): + super(TransfoXLModel, self).__init__(config) + self.n_token = config.n_token + + self.d_embed = config.d_embed + self.d_model = config.d_model + self.n_head = config.n_head + self.d_head = config.d_head + + self.word_emb = AdaptiveEmbedding(config.n_token, config.d_embed, config.d_model, config.cutoffs, + div_val=config.div_val) + + self.drop = nn.Dropout(config.dropout) + + self.n_layer = config.n_layer + + self.tgt_len = config.tgt_len + self.mem_len = config.mem_len + self.ext_len = config.ext_len + self.max_klen = config.tgt_len + config.ext_len + config.mem_len + + self.attn_type = config.attn_type + + if not config.untie_r: + self.r_w_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head)) + self.r_r_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head)) + + self.layers = nn.ModuleList() + if config.attn_type == 0: # the default attention + for i in range(config.n_layer): + self.layers.append( + 
RelPartialLearnableDecoderLayer( + config.n_head, config.d_model, config.d_head, config.d_inner, config.dropout, + tgt_len=config.tgt_len, ext_len=config.ext_len, mem_len=config.mem_len, + dropatt=config.dropatt, pre_lnorm=config.pre_lnorm, + r_w_bias=None if config.untie_r else self.r_w_bias, + r_r_bias=None if config.untie_r else self.r_r_bias) + ) + elif config.attn_type == 1: # learnable embeddings + for i in range(config.n_layer): + self.layers.append( + RelLearnableDecoderLayer( + config.n_head, config.d_model, config.d_head, config.d_inner, config.dropout, + tgt_len=config.tgt_len, ext_len=config.ext_len, mem_len=config.mem_len, + dropatt=config.dropatt, pre_lnorm=config.pre_lnorm, + r_w_bias=None if config.untie_r else self.r_w_bias, + r_r_bias=None if config.untie_r else self.r_r_bias) + ) + elif config.attn_type in [2, 3]: # absolute embeddings + for i in range(config.n_layer): + self.layers.append( + DecoderLayer( + config.n_head, config.d_model, config.d_head, config.d_inner, config.dropout, + dropatt=config.dropatt, pre_lnorm=config.pre_lnorm, + r_w_bias=None if config.untie_r else self.r_w_bias, + r_r_bias=None if config.untie_r else self.r_r_bias) + ) + + self.same_length = config.same_length + self.clamp_len = config.clamp_len + + if self.attn_type == 0: # default attention + self.pos_emb = PositionalEmbedding(self.d_model) + elif self.attn_type == 1: # learnable + self.r_emb = nn.Parameter(torch.Tensor( + self.n_layer, self.max_klen, self.n_head, self.d_head)) + self.r_bias = nn.Parameter(torch.Tensor( + self.n_layer, self.max_klen, self.n_head)) + elif self.attn_type == 2: # absolute standard + self.pos_emb = PositionalEmbedding(self.d_model) + elif self.attn_type == 3: # absolute deeper SA + self.r_emb = nn.Parameter(torch.Tensor( + self.n_layer, self.max_klen, self.n_head, self.d_head)) + self.apply(self.init_weights) + + def backward_compatible(self): + self.sample_softmax = -1 + + + def reset_length(self, tgt_len, ext_len, mem_len): + self.tgt_len = tgt_len + self.mem_len = mem_len + self.ext_len = ext_len + + def init_mems(self, data): + if self.mem_len > 0: + mems = [] + param = next(self.parameters()) + for i in range(self.n_layer): + empty = torch.zeros(self.mem_len, data.size(1), self.config.d_model, + dtype=param.dtype, device=param.device) + mems.append(empty) + + return mems + else: + return None + + def _update_mems(self, hids, mems, qlen, mlen): + # does not deal with None + if mems is None: return None + + # mems is not None + assert len(hids) == len(mems), 'len(hids) != len(mems)' + + # There are `mlen + qlen` steps that can be cached into mems + # For the next step, the last `ext_len` of the `qlen` tokens + # will be used as the extended context. Hence, we only cache + # the tokens from `mlen + qlen - self.ext_len - self.mem_len` + # to `mlen + qlen - self.ext_len`. 
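+        # Example (assuming ext_len=0 and mem_len=1600): with 1600 cached steps plus a new
+        # 128-token segment, end_idx = 1728 and beg_idx = 128, so the oldest 128 cached
+        # states are dropped and the most recent 1600 are kept.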
+ with torch.no_grad(): + new_mems = [] + end_idx = mlen + max(0, qlen - 0 - self.ext_len) + beg_idx = max(0, end_idx - self.mem_len) + for i in range(len(hids)): + + cat = torch.cat([mems[i], hids[i]], dim=0) + new_mems.append(cat[beg_idx:end_idx].detach()) + + return new_mems + + def _forward(self, dec_inp, mems=None): + qlen, bsz = dec_inp.size() + + word_emb = self.word_emb(dec_inp) + + mlen = mems[0].size(0) if mems is not None else 0 + klen = mlen + qlen + if self.same_length: + all_ones = word_emb.new_ones(qlen, klen) + mask_len = klen - self.mem_len + if mask_len > 0: + mask_shift_len = qlen - mask_len + else: + mask_shift_len = qlen + dec_attn_mask = (torch.triu(all_ones, 1+mlen) + + torch.tril(all_ones, -mask_shift_len)).byte()[:, :, None] # -1 + else: + dec_attn_mask = torch.triu( + word_emb.new_ones(qlen, klen), diagonal=1+mlen).byte()[:,:,None] + + hids = [] + if self.attn_type == 0: # default + pos_seq = torch.arange(klen-1, -1, -1.0, device=word_emb.device, + dtype=word_emb.dtype) + if self.clamp_len > 0: + pos_seq.clamp_(max=self.clamp_len) + pos_emb = self.pos_emb(pos_seq) + + core_out = self.drop(word_emb) + pos_emb = self.drop(pos_emb) + + for i, layer in enumerate(self.layers): + hids.append(core_out) + mems_i = None if mems is None else mems[i] + core_out = layer(core_out, pos_emb, dec_attn_mask=dec_attn_mask, mems=mems_i) + elif self.attn_type == 1: # learnable + core_out = self.drop(word_emb) + for i, layer in enumerate(self.layers): + hids.append(core_out) + if self.clamp_len > 0: + r_emb = self.r_emb[i][-self.clamp_len :] + r_bias = self.r_bias[i][-self.clamp_len :] + else: + r_emb, r_bias = self.r_emb[i], self.r_bias[i] + + mems_i = None if mems is None else mems[i] + core_out = layer(core_out, r_emb, self.r_w_bias[i], + r_bias, dec_attn_mask=dec_attn_mask, mems=mems_i) + elif self.attn_type == 2: # absolute + pos_seq = torch.arange(klen - 1, -1, -1.0, device=word_emb.device, + dtype=word_emb.dtype) + if self.clamp_len > 0: + pos_seq.clamp_(max=self.clamp_len) + pos_emb = self.pos_emb(pos_seq) + + core_out = self.drop(word_emb + pos_emb[-qlen:]) + + for i, layer in enumerate(self.layers): + hids.append(core_out) + mems_i = None if mems is None else mems[i] + if mems_i is not None and i == 0: + mems_i += pos_emb[:mlen] + core_out = layer(core_out, dec_attn_mask=dec_attn_mask, + mems=mems_i) + elif self.attn_type == 3: + core_out = self.drop(word_emb) + + for i, layer in enumerate(self.layers): + hids.append(core_out) + mems_i = None if mems is None else mems[i] + if mems_i is not None and mlen > 0: + cur_emb = self.r_emb[i][:-qlen] + cur_size = cur_emb.size(0) + if cur_size < mlen: + cur_emb_pad = cur_emb[0:1].expand(mlen-cur_size, -1, -1) + cur_emb = torch.cat([cur_emb_pad, cur_emb], 0) + else: + cur_emb = cur_emb[-mlen:] + mems_i += cur_emb.view(mlen, 1, -1) + core_out += self.r_emb[i][-qlen:].view(qlen, 1, -1) + + core_out = layer(core_out, dec_attn_mask=dec_attn_mask, + mems=mems_i) + + core_out = self.drop(core_out) + + new_mems = self._update_mems(hids, mems, mlen, qlen) + + return core_out, new_mems + + def forward(self, input_ids, mems=None): + """ Params: + input_ids :: [bsz, len] + mems :: optional mems from previous forwar passes (or init_mems) + list (num layers) of mem states at the entry of each layer + shape :: [self.config.mem_len, bsz, self.config.d_model] + Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `target` + Returns: + tuple (last_hidden, new_mems) where: + new_mems: list (num layers) of mem 
states at the entry of each layer + shape :: [self.config.mem_len, bsz, self.config.d_model] + last_hidden: output of the last layer: + shape :: [bsz, len, self.config.d_model] + """ + # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library + # so we transpose here from shape [bsz, len] to shape [len, bsz] + input_ids = input_ids.transpose(0, 1).contiguous() + + if mems is None: + mems = self.init_mems(input_ids) + last_hidden, new_mems = self._forward(input_ids, mems=mems) + + # We transpose back here to shape [bsz, len, hidden_dim] + last_hidden = last_hidden.transpose(0, 1).contiguous() + return (last_hidden, new_mems) + + +class TransfoXLLMHeadModel(TransfoXLPreTrainedModel): + """Transformer XL model ("Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context"). + + This model add an (adaptive) softmax head on top of the TransfoXLModel + + Transformer XL use a relative positioning (with sinusiodal patterns) and adaptive softmax inputs which means that: + - you don't need to specify positioning embeddings indices + - the tokens in the vocabulary have to be sorted to decreasing frequency. + + Call self.tie_weights() if you update/load the weights of the transformer to keep the weights tied. + + Params: + config: a TransfoXLConfig class instance with the configuration to build a new model + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the token indices selected in the range [0, self.config.n_token[ + `target`: an optional torch.LongTensor of shape [batch_size, sequence_length] + with the target token indices selected in the range [0, self.config.n_token[ + `mems`: an optional memory of hidden states from previous forward passes + as a list (num layers) of hidden states at the entry of each layer + each hidden states has shape [self.config.mem_len, bsz, self.config.d_model] + Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `target` + + Outputs: + A tuple of (last_hidden_state, new_mems) + `softmax_output`: output of the (adaptive) softmax: + if target is None: + Negative log likelihood of shape [batch_size, sequence_length] + else: + log probabilities of tokens, shape [batch_size, sequence_length, n_tokens] + `new_mems`: list (num layers) of updated mem states at the entry of each layer + each mem state is a torch.FloatTensor of size [self.config.mem_len, batch_size, self.config.d_model] + Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `target` + + Example usage: + ```python + # Already been converted into BPE token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_ids_next = torch.LongTensor([[53, 21, 1], [64, 23, 100]]) + + config = TransfoXLConfig() + + model = TransfoXLModel(config) + last_hidden_state, new_mems = model(input_ids) + + # Another time on input_ids_next using the memory: + last_hidden_state, new_mems = model(input_ids_next, mems=new_mems) + ``` + """ + def __init__(self, config): + super(TransfoXLLMHeadModel, self).__init__(config) + self.transformer = TransfoXLModel(config) + self.sample_softmax = config.sample_softmax + # use sampled softmax + if config.sample_softmax > 0: + self.out_layer = nn.Linear(config.d_model, config.n_token) + self.sampler = LogUniformSampler(config.n_token, config.sample_softmax) + # use adaptive softmax (including standard softmax) + else: + self.crit = ProjectedAdaptiveLogSoftmax(config.n_token, config.d_embed, 
config.d_model, + config.cutoffs, div_val=config.div_val) + self.apply(self.init_weights) + self.tie_weights() + + def tie_weights(self): + """ Run this to be sure output and input (adaptive) softmax weights are tied """ + # sampled softmax + if self.sample_softmax > 0: + if self.config.tie_weight: + self.out_layer.weight = self.transformer.word_emb.weight + # adaptive softmax (including standard softmax) + else: + if self.config.tie_weight: + for i in range(len(self.crit.out_layers)): + self.crit.out_layers[i].weight = self.transformer.word_emb.emb_layers[i].weight + if self.config.tie_projs: + for i, tie_proj in enumerate(self.config.tie_projs): + if tie_proj and self.config.div_val == 1 and self.config.d_model != self.config.d_embed: + self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[0] + elif tie_proj and self.config.div_val != 1: + self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[i] + + def reset_length(self, tgt_len, ext_len, mem_len): + self.transformer.reset_length(tgt_len, ext_len, mem_len) + + def init_mems(self, data): + return self.transformer.init_mems(data) + + def forward(self, input_ids, target=None, mems=None): + """ Params: + input_ids :: [bsz, len] + target :: [bsz, len] + Returns: + tuple(softmax_output, new_mems) where: + new_mems: list (num layers) of hidden states at the entry of each layer + shape :: [mem_len, bsz, self.config.d_model] :: Warning: shapes are transposed here w. regards to input_ids + softmax_output: output of the (adaptive) softmax: + if target is None: + Negative log likelihood of shape :: [bsz, len] + else: + log probabilities of tokens, shape :: [bsz, len, n_tokens] + """ + bsz = input_ids.size(0) + tgt_len = input_ids.size(1) + + last_hidden, new_mems = self.transformer(input_ids, mems) + + pred_hid = last_hidden[:, -tgt_len:] + if self.sample_softmax > 0 and self.training: + assert self.config.tie_weight + logit = sample_logits(self.transformer.word_emb, self.out_layer.bias, target, pred_hid, self.sampler) + softmax_output = -F.log_softmax(logit, -1)[:, :, 0] + else: + softmax_output = self.crit(pred_hid.view(-1, pred_hid.size(-1)), target) + if target is None: + softmax_output = softmax_output.view(bsz, tgt_len, -1) + else: + softmax_output = softmax_output.view(bsz, tgt_len) + + # We transpose back + return (softmax_output, new_mems) diff --git a/pytorch_pretrained_bert/modeling_transfo_xl_utilities.py b/pytorch_pretrained_bert/modeling_transfo_xl_utilities.py new file mode 100644 index 0000000..7fd67ad --- /dev/null +++ b/pytorch_pretrained_bert/modeling_transfo_xl_utilities.py @@ -0,0 +1,402 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Utilities for PyTorch Transformer XL model. + Directly adapted from https://github.com/kimiyoung/transformer-xl. 
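+    In particular, this module provides ProjectedAdaptiveLogSoftmax (the adaptive softmax
+    head used by TransfoXLLMHeadModel) and LogUniformSampler / sample_logits for the
+    optional sampled-softmax training path.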
+""" + +from collections import defaultdict + +import numpy as np + +import torch +import torch.nn as nn +import torch.nn.functional as F + +# CUDA_MAJOR = int(torch.version.cuda.split('.')[0]) +# CUDA_MINOR = int(torch.version.cuda.split('.')[1]) + +class ProjectedAdaptiveLogSoftmax(nn.Module): + def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, + keep_order=False): + super(ProjectedAdaptiveLogSoftmax, self).__init__() + + self.n_token = n_token + self.d_embed = d_embed + self.d_proj = d_proj + + self.cutoffs = cutoffs + [n_token] + self.cutoff_ends = [0] + self.cutoffs + self.div_val = div_val + + self.shortlist_size = self.cutoffs[0] + self.n_clusters = len(self.cutoffs) - 1 + self.head_size = self.shortlist_size + self.n_clusters + + if self.n_clusters > 0: + self.cluster_weight = nn.Parameter(torch.zeros(self.n_clusters, self.d_embed)) + self.cluster_bias = nn.Parameter(torch.zeros(self.n_clusters)) + + self.out_layers = nn.ModuleList() + self.out_projs = nn.ParameterList() + + if div_val == 1: + for i in range(len(self.cutoffs)): + if d_proj != d_embed: + self.out_projs.append( + nn.Parameter(torch.Tensor(d_proj, d_embed)) + ) + else: + self.out_projs.append(None) + + self.out_layers.append(nn.Linear(d_embed, n_token)) + else: + for i in range(len(self.cutoffs)): + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1] + d_emb_i = d_embed // (div_val ** i) + + self.out_projs.append( + nn.Parameter(torch.Tensor(d_proj, d_emb_i)) + ) + + self.out_layers.append(nn.Linear(d_emb_i, r_idx-l_idx)) + + self.keep_order = keep_order + + def _compute_logit(self, hidden, weight, bias, proj): + if proj is None: + logit = F.linear(hidden, weight, bias=bias) + else: + # if CUDA_MAJOR <= 9 and CUDA_MINOR <= 1: + proj_hid = F.linear(hidden, proj.t().contiguous()) + logit = F.linear(proj_hid, weight, bias=bias) + # else: + # logit = torch.einsum('bd,de,ev->bv', (hidden, proj, weight.t())) + # if bias is not None: + # logit = logit + bias + + return logit + + def forward(self, hidden, target=None, keep_order=False): + ''' + Params: + hidden :: [len*bsz x d_proj] + target :: [len*bsz] + Return: + if target is None: + out :: [len*bsz] Negative log likelihood + else: + out :: [len*bsz x n_tokens] log probabilities of tokens over the vocabulary + We could replace this implementation by the native PyTorch one + if their's had an option to set bias on all clusters in the native one. 
+ here: https://github.com/pytorch/pytorch/blob/dbe6a7a9ff1a364a8706bf5df58a1ca96d2fd9da/torch/nn/modules/adaptive.py#L138 + ''' + + if target is not None: + target = target.view(-1) + if hidden.size(0) != target.size(0): + raise RuntimeError('Input and target should have the same size ' + 'in the batch dimension.') + + if self.n_clusters == 0: + logit = self._compute_logit(hidden, self.out_layers[0].weight, + self.out_layers[0].bias, self.out_projs[0]) + if target is not None: + out = -F.log_softmax(logit, dim=-1) \ + .gather(1, target.unsqueeze(1)).squeeze(1) + else: + out = F.log_softmax(logit, dim=-1) + else: + # construct weights and biases + weights, biases = [], [] + for i in range(len(self.cutoffs)): + if self.div_val == 1: + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] + weight_i = self.out_layers[0].weight[l_idx:r_idx] + bias_i = self.out_layers[0].bias[l_idx:r_idx] + else: + weight_i = self.out_layers[i].weight + bias_i = self.out_layers[i].bias + + if i == 0: + weight_i = torch.cat( + [weight_i, self.cluster_weight], dim=0) + bias_i = torch.cat( + [bias_i, self.cluster_bias], dim=0) + + weights.append(weight_i) + biases.append(bias_i) + + head_weight, head_bias, head_proj = weights[0], biases[0], self.out_projs[0] + + head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj) + head_logprob = F.log_softmax(head_logit, dim=1) + + if target is None: + out = hidden.new_empty((head_logit.size(0), self.n_token)) + else: + out = torch.zeros_like(target, dtype=hidden.dtype, device=hidden.device) + + offset = 0 + cutoff_values = [0] + self.cutoffs + for i in range(len(cutoff_values) - 1): + l_idx, r_idx = cutoff_values[i], cutoff_values[i + 1] + + if target is not None: + mask_i = (target >= l_idx) & (target < r_idx) + indices_i = mask_i.nonzero().squeeze() + + if indices_i.numel() == 0: + continue + + target_i = target.index_select(0, indices_i) - l_idx + head_logprob_i = head_logprob.index_select(0, indices_i) + hidden_i = hidden.index_select(0, indices_i) + else: + hidden_i = hidden + + if i == 0: + if target is not None: + logprob_i = head_logprob_i.gather(1, target_i[:, None]).squeeze(1) + else: + out[:, :self.cutoffs[0]] = head_logprob[:, :self.cutoffs[0]] + else: + weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i] + + tail_logit_i = self._compute_logit(hidden_i, weight_i, bias_i, proj_i) + tail_logprob_i = F.log_softmax(tail_logit_i, dim=1) + cluster_prob_idx = self.cutoffs[0] + i - 1 # No probability for the head cluster + if target is not None: + logprob_i = head_logprob_i[:, cluster_prob_idx] \ + + tail_logprob_i.gather(1, target_i[:, None]).squeeze(1) + else: + logprob_i = head_logprob[:, cluster_prob_idx, None] + tail_logprob_i + out[:, l_idx:r_idx] = logprob_i + + if target is not None: + if (hasattr(self, 'keep_order') and self.keep_order) or keep_order: + out.index_copy_(0, indices_i, -logprob_i) + else: + out[offset:offset+logprob_i.size(0)].copy_(-logprob_i) + offset += logprob_i.size(0) + + return out + + + def log_prob(self, hidden): + r""" Computes log probabilities for all :math:`n\_classes` + From: https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/adaptive.py + Args: + hidden (Tensor): a minibatch of examples + Returns: + log-probabilities of for each class :math:`c` + in range :math:`0 <= c <= n\_classes`, where :math:`n\_classes` is a + parameter passed to ``AdaptiveLogSoftmaxWithLoss`` constructor. 
+ Shape: + - Input: :math:`(N, in\_features)` + - Output: :math:`(N, n\_classes)` + """ + if self.n_clusters == 0: + logit = self._compute_logit(hidden, self.out_layers[0].weight, + self.out_layers[0].bias, self.out_projs[0]) + return F.log_softmax(logit, dim=-1) + else: + # construct weights and biases + weights, biases = [], [] + for i in range(len(self.cutoffs)): + if self.div_val == 1: + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] + weight_i = self.out_layers[0].weight[l_idx:r_idx] + bias_i = self.out_layers[0].bias[l_idx:r_idx] + else: + weight_i = self.out_layers[i].weight + bias_i = self.out_layers[i].bias + + if i == 0: + weight_i = torch.cat( + [weight_i, self.cluster_weight], dim=0) + bias_i = torch.cat( + [bias_i, self.cluster_bias], dim=0) + + weights.append(weight_i) + biases.append(bias_i) + + head_weight, head_bias, head_proj = weights[0], biases[0], self.out_projs[0] + head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj) + + out = hidden.new_empty((head_logit.size(0), self.n_token)) + head_logprob = F.log_softmax(head_logit, dim=1) + + cutoff_values = [0] + self.cutoffs + for i in range(len(cutoff_values) - 1): + start_idx, stop_idx = cutoff_values[i], cutoff_values[i + 1] + + if i == 0: + out[:, :self.cutoffs[0]] = head_logprob[:, :self.cutoffs[0]] + else: + weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i] + + tail_logit_i = self._compute_logit(hidden, weight_i, bias_i, proj_i) + tail_logprob_i = F.log_softmax(tail_logit_i, dim=1) + + logprob_i = head_logprob[:, -i] + tail_logprob_i + out[:, start_idx, stop_idx] = logprob_i + + return out + + +class LogUniformSampler(object): + def __init__(self, range_max, n_sample): + """ + Reference : https://github.com/tensorflow/tensorflow/blob/r1.10/tensorflow/python/ops/candidate_sampling_ops.py + `P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)` + + expected count can be approximated by 1 - (1 - p)^n + and we use a numerically stable version -expm1(num_tries * log1p(-p)) + + Our implementation fixes num_tries at 2 * n_sample, and the actual #samples will vary from run to run + """ + with torch.no_grad(): + self.range_max = range_max + log_indices = torch.arange(1., range_max+2., 1.).log_() + self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1] + # print('P', self.dist.numpy().tolist()[-30:]) + + self.log_q = (- (-self.dist.double().log1p_() * 2 * n_sample).expm1_()).log_().float() + + self.n_sample = n_sample + + def sample(self, labels): + """ + labels: [b1, b2] + Return + true_log_probs: [b1, b2] + samp_log_probs: [n_sample] + neg_samples: [n_sample] + """ + + # neg_samples = torch.empty(0).long() + n_sample = self.n_sample + n_tries = 2 * n_sample + + with torch.no_grad(): + neg_samples = torch.multinomial(self.dist, n_tries, replacement=True).unique() + device = labels.device + neg_samples = neg_samples.to(device) + true_log_probs = self.log_q[labels].to(device) + samp_log_probs = self.log_q[neg_samples].to(device) + return true_log_probs, samp_log_probs, neg_samples + +def sample_logits(embedding, bias, labels, inputs, sampler): + """ + embedding: an nn.Embedding layer + bias: [n_vocab] + labels: [b1, b2] + inputs: [b1, b2, n_emb] + sampler: you may use a LogUniformSampler + Return + logits: [b1, b2, 1 + n_sample] + """ + true_log_probs, samp_log_probs, neg_samples = sampler.sample(labels) + n_sample = neg_samples.size(0) + b1, b2 = labels.size(0), labels.size(1) + all_ids = torch.cat([labels.view(-1), neg_samples]) + all_w = 
embedding(all_ids) + true_w = all_w[: -n_sample].view(b1, b2, -1) + sample_w = all_w[- n_sample:].view(n_sample, -1) + + all_b = bias[all_ids] + true_b = all_b[: -n_sample].view(b1, b2) + sample_b = all_b[- n_sample:] + + hit = (labels[:, :, None] == neg_samples).detach() + + true_logits = torch.einsum('ijk,ijk->ij', + [true_w, inputs]) + true_b - true_log_probs + sample_logits = torch.einsum('lk,ijk->ijl', + [sample_w, inputs]) + sample_b - samp_log_probs + sample_logits.masked_fill_(hit, -1e30) + logits = torch.cat([true_logits[:, :, None], sample_logits], -1) + + return logits + + +# class LogUniformSampler(object): +# def __init__(self, range_max, unique=False): +# """ +# Reference : https://github.com/tensorflow/tensorflow/blob/r1.10/tensorflow/python/ops/candidate_sampling_ops.py +# `P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)` +# """ +# self.range_max = range_max +# log_indices = torch.arange(1., range_max+2., 1.).log_() +# self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1] + +# self.unique = unique + +# if self.unique: +# self.exclude_mask = torch.ByteTensor(range_max).fill_(0) + +# def sample(self, n_sample, labels): +# pos_sample, new_labels = labels.unique(return_inverse=True) +# n_pos_sample = pos_sample.size(0) +# n_neg_sample = n_sample - n_pos_sample + +# if self.unique: +# self.exclude_mask.index_fill_(0, pos_sample, 1) +# sample_dist = self.dist.clone().masked_fill_(self.exclude_mask, 0) +# self.exclude_mask.index_fill_(0, pos_sample, 0) +# else: +# sample_dist = self.dist + +# neg_sample = torch.multinomial(sample_dist, n_neg_sample) + +# sample = torch.cat([pos_sample, neg_sample]) +# sample_prob = self.dist[sample] + +# return new_labels, sample, sample_prob + + +if __name__ == '__main__': + S, B = 3, 4 + n_vocab = 10000 + n_sample = 5 + H = 32 + + labels = torch.LongTensor(S, B).random_(0, n_vocab) + + # sampler = LogUniformSampler(n_vocab, unique=False) + # new_labels, sample, sample_prob = sampler.sample(n_sample, labels) + + sampler = LogUniformSampler(n_vocab, n_sample)#, unique=True) + # true_probs, samp_probs, neg_samples = sampler.sample(n_sample, labels) + + # print('true_probs', true_probs.numpy().tolist()) + # print('samp_probs', samp_probs.numpy().tolist()) + # print('neg_samples', neg_samples.numpy().tolist()) + + # print('sum', torch.sum(sampler.dist).item()) + + # assert torch.all(torch.sort(sample.unique())[0].eq(torch.sort(sample)[0])).item() + + embedding = nn.Embedding(n_vocab, H) + bias = torch.zeros(n_vocab) + inputs = torch.Tensor(S, B, H).normal_() + + logits, out_labels = sample_logits(embedding, bias, labels, inputs, sampler, n_sample) + print('logits', logits.detach().numpy().tolist()) + print('logits shape', logits.size()) + print('out_labels', out_labels.detach().numpy().tolist()) + print('out_labels shape', out_labels.size()) + diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py new file mode 100644 index 0000000..0385695 --- /dev/null +++ b/pytorch_pretrained_bert/optimization.py @@ -0,0 +1,302 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch optimization for BERT model.""" + +import math +import torch +from torch.optim import Optimizer +from torch.optim.optimizer import required +from torch.nn.utils import clip_grad_norm_ +import logging +import abc +import sys + +logger = logging.getLogger(__name__) + + +if sys.version_info >= (3, 4): + ABC = abc.ABC +else: + ABC = abc.ABCMeta('ABC', (), {}) + + +class _LRSchedule(ABC): + """ Parent of all LRSchedules here. """ + warn_t_total = False # is set to True for schedules where progressing beyond t_total steps doesn't make sense + def __init__(self, warmup=0.002, t_total=-1, **kw): + """ + :param warmup: what fraction of t_total steps will be used for linear warmup + :param t_total: how many training steps (updates) are planned + :param kw: + """ + super(_LRSchedule, self).__init__(**kw) + if t_total < 0: + logger.warning("t_total value of {} results in schedule not being applied".format(t_total)) + if not 0.0 <= warmup < 1.0 and not warmup == -1: + raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup)) + warmup = max(warmup, 0.) + self.warmup, self.t_total = float(warmup), float(t_total) + self.warned_for_t_total_at_progress = -1 + + def get_lr(self, step, nowarn=False): + """ + :param step: which of t_total steps we're on + :param nowarn: set to True to suppress warning regarding training beyond specified 't_total' steps + :return: learning rate multiplier for current update + """ + if self.t_total < 0: + return 1. + progress = float(step) / self.t_total + ret = self.get_lr_(progress) + # warning for exceeding t_total (only active with warmup_linear + if not nowarn and self.warn_t_total and progress > 1. and progress > self.warned_for_t_total_at_progress: + logger.warning( + "Training beyond specified 't_total'. Learning rate multiplier set to {}. Please set 't_total' of {} correctly." + .format(ret, self.__class__.__name__)) + self.warned_for_t_total_at_progress = progress + # end warning + return ret + + @abc.abstractmethod + def get_lr_(self, progress): + """ + :param progress: value between 0 and 1 (unless going beyond t_total steps) specifying training progress + :return: learning rate multiplier for current update + """ + return 1. + + +class ConstantLR(_LRSchedule): + def get_lr_(self, progress): + return 1. + + +class WarmupCosineSchedule(_LRSchedule): + """ + Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + Decreases learning rate from 1. to 0. over remaining `1 - warmup` steps following a cosine curve. + If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup. + """ + warn_t_total = True + def __init__(self, warmup=0.002, t_total=-1, cycles=.5, **kw): + """ + :param warmup: see LRSchedule + :param t_total: see LRSchedule + :param cycles: number of cycles. Default: 0.5, corresponding to cosine decay from 1. at progress==warmup and 0 at progress==1. 
+ :param kw: + """ + super(WarmupCosineSchedule, self).__init__(warmup=warmup, t_total=t_total, **kw) + self.cycles = cycles + + def get_lr_(self, progress): + if progress < self.warmup: + return progress / self.warmup + else: + progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup + return 0.5 * (1. + math.cos(math.pi * self.cycles * 2 * progress)) + + +class WarmupCosineWithHardRestartsSchedule(WarmupCosineSchedule): + """ + Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + If `cycles` (default=1.) is different from default, learning rate follows `cycles` times a cosine decaying + learning rate (with hard restarts). + """ + def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw): + super(WarmupCosineWithHardRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw) + assert(cycles >= 1.) + + def get_lr_(self, progress): + if progress < self.warmup: + return progress / self.warmup + else: + progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup + ret = 0.5 * (1. + math.cos(math.pi * ((self.cycles * progress) % 1))) + return ret + + +class WarmupCosineWithWarmupRestartsSchedule(WarmupCosineWithHardRestartsSchedule): + """ + All training progress is divided in `cycles` (default=1.) parts of equal length. + Every part follows a schedule with the first `warmup` fraction of the training steps linearly increasing from 0. to 1., + followed by a learning rate decreasing from 1. to 0. following a cosine curve. + """ + def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw): + assert(warmup * cycles < 1.) + warmup = warmup * cycles if warmup >= 0 else warmup + super(WarmupCosineWithWarmupRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw) + + def get_lr_(self, progress): + progress = progress * self.cycles % 1. + if progress < self.warmup: + return progress / self.warmup + else: + progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup + ret = 0.5 * (1. + math.cos(math.pi * progress)) + return ret + + +class WarmupConstantSchedule(_LRSchedule): + """ + Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + Keeps learning rate equal to 1. after warmup. + """ + def get_lr_(self, progress): + if progress < self.warmup: + return progress / self.warmup + return 1. + + +class WarmupLinearSchedule(_LRSchedule): + """ + Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + Linearly decreases learning rate from 1. to 0. over remaining `1 - warmup` steps. + """ + warn_t_total = True + def get_lr_(self, progress): + if progress < self.warmup: + return progress / self.warmup + return max((progress - 1.) / (self.warmup - 1.), 0.) + + +SCHEDULES = { + None: ConstantLR, + "none": ConstantLR, + "warmup_cosine": WarmupCosineSchedule, + "warmup_constant": WarmupConstantSchedule, + "warmup_linear": WarmupLinearSchedule +} + + +class BertAdam(Optimizer): + """Implements BERT version of Adam algorithm with weight decay fix. + Params: + lr: learning rate + warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1 + t_total: total number of training steps for the learning + rate schedule, -1 means constant learning rate of 1. (no warmup regardless of warmup setting). Default: -1 + schedule: schedule to use for the warmup (see above). 
+ Can be `'warmup_linear'`, `'warmup_constant'`, `'warmup_cosine'`, `'none'`, `None` or a `_LRSchedule` object (see below). + If `None` or `'none'`, learning rate is always kept constant. + Default : `'warmup_linear'` + b1: Adams b1. Default: 0.9 + b2: Adams b2. Default: 0.999 + e: Adams epsilon. Default: 1e-6 + weight_decay: Weight decay. Default: 0.01 + max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0 + """ + def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear', + b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01, max_grad_norm=1.0, **kwargs): + if lr is not required and lr < 0.0: + raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) + if not isinstance(schedule, _LRSchedule) and schedule not in SCHEDULES: + raise ValueError("Invalid schedule parameter: {}".format(schedule)) + if not 0.0 <= b1 < 1.0: + raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1)) + if not 0.0 <= b2 < 1.0: + raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2)) + if not e >= 0.0: + raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e)) + # initialize schedule object + if not isinstance(schedule, _LRSchedule): + schedule_type = SCHEDULES[schedule] + schedule = schedule_type(warmup=warmup, t_total=t_total) + else: + if warmup != -1 or t_total != -1: + logger.warning("warmup and t_total on the optimizer are ineffective when _LRSchedule object is provided as schedule. " + "Please specify custom warmup and t_total in _LRSchedule object.") + defaults = dict(lr=lr, schedule=schedule, + b1=b1, b2=b2, e=e, weight_decay=weight_decay, + max_grad_norm=max_grad_norm) + super(BertAdam, self).__init__(params, defaults) + + def get_lr(self): + lr = [] + for group in self.param_groups: + for p in group['params']: + state = self.state[p] + if len(state) == 0: + return [0] + lr_scheduled = group['lr'] + lr_scheduled *= group['schedule'].get_lr(state['step']) + lr.append(lr_scheduled) + return lr + + def step(self, closure=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + if grad.is_sparse: + raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['next_m'] = torch.zeros_like(p.data) + # Exponential moving average of squared gradient values + state['next_v'] = torch.zeros_like(p.data) + + next_m, next_v = state['next_m'], state['next_v'] + beta1, beta2 = group['b1'], group['b2'] + + # Add grad clipping + if group['max_grad_norm'] > 0: + clip_grad_norm_(p, group['max_grad_norm']) + + # Decay the first and second moment running average coefficient + # In-place operations to update the averages at the same time + next_m.mul_(beta1).add_(1 - beta1, grad) + next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad) + update = next_m / (next_v.sqrt() + group['e']) + + # Just adding the square of the weights to the loss function is *not* + # the correct way of using L2 regularization/weight decay with Adam, + # since that will interact with the m and v parameters in strange ways. 
+ # + # Instead we want to decay the weights in a manner that doesn't interact + # with the m/v parameters. This is equivalent to adding the square + # of the weights to the loss with plain (non-momentum) SGD. + if group['weight_decay'] > 0.0: + update += group['weight_decay'] * p.data + + lr_scheduled = group['lr'] + lr_scheduled *= group['schedule'].get_lr(state['step']) + + update_with_lr = lr_scheduled * update + p.data.add_(-update_with_lr) + + state['step'] += 1 + + # step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1 + # No bias correction + # bias_correction1 = 1 - beta1 ** state['step'] + # bias_correction2 = 1 - beta2 ** state['step'] + + return loss diff --git a/pytorch_pretrained_bert/optimization_openai.py b/pytorch_pretrained_bert/optimization_openai.py new file mode 100644 index 0000000..bff4ebe --- /dev/null +++ b/pytorch_pretrained_bert/optimization_openai.py @@ -0,0 +1,127 @@ +# coding=utf-8 +# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch optimization for OpenAI GPT model.""" + +import math +import torch +from torch.optim import Optimizer +from torch.optim.optimizer import required +from torch.nn.utils import clip_grad_norm_ +import logging +from .optimization import SCHEDULES, _LRSchedule, WarmupCosineWithWarmupRestartsSchedule, \ + WarmupCosineWithHardRestartsSchedule, WarmupCosineSchedule, WarmupLinearSchedule, WarmupConstantSchedule + +logger = logging.getLogger(__name__) + + +class OpenAIAdam(Optimizer): + """Implements Open AI version of Adam algorithm with weight decay fix. + """ + def __init__(self, params, lr=required, schedule='warmup_linear', warmup=-1, t_total=-1, + b1=0.9, b2=0.999, e=1e-8, weight_decay=0, + vector_l2=False, max_grad_norm=-1, **kwargs): + if lr is not required and lr < 0.0: + raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) + if not isinstance(schedule, _LRSchedule) and schedule not in SCHEDULES: + raise ValueError("Invalid schedule parameter: {}".format(schedule)) + if not 0.0 <= b1 < 1.0: + raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1)) + if not 0.0 <= b2 < 1.0: + raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2)) + if not e >= 0.0: + raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e)) + # initialize schedule object + if not isinstance(schedule, _LRSchedule): + schedule_type = SCHEDULES[schedule] + schedule = schedule_type(warmup=warmup, t_total=t_total) + else: + if warmup != -1 or t_total != -1: + logger.warning("warmup and t_total on the optimizer are ineffective when _LRSchedule object is provided as schedule. 
" + "Please specify custom warmup and t_total in _LRSchedule object.") + defaults = dict(lr=lr, schedule=schedule, + b1=b1, b2=b2, e=e, weight_decay=weight_decay, vector_l2=vector_l2, + max_grad_norm=max_grad_norm) + super(OpenAIAdam, self).__init__(params, defaults) + + def get_lr(self): + lr = [] + for group in self.param_groups: + for p in group['params']: + state = self.state[p] + if len(state) == 0: + return [0] + lr_scheduled = group['lr'] + lr_scheduled *= group['schedule'].get_lr(state['step']) + lr.append(lr_scheduled) + return lr + + def step(self, closure=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + if grad.is_sparse: + raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p.data) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p.data) + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + beta1, beta2 = group['b1'], group['b2'] + + state['step'] += 1 + + # Add grad clipping + if group['max_grad_norm'] > 0: + clip_grad_norm_(p, group['max_grad_norm']) + + # Decay the first and second moment running average coefficient + exp_avg.mul_(beta1).add_(1 - beta1, grad) + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + denom = exp_avg_sq.sqrt().add_(group['e']) + + bias_correction1 = 1 - beta1 ** state['step'] + bias_correction2 = 1 - beta2 ** state['step'] + + lr_scheduled = group['lr'] + lr_scheduled *= group['schedule'].get_lr(state['step']) + + step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1 + + p.data.addcdiv_(-step_size, exp_avg, denom) + + # Add weight decay at the end (fixed version) + if (len(p.size()) > 1 or group['vector_l2']) and group['weight_decay'] > 0: + p.data.add_(-lr_scheduled * group['weight_decay'], p.data) + + return loss diff --git a/pytorch_pretrained_bert/tokenization.py b/pytorch_pretrained_bert/tokenization.py new file mode 100644 index 0000000..4199870 --- /dev/null +++ b/pytorch_pretrained_bert/tokenization.py @@ -0,0 +1,434 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Tokenization classes.""" + +from __future__ import absolute_import, division, print_function, unicode_literals + +import collections +import logging +import os +import unicodedata +from io import open + +from .file_utils import cached_path + +logger = logging.getLogger(__name__) + +PRETRAINED_VOCAB_ARCHIVE_MAP = { + 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", + 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", + 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt", + 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt", + 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt", + 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", + 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt", + 'bert-base-german-cased': "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt", +} +PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = { + 'bert-base-uncased': 512, + 'bert-large-uncased': 512, + 'bert-base-cased': 512, + 'bert-large-cased': 512, + 'bert-base-multilingual-uncased': 512, + 'bert-base-multilingual-cased': 512, + 'bert-base-chinese': 512, + 'bert-base-german-cased': 512, +} +VOCAB_NAME = 'vocab.txt' + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + index = 0 + with open(vocab_file, "r", encoding="utf-8") as reader: + while True: + token = reader.readline() + if not token: + break + token = token.strip() + vocab[token] = index + index += 1 + return vocab + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class BertTokenizer(object): + """Runs end-to-end tokenization: punctuation splitting + wordpiece""" + + def __init__(self, vocab_file, do_lower_case=True, max_len=None, do_basic_tokenize=True, + never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", + "[MASK]", "[BOS]", "[EOS]", "[SPEAKER1]", "[SPEAKER2]")): + """Constructs a BertTokenizer. + + Args: + vocab_file: Path to a one-wordpiece-per-line vocabulary file + do_lower_case: Whether to lower case the input + Only has an effect when do_wordpiece_only=False + do_basic_tokenize: Whether to do basic tokenization before wordpiece. + max_len: An artificial maximum length to truncate tokenized sequences to; + Effective maximum length is always the minimum of this + value (if specified) and the underlying BERT model's + sequence length. + never_split: List of tokens which will never be split during tokenization. + Only has an effect when do_wordpiece_only=False + """ + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'. 
To load the vocabulary from a Google pretrained " + "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)) + self.vocab = load_vocab(vocab_file) + self.ids_to_tokens = collections.OrderedDict( + [(ids, tok) for tok, ids in self.vocab.items()]) + self.do_basic_tokenize = do_basic_tokenize + if do_basic_tokenize: + self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case, + never_split=never_split) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) + self.max_len = max_len if max_len is not None else int(1e12) + + def tokenize(self, text): + split_tokens = [] + if self.do_basic_tokenize: + for token in self.basic_tokenizer.tokenize(text): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + else: + split_tokens = self.wordpiece_tokenizer.tokenize(text) + return split_tokens + + def convert_tokens_to_ids(self, tokens): + """Converts a sequence of tokens into ids using the vocab.""" + ids = [] + for token in tokens: + ids.append(self.vocab[token]) + if len(ids) > self.max_len: + logger.warning( + "Token indices sequence length is longer than the specified maximum " + " sequence length for this BERT model ({} > {}). Running this" + " sequence through BERT will result in indexing errors".format(len(ids), self.max_len) + ) + return ids + + + def convert_ids_to_tokens(self, ids): + """Converts a sequence of ids in wordpiece tokens using the vocab.""" + tokens = [] + for i in ids: + tokens.append(self.ids_to_tokens[i]) + return tokens + + def save_vocabulary(self, vocab_path): + """Save the tokenizer vocabulary to a directory or file.""" + index = 0 + if os.path.isdir(vocab_path): + vocab_file = os.path.join(vocab_path, VOCAB_NAME) + with open(vocab_file, "w", encoding="utf-8") as writer: + for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning("Saving vocabulary to {}: vocabulary indices are not consecutive." + " Please check that the vocabulary is not corrupted!".format(vocab_file)) + index = token_index + writer.write(token + u'\n') + index += 1 + return vocab_file + + def encode(self, text): + return self.convert_tokens_to_ids(self.tokenize(text)) + + def decode(self, tokens, skip_special_tokens=False, clean_up_tokenization_spaces=True): + text = ' '.join(self.convert_ids_to_tokens(tokens)) + if clean_up_tokenization_spaces: + text = text.replace('', '') + text = text.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ',' + ).replace(" ' ", "'").replace( + " n't", "n't").replace(" 'm", "'m").replace(" do not", " don't" + ).replace(" 's", "'s").replace(" 've", "'ve").replace( + " 're", "'re") + return text + + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs): + """ + Instantiate a PreTrainedBertModel from a pre-trained model file. + Download and cache the pre-trained model file if needed. + """ + if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: + vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path] + if '-cased' in pretrained_model_name_or_path and kwargs.get('do_lower_case', True): + logger.warning("The pre-trained model you are loading is a cased model but you have not set " + "`do_lower_case` to False. 
We are setting `do_lower_case=False` for you but " + "you may want to check this behavior.") + kwargs['do_lower_case'] = False + elif '-cased' not in pretrained_model_name_or_path and not kwargs.get('do_lower_case', True): + logger.warning("The pre-trained model you are loading is an uncased model but you have set " + "`do_lower_case` to False. We are setting `do_lower_case=True` for you " + "but you may want to check this behavior.") + kwargs['do_lower_case'] = True + else: + vocab_file = pretrained_model_name_or_path + if os.path.isdir(vocab_file): + vocab_file = os.path.join(vocab_file, VOCAB_NAME) + # redirect to the cache, if necessary + try: + resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) + except EnvironmentError: + logger.error( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url but couldn't find any file " + "associated to this path or url.".format( + pretrained_model_name_or_path, + ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), + vocab_file)) + return None + if resolved_vocab_file == vocab_file: + logger.info("loading vocabulary file {}".format(vocab_file)) + else: + logger.info("loading vocabulary file {} from cache at {}".format( + vocab_file, resolved_vocab_file)) + if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP: + # if we're using a pretrained model, ensure the tokenizer wont index sequences longer + # than the number of positional embeddings + max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path] + kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) + # Instantiate tokenizer. + tokenizer = cls(resolved_vocab_file, *inputs, **kwargs) + return tokenizer + + +class BasicTokenizer(object): + """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" + + def __init__(self, + do_lower_case=True, + never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")): + """Constructs a BasicTokenizer. + + Args: + do_lower_case: Whether to lower case the input. + """ + self.do_lower_case = do_lower_case + self.never_split = never_split + + def tokenize(self, text): + """Tokenizes a piece of text.""" + text = self._clean_text(text) + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). 
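+        # For example, "ab华cd" becomes "ab 华 cd", so each CJK character ends up as
+        # its own token after the whitespace split below.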
+ text = self._tokenize_chinese_chars(text) + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if self.do_lower_case and token not in self.never_split: + token = token.lower() + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text): + """Splits punctuation on a piece of text.""" + if text in self.never_split: + return [text] + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ((cp >= 0x4E00 and cp <= 0x9FFF) or # + (cp >= 0x3400 and cp <= 0x4DBF) or # + (cp >= 0x20000 and cp <= 0x2A6DF) or # + (cp >= 0x2A700 and cp <= 0x2B73F) or # + (cp >= 0x2B740 and cp <= 0x2B81F) or # + (cp >= 0x2B820 and cp <= 0x2CEAF) or + (cp >= 0xF900 and cp <= 0xFAFF) or # + (cp >= 0x2F800 and cp <= 0x2FA1F)): # + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xfffd or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +class WordpieceTokenizer(object): + """Runs WordPiece tokenization.""" + + def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """Tokenizes a piece of text into its word pieces. + + This uses a greedy longest-match-first algorithm to perform tokenization + using the given vocabulary. + + For example: + input = "unaffable" + output = ["un", "##aff", "##able"] + + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through `BasicTokenizer`. + + Returns: + A list of wordpiece tokens. 
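+          Words longer than `max_input_chars_per_word`, or with no in-vocabulary
+          decomposition, are returned as the single `unk_token`.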
+ """ + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +def _is_whitespace(char): + """Checks whether `chars` is a whitespace character.""" + # \t, \n, and \r are technically contorl characters but we treat them + # as whitespace since they are generally considered as such. + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + + +def _is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat.startswith("C"): + return True + return False + + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. + if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or + (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False diff --git a/pytorch_pretrained_bert/tokenization_gpt2.py b/pytorch_pretrained_bert/tokenization_gpt2.py new file mode 100644 index 0000000..af75cac --- /dev/null +++ b/pytorch_pretrained_bert/tokenization_gpt2.py @@ -0,0 +1,311 @@ +# coding=utf-8 +# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for OpenAI GPT.""" +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import sys +import json +import logging +import os +import regex as re +from io import open + +try: + from functools import lru_cache +except ImportError: + # Just a dummy decorator to get the checks to run on python2 + # because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now. 
+ def lru_cache(): + return lambda func: func + +from .file_utils import cached_path + +logger = logging.getLogger(__name__) + +PRETRAINED_VOCAB_ARCHIVE_MAP = { + 'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json", + 'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-vocab.json", +} +PRETRAINED_MERGES_ARCHIVE_MAP = { + 'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt", + 'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt", +} +PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = { + 'gpt2': 1024, +} +VOCAB_NAME = 'vocab.json' +MERGES_NAME = 'merges.txt' +SPECIAL_TOKENS_NAME = 'special_tokens.txt' + +@lru_cache() +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + _chr = unichr if sys.version_info[0] == 2 else chr + bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8+n) + n += 1 + cs = [_chr(n) for n in cs] + return dict(zip(bs, cs)) + +def get_pairs(word): + """Return set of symbol pairs in a word. + + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + +class GPT2Tokenizer(object): + """ + GPT-2 BPE tokenizer. Peculiarities: + - Byte-level BPE + """ + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs): + """ + Instantiate a GPT2Tokenizer from a pre-trained model file. + Download and cache the pre-trained model file if needed. + """ + if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: + vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path] + merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path] + special_tokens_file = None + else: + vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME) + merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME) + special_tokens_file = os.path.join(pretrained_model_name_or_path, SPECIAL_TOKENS_NAME) + if not os.path.exists(special_tokens_file): + special_tokens_file = None + else: + logger.info("loading special tokens file {}".format(special_tokens_file)) + # redirect to the cache, if necessary + try: + resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) + resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir) + except EnvironmentError: + logger.error( + "Model name '{}' was not found in model name list ({}). 
" + "We assumed '{}' was a path or url but couldn't find files {} and {} " + "at this path or url.".format( + pretrained_model_name_or_path, + ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), + pretrained_model_name_or_path, + vocab_file, merges_file)) + return None + if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file: + logger.info("loading vocabulary file {}".format(vocab_file)) + logger.info("loading merges file {}".format(merges_file)) + else: + logger.info("loading vocabulary file {} from cache at {}".format( + vocab_file, resolved_vocab_file)) + logger.info("loading merges file {} from cache at {}".format( + merges_file, resolved_merges_file)) + if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP: + # if we're using a pretrained model, ensure the tokenizer wont index sequences longer + # than the number of positional embeddings + max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path] + kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) + # Instantiate tokenizer. + if special_tokens_file and 'special_tokens' not in kwargs: + special_tokens = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1] + else: + special_tokens = kwargs.pop('special_tokens', []) + tokenizer = cls(resolved_vocab_file, resolved_merges_file, special_tokens=special_tokens, *inputs, **kwargs) + return tokenizer + + def __init__(self, vocab_file, merges_file, errors='replace', special_tokens=None, max_len=None): + self.max_len = max_len if max_len is not None else int(1e12) + self.encoder = json.load(open(vocab_file)) + self.decoder = {v:k for k,v in self.encoder.items()} + self.errors = errors # how to handle errors in decoding + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v:k for k, v in self.byte_encoder.items()} + bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1] + bpe_merges = [tuple(merge.split()) for merge in bpe_data] + self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) + self.cache = {} + + # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions + self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") + + self.special_tokens = {} + self.special_tokens_decoder = {} + self.set_special_tokens(special_tokens) + + def __len__(self): + return len(self.encoder) + len(self.special_tokens) + + def set_special_tokens(self, special_tokens): + """ Add a list of additional tokens to the encoder. + The additional tokens are indexed starting from the last index of the + current vocabulary in the order of the `special_tokens` list. 
+ """ + if not special_tokens: + self.special_tokens = {} + self.special_tokens_decoder = {} + return + self.special_tokens = dict((tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens)) + self.special_tokens_decoder = {v:k for k, v in self.special_tokens.items()} + logger.info("Special tokens {}".format(self.special_tokens)) + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token) + pairs = get_pairs(word) + + if not pairs: + return token + + while True: + bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf'))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except: + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word)-1 and word[i+1] == second: + new_word.append(first+second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = ' '.join(word) + self.cache[token] = word + return word + + def tokenize(self, text): + """ Tokenize a string. """ + bpe_tokens = [] + for token in re.findall(self.pat, text): + if sys.version_info[0] == 2: + token = ''.join(self.byte_encoder[ord(b)] for b in token) + else: + token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) + bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' ')) + return bpe_tokens + + def convert_tokens_to_ids(self, tokens): + """ Converts a sequence of tokens into ids using the vocab. """ + ids = [] + if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)): + if tokens in self.special_tokens: + return self.special_tokens[tokens] + else: + return self.encoder.get(tokens, 0) + for token in tokens: + if token in self.special_tokens: + ids.append(self.special_tokens[token]) + else: + ids.append(self.encoder.get(token, 0)) + if len(ids) > self.max_len: + logger.warning( + "Token indices sequence length is longer than the specified maximum " + " sequence length for this OpenAI GPT model ({} > {}). 
Running this" + " sequence through the model will result in indexing errors".format(len(ids), self.max_len) + ) + return ids + + def convert_ids_to_tokens(self, ids, skip_special_tokens=False): + """Converts a sequence of ids in BPE tokens using the vocab.""" + tokens = [] + for i in ids: + if i in self.special_tokens_decoder: + if not skip_special_tokens: + tokens.append(self.special_tokens_decoder[i]) + else: + tokens.append(self.decoder[i]) + return tokens + + def encode(self, text): + return self.convert_tokens_to_ids(self.tokenize(text)) + + def decode(self, tokens, skip_special_tokens=False, clean_up_tokenization_spaces=True): + text = ''.join(self.convert_ids_to_tokens(tokens, skip_special_tokens=skip_special_tokens)) + text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors) + if clean_up_tokenization_spaces: + text = text.replace('', '') + text = text.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ',' + ).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't" + ).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re") + return text + + def save_vocabulary(self, vocab_path): + """Save the tokenizer vocabulary and merge files to a directory.""" + if not os.path.isdir(vocab_path): + logger.error("Vocabulary path ({}) should be a directory".format(vocab_path)) + return + vocab_file = os.path.join(vocab_path, VOCAB_NAME) + merge_file = os.path.join(vocab_path, MERGES_NAME) + special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME) + + with open(vocab_file, 'w', encoding='utf-8') as f: + f.write(json.dumps(self.encoder, ensure_ascii=False)) + + index = 0 + with open(merge_file, "w", encoding="utf-8") as writer: + writer.write(u'#version: 0.2\n') + for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!".format(merge_file)) + index = token_index + writer.write(' '.join(bpe_tokens) + u'\n') + index += 1 + + index = len(self.encoder) + with open(special_tokens_file, 'w', encoding='utf-8') as writer: + for token, token_index in sorted(self.special_tokens.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning("Saving special tokens vocabulary to {}: BPE indices are not consecutive." + " Please check that the tokenizer is not corrupted!".format(special_tokens_file)) + index = token_index + writer.write(token + u'\n') + index += 1 + + return vocab_file, merge_file, special_tokens_file diff --git a/pytorch_pretrained_bert/tokenization_openai.py b/pytorch_pretrained_bert/tokenization_openai.py new file mode 100644 index 0000000..d6fbc68 --- /dev/null +++ b/pytorch_pretrained_bert/tokenization_openai.py @@ -0,0 +1,313 @@ +# coding=utf-8 +# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
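+# Illustrative usage sketch (assumes the pre-trained 'openai-gpt' vocabulary and merges
+# files can be downloaded, or that local vocab/merges paths are passed instead):
+#
+#   tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
+#   tokens = tokenizer.tokenize("Hello there!")     # lower-cased BPE sub-word tokens
+#   ids = tokenizer.convert_tokens_to_ids(tokens)   # vocabulary indices (0 if unknown)
+#   text = tokenizer.decode(ids)                    # reassembles words, cleans up spacing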
+"""Tokenization classes for OpenAI GPT.""" +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import json +import logging +import os +import re +import sys +from io import open + +from tqdm import tqdm + +from .file_utils import cached_path +from .tokenization import BasicTokenizer + +logger = logging.getLogger(__name__) + +PRETRAINED_VOCAB_ARCHIVE_MAP = { + 'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json", +} +PRETRAINED_MERGES_ARCHIVE_MAP = { + 'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt", +} +PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = { + 'openai-gpt': 512, +} +VOCAB_NAME = 'vocab.json' +MERGES_NAME = 'merges.txt' +SPECIAL_TOKENS_NAME = 'special_tokens.txt' + +def get_pairs(word): + """ + Return set of symbol pairs in a word. + word is represented as tuple of symbols (symbols being variable-length strings) + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + +def text_standardize(text): + """ + fixes some issues the spacy tokenizer had on books corpus + also does some whitespace standardization + """ + text = text.replace('—', '-') + text = text.replace('–', '-') + text = text.replace('―', '-') + text = text.replace('…', '...') + text = text.replace('´', "'") + text = re.sub(r'''(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)''', r' \1 ', text) + text = re.sub(r'\s*\n\s*', ' \n ', text) + text = re.sub(r'[^\S\n]+', ' ', text) + return text.strip() + +class OpenAIGPTTokenizer(object): + """ + BPE tokenizer. Peculiarities: + - lower case all inputs + - uses SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not. + - argument special_tokens and function set_special_tokens: + can be used to add additional symbols (ex: "__classify__") to a vocabulary. + """ + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs): + """ + Instantiate a PreTrainedBertModel from a pre-trained model file. + Download and cache the pre-trained model file if needed. + """ + if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: + vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path] + merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path] + special_tokens_file = None + else: + vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME) + merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME) + special_tokens_file = os.path.join(pretrained_model_name_or_path, SPECIAL_TOKENS_NAME) + if not os.path.exists(special_tokens_file): + special_tokens_file = None + else: + logger.info("loading special tokens file {}".format(special_tokens_file)) + # redirect to the cache, if necessary + try: + resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) + resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir) + except EnvironmentError: + logger.error( + "Model name '{}' was not found in model name list ({}). 
" + "We assumed '{}' was a path or url but couldn't find files {} and {} " + "at this path or url.".format( + pretrained_model_name_or_path, + ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), + pretrained_model_name_or_path, + vocab_file, merges_file)) + return None + if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file: + logger.info("loading vocabulary file {}".format(vocab_file)) + logger.info("loading merges file {}".format(merges_file)) + else: + logger.info("loading vocabulary file {} from cache at {}".format( + vocab_file, resolved_vocab_file)) + logger.info("loading merges file {} from cache at {}".format( + merges_file, resolved_merges_file)) + if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP: + # if we're using a pretrained model, ensure the tokenizer wont index sequences longer + # than the number of positional embeddings + max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path] + kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) + # Instantiate tokenizer. + if special_tokens_file and 'special_tokens' not in kwargs: + special_tokens = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1] + else: + special_tokens = kwargs.pop('special_tokens', []) + tokenizer = cls(resolved_vocab_file, resolved_merges_file, special_tokens=special_tokens, *inputs, **kwargs) + return tokenizer + + def __init__(self, vocab_file, merges_file, special_tokens=None, max_len=None): + try: + import ftfy + import spacy + self.nlp = spacy.load('en_core_web_sm', disable=['parser', 'tagger', 'ner', 'textcat']) + self.fix_text = ftfy.fix_text + except ImportError: + logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.") + self.nlp = BasicTokenizer(do_lower_case=True, + never_split=special_tokens if special_tokens is not None else []) + self.fix_text = None + + self.max_len = max_len if max_len is not None else int(1e12) + self.encoder = json.load(open(vocab_file, encoding="utf-8")) + self.decoder = {v:k for k,v in self.encoder.items()} + merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1] + merges = [tuple(merge.split()) for merge in merges] + self.bpe_ranks = dict(zip(merges, range(len(merges)))) + self.cache = {} + self.special_tokens = {} + self.special_tokens_decoder = {} + self.set_special_tokens(special_tokens) + + def __len__(self): + return len(self.encoder) + len(self.special_tokens) + + def set_special_tokens(self, special_tokens): + """ Add a list of additional tokens to the encoder. + The additional tokens are indexed starting from the last index of the + current vocabulary in the order of the `special_tokens` list. 
+ """ + if not special_tokens: + self.special_tokens = {} + self.special_tokens_decoder = {} + return + self.special_tokens = dict((tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens)) + self.special_tokens_decoder = {v:k for k, v in self.special_tokens.items()} + if self.fix_text is None: + # Using BERT's BasicTokenizer: we can update the tokenizer + self.nlp.never_split = special_tokens + logger.info("Special tokens {}".format(self.special_tokens)) + + def bpe(self, token): + word = tuple(token[:-1]) + (token[-1] + '',) + if token in self.cache: + return self.cache[token] + pairs = get_pairs(word) + + if not pairs: + return token+'' + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except: + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word)-1 and word[i+1] == second: + new_word.append(first+second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = ' '.join(word) + if word == '\n ': + word = '\n' + self.cache[token] = word + return word + + def tokenize(self, text): + """ Tokenize a string. """ + split_tokens = [] + if self.fix_text is None: + # Using BERT's BasicTokenizer + text = self.nlp.tokenize(text) + for token in text: + split_tokens.extend([t for t in self.bpe(token).split(' ')]) + else: + # Using SpaCy & ftfy (original tokenization process of OpenAI GPT) + text = self.nlp(text_standardize(self.fix_text(text))) + for token in text: + split_tokens.extend([t for t in self.bpe(token.text.lower()).split(' ')]) + return split_tokens + + def convert_tokens_to_ids(self, tokens): + """ Converts a sequence of tokens into ids using the vocab. """ + ids = [] + if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)): + if tokens in self.special_tokens: + return self.special_tokens[tokens] + else: + return self.encoder.get(tokens, 0) + for token in tokens: + if token in self.special_tokens: + ids.append(self.special_tokens[token]) + else: + ids.append(self.encoder.get(token, 0)) + if len(ids) > self.max_len: + logger.warning( + "Token indices sequence length is longer than the specified maximum " + " sequence length for this OpenAI GPT model ({} > {}). 
Running this" + " sequence through the model will result in indexing errors".format(len(ids), self.max_len) + ) + return ids + + def convert_ids_to_tokens(self, ids, skip_special_tokens=False): + """Converts a sequence of ids in BPE tokens using the vocab.""" + tokens = [] + for i in ids: + if i in self.special_tokens_decoder: + if not skip_special_tokens: + tokens.append(self.special_tokens_decoder[i]) + else: + tokens.append(self.decoder[i]) + return tokens + + def encode(self, text): + return self.convert_tokens_to_ids(self.tokenize(text)) + + def decode(self, ids, skip_special_tokens=False, clean_up_tokenization_spaces=True): + """Converts a sequence of ids in a string.""" + tokens = self.convert_ids_to_tokens(ids, skip_special_tokens=skip_special_tokens) + out_string = ''.join(tokens).replace('', ' ').strip() + if clean_up_tokenization_spaces: + out_string = out_string.replace('', '') + out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ',' + ).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't" + ).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re") + return out_string + + def save_vocabulary(self, vocab_path): + """Save the tokenizer vocabulary and merge files to a directory.""" + if not os.path.isdir(vocab_path): + logger.error("Vocabulary path ({}) should be a directory".format(vocab_path)) + return + vocab_file = os.path.join(vocab_path, VOCAB_NAME) + merge_file = os.path.join(vocab_path, MERGES_NAME) + special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME) + + with open(vocab_file, 'w', encoding='utf-8') as f: + f.write(json.dumps(self.encoder, ensure_ascii=False)) + + index = 0 + with open(merge_file, "w", encoding="utf-8") as writer: + writer.write(u'#version: 0.2\n') + for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!".format(merge_file)) + index = token_index + writer.write(' '.join(bpe_tokens) + u'\n') + index += 1 + + index = len(self.encoder) + with open(special_tokens_file, 'w', encoding='utf-8') as writer: + for token, token_index in sorted(self.special_tokens.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning("Saving special tokens vocabulary to {}: BPE indices are not consecutive." + " Please check that the tokenizer is not corrupted!".format(special_tokens_file)) + index = token_index + writer.write(token + u'\n') + index += 1 + + return vocab_file, merge_file, special_tokens_file diff --git a/pytorch_pretrained_bert/tokenization_transfo_xl.py b/pytorch_pretrained_bert/tokenization_transfo_xl.py new file mode 100644 index 0000000..ddebc57 --- /dev/null +++ b/pytorch_pretrained_bert/tokenization_transfo_xl.py @@ -0,0 +1,586 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Tokenization classes for Transformer XL model. + Adapted from https://github.com/kimiyoung/transformer-xl. +""" +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import glob +import logging +import os +import sys +from collections import Counter, OrderedDict +from io import open +import unicodedata + +import torch +import numpy as np + +from .file_utils import cached_path + +if sys.version_info[0] == 2: + import cPickle as pickle +else: + import pickle + + +logger = logging.getLogger(__name__) + +PRETRAINED_VOCAB_ARCHIVE_MAP = { + 'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-vocab.bin", +} +VOCAB_NAME = 'vocab.bin' + +PRETRAINED_CORPUS_ARCHIVE_MAP = { + 'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-corpus.bin", +} +CORPUS_NAME = 'corpus.bin' + +class TransfoXLTokenizer(object): + """ + Transformer-XL tokenizer adapted from Vocab class in https://github.com/kimiyoung/transformer-xl + """ + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs): + """ + Instantiate a TransfoXLTokenizer. + The TransfoXLTokenizer. + """ + if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: + vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path] + else: + if os.path.isdir(pretrained_model_name_or_path): + vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME) + else: + vocab_file = pretrained_model_name_or_path + # redirect to the cache, if necessary + try: + resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) + except EnvironmentError: + logger.error( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url but couldn't find files {} " + "at this path or url.".format( + pretrained_model_name_or_path, + ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), + pretrained_model_name_or_path, + vocab_file)) + return None + if resolved_vocab_file == vocab_file: + logger.info("loading vocabulary file {}".format(vocab_file)) + else: + logger.info("loading vocabulary file {} from cache at {}".format( + vocab_file, resolved_vocab_file)) + + # Instantiate tokenizer. 
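+        # The pre-trained vocabulary file is a torch-serialized dict of attributes
+        # (e.g. idx2sym, sym2idx, counter); loading it just copies those entries onto
+        # a freshly constructed tokenizer instance.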
+        tokenizer = cls(*inputs, **kwargs)
+        vocab_dict = torch.load(resolved_vocab_file)
+        for key, value in vocab_dict.items():
+            tokenizer.__dict__[key] = value
+        return tokenizer
+
+    def __init__(self, special=[], min_freq=0, max_size=None, lower_case=False,
+                 delimiter=None, vocab_file=None, never_split=("<unk>", "<eos>", "<formula>")):
+        self.counter = Counter()
+        self.special = special
+        self.min_freq = min_freq
+        self.max_size = max_size
+        self.lower_case = lower_case
+        self.delimiter = delimiter
+        self.vocab_file = vocab_file
+        self.never_split = never_split
+
+    def count_file(self, path, verbose=False, add_eos=False):
+        if verbose: print('counting file {} ...'.format(path))
+        assert os.path.exists(path)
+
+        sents = []
+        with open(path, 'r', encoding='utf-8') as f:
+            for idx, line in enumerate(f):
+                if verbose and idx > 0 and idx % 500000 == 0:
+                    print('    line {}'.format(idx))
+                symbols = self.tokenize(line, add_eos=add_eos)
+                self.counter.update(symbols)
+                sents.append(symbols)
+
+        return sents
+
+    def count_sents(self, sents, verbose=False):
+        """
+            sents : a list of sentences, each a list of tokenized symbols
+        """
+        if verbose: print('counting {} sents ...'.format(len(sents)))
+        for idx, symbols in enumerate(sents):
+            if verbose and idx > 0 and idx % 500000 == 0:
+                print('    line {}'.format(idx))
+            self.counter.update(symbols)
+
+    def _build_from_file(self, vocab_file):
+        self.idx2sym = []
+        self.sym2idx = OrderedDict()
+
+        with open(vocab_file, 'r', encoding='utf-8') as f:
+            for line in f:
+                symb = line.strip().split()[0]
+                self.add_symbol(symb)
+        if '<UNK>' in self.sym2idx:
+            self.unk_idx = self.sym2idx['<UNK>']
+        elif '<unk>' in self.sym2idx:
+            self.unk_idx = self.sym2idx['<unk>']
+        else:
+            raise ValueError('No <unk> token in vocabulary')
+
+    def save_vocabulary(self, vocab_path):
+        """Save the tokenizer vocabulary to a directory or file."""
+        index = 0
+        if os.path.isdir(vocab_path):
+            vocab_file = os.path.join(vocab_path, VOCAB_NAME)
+        torch.save(self.__dict__, vocab_file)
+        return vocab_file
+
+    def build_vocab(self):
+        if self.vocab_file:
+            print('building vocab from {}'.format(self.vocab_file))
+            self._build_from_file(self.vocab_file)
+            print('final vocab size {}'.format(len(self)))
+        else:
+            print('building vocab with min_freq={}, max_size={}'.format(
+                self.min_freq, self.max_size))
+            self.idx2sym = []
+            self.sym2idx = OrderedDict()
+
+            for sym in self.special:
+                self.add_special(sym)
+
+            for sym, cnt in self.counter.most_common(self.max_size):
+                if cnt < self.min_freq: break
+                self.add_symbol(sym)
+
+            print('final vocab size {} from {} unique tokens'.format(
+                len(self), len(self.counter)))
+
+    def encode_file(self, path, ordered=False, verbose=False, add_eos=True,
+            add_double_eos=False):
+        if verbose: print('encoding file {} ...'.format(path))
+        assert os.path.exists(path)
+        encoded = []
+        with open(path, 'r', encoding='utf-8') as f:
+            for idx, line in enumerate(f):
+                if verbose and idx > 0 and idx % 500000 == 0:
+                    print('    line {}'.format(idx))
+                symbols = self.tokenize(line, add_eos=add_eos,
+                    add_double_eos=add_double_eos)
+                encoded.append(self.convert_to_tensor(symbols))
+
+        if ordered:
+            encoded = torch.cat(encoded)
+
+        return encoded
+
+    def encode_sents(self, sents, ordered=False, verbose=False):
+        if verbose: print('encoding {} sents ...'.format(len(sents)))
+        encoded = []
+        for idx, symbols in enumerate(sents):
+            if verbose and idx > 0 and idx % 500000 == 0:
+                print('    line {}'.format(idx))
+            encoded.append(self.convert_to_tensor(symbols))
+
+        if ordered:
+            encoded = torch.cat(encoded)
+
+        return encoded
+
+    def add_special(self, sym):
+        if sym not in self.sym2idx:
+            self.idx2sym.append(sym)
+            self.sym2idx[sym] = len(self.idx2sym) - 1
+            setattr(self, '{}_idx'.format(sym.strip('<>')), self.sym2idx[sym])
+
+    def add_symbol(self, sym):
+        if sym not in self.sym2idx:
+            self.idx2sym.append(sym)
+            self.sym2idx[sym] = len(self.idx2sym) - 1
+
+    def get_sym(self, idx):
+        assert 0 <= idx < len(self), 'Index {} out of vocabulary range'.format(idx)
+        return self.idx2sym[idx]
+
+    def get_idx(self, sym):
+        if sym in self.sym2idx:
+            return self.sym2idx[sym]
+        else:
+            # print('encounter unk {}'.format(sym))
+            # assert '<eos>' not in sym
+            if hasattr(self, 'unk_idx'):
+                return self.sym2idx.get(sym, self.unk_idx)
+            # Backward compatibility with pre-trained models
+            elif '<UNK>' in self.sym2idx:
+                return self.sym2idx['<UNK>']
+            elif '<unk>' in self.sym2idx:
+                return self.sym2idx['<unk>']
+            else:
+                raise ValueError('Token not in vocabulary and no <unk> token in vocabulary for replacement')
+
+    def convert_ids_to_tokens(self, indices):
+        """Converts a sequence of indices into symbols using the vocab."""
+        return [self.get_sym(idx) for idx in indices]
+
+    def convert_tokens_to_ids(self, symbols):
+        """Converts a sequence of symbols into ids using the vocab."""
+        return [self.get_idx(sym) for sym in symbols]
+
+    def convert_to_tensor(self, symbols):
+        return torch.LongTensor(self.convert_tokens_to_ids(symbols))
+
+    def decode(self, indices, exclude=None):
+        """Converts a sequence of indices into a string."""
+        if exclude is None:
+            return ' '.join([self.get_sym(idx) for idx in indices])
+        else:
+            return ' '.join([self.get_sym(idx) for idx in indices if idx not in exclude])
+
+    def __len__(self):
+        return len(self.idx2sym)
+
+    def tokenize(self, line, add_eos=False, add_double_eos=False):
+        line = line.strip()
+        # convert to lower case
+        if self.lower_case:
+            line = line.lower()
+
+        # empty delimiter '' will evaluate False
+        if self.delimiter == '':
+            symbols = line
+        else:
+            symbols = line.split(self.delimiter)
+
+        if add_double_eos: # lm1b
+            return ['<S>'] + symbols + ['<S>']
+        elif add_eos:
+            return symbols + ['<eos>']
+        else:
+            return symbols
+
+
+class LMOrderedIterator(object):
+    def __init__(self, data, bsz, bptt, device='cpu', ext_len=None):
+        """
+            data -- LongTensor -- the LongTensor is strictly ordered
+        """
+        self.bsz = bsz
+        self.bptt = bptt
+        self.ext_len = ext_len if ext_len is not None else 0
+
+        self.device = device
+
+        # Work out how cleanly we can divide the dataset into bsz parts.
+        self.n_step = data.size(0) // bsz
+
+        # Trim off any extra elements that wouldn't cleanly fit (remainders).
+        data = data.narrow(0, 0, self.n_step * bsz)
+
+        # Evenly divide the data across the bsz batches.
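+        # For example, 10 tokens with bsz=2 give a (5, 2) tensor: column 0 holds
+        # steps 0-4 and column 1 holds steps 5-9, so each column is a contiguous stream.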
+ self.data = data.view(bsz, -1).t().contiguous().to(device) + + # Number of mini-batches + self.n_batch = (self.n_step + self.bptt - 1) // self.bptt + + def get_batch(self, i, bptt=None): + if bptt is None: bptt = self.bptt + seq_len = min(bptt, self.data.size(0) - 1 - i) + + end_idx = i + seq_len + beg_idx = max(0, i - self.ext_len) + + data = self.data[beg_idx:end_idx] + target = self.data[i+1:i+1+seq_len] + + data_out = data.transpose(0, 1).contiguous().to(self.device) + target_out = target.transpose(0, 1).contiguous().to(self.device) + + return data_out, target_out, seq_len + + def get_fixlen_iter(self, start=0): + for i in range(start, self.data.size(0) - 1, self.bptt): + yield self.get_batch(i) + + def get_varlen_iter(self, start=0, std=5, min_len=5, max_deviation=3): + max_len = self.bptt + max_deviation * std + i = start + while True: + bptt = self.bptt if np.random.random() < 0.95 else self.bptt / 2. + bptt = min(max_len, max(min_len, int(np.random.normal(bptt, std)))) + data, target, seq_len = self.get_batch(i, bptt) + i += seq_len + yield data, target, seq_len + if i >= self.data.size(0) - 2: + break + + def __iter__(self): + return self.get_fixlen_iter() + + +class LMShuffledIterator(object): + def __init__(self, data, bsz, bptt, device='cpu', ext_len=None, shuffle=False): + """ + data -- list[LongTensor] -- there is no order among the LongTensors + """ + self.data = data + + self.bsz = bsz + self.bptt = bptt + self.ext_len = ext_len if ext_len is not None else 0 + + self.device = device + self.shuffle = shuffle + + def get_sent_stream(self): + # index iterator + epoch_indices = np.random.permutation(len(self.data)) if self.shuffle \ + else np.array(range(len(self.data))) + + # sentence iterator + for idx in epoch_indices: + yield self.data[idx] + + def stream_iterator(self, sent_stream): + # streams for each data in the batch + streams = [None] * self.bsz + + data = torch.LongTensor(self.bptt, self.bsz) + target = torch.LongTensor(self.bptt, self.bsz) + + n_retain = 0 + + while True: + # data : [n_retain+bptt x bsz] + # target : [bptt x bsz] + data[n_retain:].fill_(-1) + target.fill_(-1) + + valid_batch = True + + for i in range(self.bsz): + n_filled = 0 + try: + while n_filled < self.bptt: + if streams[i] is None or len(streams[i]) <= 1: + streams[i] = next(sent_stream) + # number of new tokens to fill in + n_new = min(len(streams[i]) - 1, self.bptt - n_filled) + # first n_retain tokens are retained from last batch + data[n_retain+n_filled:n_retain+n_filled+n_new, i] = \ + streams[i][:n_new] + target[n_filled:n_filled+n_new, i] = \ + streams[i][1:n_new+1] + streams[i] = streams[i][n_new:] + n_filled += n_new + except StopIteration: + valid_batch = False + break + + if not valid_batch: + return + + data_out = data.transpose(0, 1).contiguous().to(self.device) + target_out = target.transpose(0, 1).contiguous().to(self.device) + + yield data_out, target_out, self.bptt + + n_retain = min(data.size(0), self.ext_len) + if n_retain > 0: + data[:n_retain] = data[-n_retain:] + data.resize_(n_retain + self.bptt, data.size(1)) + + def __iter__(self): + # sent_stream is an iterator + sent_stream = self.get_sent_stream() + + for batch in self.stream_iterator(sent_stream): + yield batch + + +class LMMultiFileIterator(LMShuffledIterator): + def __init__(self, paths, vocab, bsz, bptt, device='cpu', ext_len=None, + shuffle=False): + + self.paths = paths + self.vocab = vocab + + self.bsz = bsz + self.bptt = bptt + self.ext_len = ext_len if ext_len is not None else 0 + + self.device = device + 
self.shuffle = shuffle + + def get_sent_stream(self, path): + sents = self.vocab.encode_file(path, add_double_eos=True) + if self.shuffle: + np.random.shuffle(sents) + sent_stream = iter(sents) + + return sent_stream + + def __iter__(self): + if self.shuffle: + np.random.shuffle(self.paths) + + for path in self.paths: + # sent_stream is an iterator + sent_stream = self.get_sent_stream(path) + for batch in self.stream_iterator(sent_stream): + yield batch + + +class TransfoXLCorpus(object): + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs): + """ + Instantiate a pre-processed corpus. + """ + vocab = TransfoXLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + if pretrained_model_name_or_path in PRETRAINED_CORPUS_ARCHIVE_MAP: + corpus_file = PRETRAINED_CORPUS_ARCHIVE_MAP[pretrained_model_name_or_path] + else: + corpus_file = os.path.join(pretrained_model_name_or_path, CORPUS_NAME) + # redirect to the cache, if necessary + try: + resolved_corpus_file = cached_path(corpus_file, cache_dir=cache_dir) + except EnvironmentError: + logger.error( + "Corpus '{}' was not found in corpus list ({}). " + "We assumed '{}' was a path or url but couldn't find files {} " + "at this path or url.".format( + pretrained_model_name_or_path, + ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), + pretrained_model_name_or_path, + corpus_file)) + return None + if resolved_corpus_file == corpus_file: + logger.info("loading corpus file {}".format(corpus_file)) + else: + logger.info("loading corpus file {} from cache at {}".format( + corpus_file, resolved_corpus_file)) + + # Instantiate tokenizer. + corpus = cls(*inputs, **kwargs) + corpus_dict = torch.load(resolved_corpus_file) + for key, value in corpus_dict.items(): + corpus.__dict__[key] = value + corpus.vocab = vocab + if corpus.train is not None: + corpus.train = torch.tensor(corpus.train, dtype=torch.long) + if corpus.valid is not None: + corpus.valid = torch.tensor(corpus.valid, dtype=torch.long) + if corpus.test is not None: + corpus.test = torch.tensor(corpus.test, dtype=torch.long) + return corpus + + def __init__(self, *args, **kwargs): + self.vocab = TransfoXLTokenizer(*args, **kwargs) + self.dataset = None + self.train = None + self.valid = None + self.test = None + + def build_corpus(self, path, dataset): + self.dataset = dataset + + if self.dataset in ['ptb', 'wt2', 'enwik8', 'text8']: + self.vocab.count_file(os.path.join(path, 'train.txt')) + self.vocab.count_file(os.path.join(path, 'valid.txt')) + self.vocab.count_file(os.path.join(path, 'test.txt')) + elif self.dataset == 'wt103': + self.vocab.count_file(os.path.join(path, 'train.txt')) + elif self.dataset == 'lm1b': + train_path_pattern = os.path.join( + path, '1-billion-word-language-modeling-benchmark-r13output', + 'training-monolingual.tokenized.shuffled', 'news.en-*') + train_paths = glob.glob(train_path_pattern) + # the vocab will load from file when build_vocab() is called + + self.vocab.build_vocab() + + if self.dataset in ['ptb', 'wt2', 'wt103']: + self.train = self.vocab.encode_file( + os.path.join(path, 'train.txt'), ordered=True) + self.valid = self.vocab.encode_file( + os.path.join(path, 'valid.txt'), ordered=True) + self.test = self.vocab.encode_file( + os.path.join(path, 'test.txt'), ordered=True) + elif self.dataset in ['enwik8', 'text8']: + self.train = self.vocab.encode_file( + os.path.join(path, 'train.txt'), ordered=True, add_eos=False) + self.valid = self.vocab.encode_file( + os.path.join(path, 
'valid.txt'), ordered=True, add_eos=False) + self.test = self.vocab.encode_file( + os.path.join(path, 'test.txt'), ordered=True, add_eos=False) + elif self.dataset == 'lm1b': + self.train = train_paths + self.valid = self.vocab.encode_file( + os.path.join(path, 'valid.txt'), ordered=False, add_double_eos=True) + self.test = self.vocab.encode_file( + os.path.join(path, 'test.txt'), ordered=False, add_double_eos=True) + + def get_iterator(self, split, *args, **kwargs): + if split == 'train': + if self.dataset in ['ptb', 'wt2', 'wt103', 'enwik8', 'text8']: + data_iter = LMOrderedIterator(self.train, *args, **kwargs) + elif self.dataset == 'lm1b': + kwargs['shuffle'] = True + data_iter = LMMultiFileIterator(self.train, self.vocab, *args, **kwargs) + elif split in ['valid', 'test']: + data = self.valid if split == 'valid' else self.test + if self.dataset in ['ptb', 'wt2', 'wt103', 'enwik8', 'text8']: + data_iter = LMOrderedIterator(data, *args, **kwargs) + elif self.dataset == 'lm1b': + data_iter = LMShuffledIterator(data, *args, **kwargs) + + return data_iter + + +def get_lm_corpus(datadir, dataset): + fn = os.path.join(datadir, 'cache.pt') + fn_pickle = os.path.join(datadir, 'cache.pkl') + if os.path.exists(fn): + print('Loading cached dataset...') + corpus = torch.load(fn_pickle) + elif os.path.exists(fn): + print('Loading cached dataset from pickle...') + with open(fn, "rb") as fp: + corpus = pickle.load(fp) + else: + print('Producing dataset {}...'.format(dataset)) + kwargs = {} + if dataset in ['wt103', 'wt2']: + kwargs['special'] = [''] + kwargs['lower_case'] = False + elif dataset == 'ptb': + kwargs['special'] = [''] + kwargs['lower_case'] = True + elif dataset == 'lm1b': + kwargs['special'] = [] + kwargs['lower_case'] = False + kwargs['vocab_file'] = os.path.join(datadir, '1b_word_vocab.txt') + elif dataset in ['enwik8', 'text8']: + pass + + corpus = TransfoXLCorpus(datadir, dataset, **kwargs) + torch.save(corpus, fn) + + return corpus diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..8c47219 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,9 @@ +torch +pytorch-ignite +#pytorch-pretrained-bert >= 0.6.2 +tensorboardX==1.6 +tensorflow # for tensorboardX +boto3 +requests +tqdm +regex \ No newline at end of file diff --git a/train.py b/train.py new file mode 100644 index 0000000..688f1ca --- /dev/null +++ b/train.py @@ -0,0 +1,239 @@ +# Copyright (c) 2019-present, HuggingFace Inc. +# All rights reserved. This source code is licensed under the BSD-style license found in the LICENSE file in the root directory of this source tree. 
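A note on the cache handling in get_lm_corpus above: both branches test the same 'cache.pt' path, so the elif can never run, and the first branch loads 'cache.pkl' with torch.load even though it checked for 'cache.pt'. A short sketch of what the intent presumably is (torch cache first, pickle fallback, otherwise rebuild); this is an assumption, not the patch's code:

    import os
    import pickle
    import torch

    def load_corpus_cache(datadir):
        """Sketch: prefer the torch-serialized cache, fall back to the pickle cache,
        return None when neither exists so the caller rebuilds the corpus."""
        fn_pt = os.path.join(datadir, 'cache.pt')
        fn_pkl = os.path.join(datadir, 'cache.pkl')
        if os.path.exists(fn_pt):
            print('Loading cached dataset...')
            return torch.load(fn_pt)
        if os.path.exists(fn_pkl):
            print('Loading cached dataset from pickle...')
            with open(fn_pkl, 'rb') as fp:
                return pickle.load(fp)
        return None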
+import os +import math +import logging +from pprint import pformat +from argparse import ArgumentParser +from collections import defaultdict +from itertools import chain +from config import Config + +import torch +from torch.nn.parallel import DistributedDataParallel +from torch.utils.data import DataLoader, TensorDataset +from ignite.engine import Engine, Events +from ignite.handlers import ModelCheckpoint +from ignite.metrics import Accuracy, Loss, MetricsLambda, RunningAverage +from ignite.contrib.handlers import ProgressBar, PiecewiseLinear +from ignite.contrib.handlers.tensorboard_logger import TensorboardLogger, OutputHandler, OptimizerParamsHandler +from pytorch_pretrained_bert import (OpenAIAdam, OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer, + GPT2DoubleHeadsModel, GPT2Tokenizer, WEIGHTS_NAME, CONFIG_NAME, + BertModel, BertTokenizer) + +from utils import get_dataset + +SPECIAL_TOKENS = ["", "", "", "", ""] +MODEL_INPUTS = ["input_ids", "mc_token_ids", "lm_labels", "mc_labels", "token_type_ids"] +PADDED_INPUTS = ["input_ids", "lm_labels", "token_type_ids"] + +logger = logging.getLogger(__file__) + +def average_distributed_scalar(scalar, config): + """ Average a scalar over the nodes if we are in distributed training. We use this for distributed evaluation. """ + if config.local_rank == -1: + return scalar + scalar_t = torch.tensor(scalar, dtype=torch.float, device=config.device) / torch.distributed.get_world_size() + torch.distributed.all_reduce(scalar_t, op=torch.distributed.ReduceOp.SUM) + return scalar_t.item() + + +def pad_dataset(dataset, padding=0): + """ Pad the dataset. This could be optimized by defining a Dataset class and padd only batches but this is simpler. """ + max_l = max(len(x) for x in dataset["input_ids"]) + for name in PADDED_INPUTS: + dataset[name] = [x + [padding if name != "lm_labels" else -1] * (max_l - len(x)) for x in dataset[name]] + return dataset + + +def build_input_from_segments(history, reply, tokenizer, lm_labels=False, with_eos=True): + """ Build a sequence of input from 3 segments: persona, history and last reply """ + bos, eos, speaker1, speaker2 = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[:-1]) + + instance = {} + sequence = [[bos] + history[0]] + history[1:] +[reply +([eos] if with_eos else [])] + sequence = [sequence[0]] + [[speaker2 if (len(sequence)-i) % 2 else speaker1] + s for i, s in enumerate(sequence[1:])] + + instance["input_ids"] = list(chain(*sequence)) + instance["token_type_ids"] = [speaker2 if i % 2 else speaker1 for i, s in enumerate(sequence) for _ in s] # the last for is for repeating the speaker1 and speaker2 for all tokens + instance["mc_token_ids"] = len(instance["input_ids"]) - 1 + instance["lm_labels"] = [-1] * len(instance["input_ids"]) + if lm_labels: + instance["lm_labels"] = ([-1] * sum(len(s) for s in sequence[:-1])) + [-1] + sequence[-1][1:] #all -1 except for reply, reply is just the ids + return instance, sequence + + +def get_data_loaders(config, tokenizer): + """ Prepare the dataset for training and evaluation """ + personachat = get_dataset(tokenizer, config.dataset_path, config.dataset_cache) + + logger.info("Build inputs and labels") + datasets = {"train": defaultdict(list), "valid": defaultdict(list)} + + gpu_max_length = 310 #this depends on the gpu memory size, using bigger gpu memory you can increase this to include longer inputs + for dataset_name, dataset in personachat.items(): + num_candidates = len(dataset[0]["utterances"][0]["candidates"]) + if config.num_candidates > 0 and dataset_name == 
'train': + num_candidates = min(config.num_candidates, num_candidates) + for dialog in dataset: + for utterance in dialog["utterances"]: + history = utterance["history"][-(2*config.max_history+1):] + for j, candidate in enumerate(utterance["candidates"][-num_candidates:]): + lm_labels = bool(j == num_candidates-1) #the true label is always the last one in list of candidates + instance, _ = build_input_from_segments(history, candidate, tokenizer, lm_labels) + #print(len(instance["input_ids"])) + ## + if len(instance["input_ids"]) > gpu_max_length: + truncated_history = [hist[:10] for hist in history] + truncated_candidate = candidate[:10] + instance, _ = build_input_from_segments(truncated_history, truncated_candidate, tokenizer, lm_labels) + + for input_name, input_array in instance.items(): + datasets[dataset_name][input_name].append(input_array) + datasets[dataset_name]["mc_labels"].append(num_candidates - 1) + datasets[dataset_name]["n_candidates"] = num_candidates + logger.info("Pad inputs and convert to Tensor") + tensor_datasets = {"train": [], "valid": []} + for dataset_name, dataset in datasets.items(): + dataset = pad_dataset(dataset, padding=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1])) + for input_name in MODEL_INPUTS: + tensor = torch.tensor(dataset[input_name]) + if input_name != "mc_labels": + tensor = tensor.view((-1, datasets[dataset_name]["n_candidates"]) + tensor.shape[1:]) + tensor_datasets[dataset_name].append(tensor) + + logger.info("Build train and validation dataloaders") + train_dataset, valid_dataset = TensorDataset(*tensor_datasets["train"]), TensorDataset(*tensor_datasets["valid"]) + train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) if config.distributed else None + valid_sampler = torch.utils.data.distributed.DistributedSampler(valid_dataset) if config.distributed else None + train_loader = DataLoader(train_dataset, sampler=train_sampler, batch_size=config.train_batch_size, shuffle=False) + valid_loader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=config.valid_batch_size, shuffle=False) + + logger.info("Train dataset (Batch, Candidates, Seq length): {}".format(train_dataset.tensors[0].shape)) + logger.info("Valid dataset (Batch, Candidates, Seq length): {}".format(valid_dataset.tensors[0].shape)) + return train_loader, valid_loader, train_sampler, valid_sampler + + +def train(): + config_file = "configs/train_full_config.json" + config = Config.from_json_file(config_file) + + # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. 
logger.info => log main process only, logger.warning => log all processes + logging.basicConfig(level=logging.INFO if config.local_rank in [-1, 0] else logging.WARN) + logger.warning("Running process %d", config.local_rank) # This is a logger.warning: it will be printed by all distributed processes + logger.info("Arguments: %s", pformat(config)) + + # Initialize distributed training if needed + config.distributed = (config.local_rank != -1) + if config.distributed: + torch.cuda.set_device(config.local_rank) + config.device = torch.device("cuda", config.local_rank) + torch.distributed.init_process_group(backend='nccl', init_method='env://') + + logger.info("Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning") + tokenizer_class = GPT2Tokenizer if "gpt2" in config.model_checkpoint else OpenAIGPTTokenizer + tokenizer = tokenizer_class.from_pretrained(config.model_checkpoint) + model_class = GPT2DoubleHeadsModel if "gpt2" in config.model_checkpoint else OpenAIGPTDoubleHeadsModel + model = model_class.from_pretrained(config.model_checkpoint) + tokenizer.set_special_tokens(SPECIAL_TOKENS) + model.set_num_special_tokens(len(SPECIAL_TOKENS)) + model.to(config.device) + optimizer = OpenAIAdam(model.parameters(), lr=config.lr) + + # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last) + if config.fp16: + from apex import amp # Apex is only required if we use fp16 training + model, optimizer = amp.initialize(model, optimizer, opt_level=config.fp16) + if config.distributed: + model = DistributedDataParallel(model, device_ids=[config.local_rank], output_device=config.local_rank) + + logger.info("Prepare datasets") + train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(config, tokenizer) + + # Training function and trainer + def update(engine, batch): + model.train() + batch = tuple(input_tensor.to(config.device) for input_tensor in batch) + lm_loss, mc_loss = model(*batch) + loss = (lm_loss * config.lm_coef + mc_loss * config.mc_coef) / config.gradient_accumulation_steps + if config.fp16: + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), config.max_norm) + else: + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_norm) + if engine.state.iteration % config.gradient_accumulation_steps == 0: + optimizer.step() + optimizer.zero_grad() + return loss.item() + trainer = Engine(update) + + # Evaluation function and evaluator (evaluator output is the input of the metrics) + def inference(engine, batch): + model.eval() + with torch.no_grad(): + batch = tuple(input_tensor.to(config.device) for input_tensor in batch) + input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch + #logger.info(tokenizer.decode(input_ids[0, -1, :].tolist())) + model_outputs = model(input_ids, mc_token_ids, token_type_ids=token_type_ids) + lm_logits, mc_logits = model_outputs[0], model_outputs[1] # So we can also use GPT2 outputs + lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1)) + lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1) + return (lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, mc_labels) + evaluator = Engine(inference) + + # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch + trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader)) + 
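The batches consumed by update() and inference() are produced by build_input_from_segments above; a small worked trace may help. Token ids are hypothetical, and the special-token literals are assumed to be the conventional <bos>, <eos>, <speaker1>, <speaker2> (not part of the patch):

    # Two-utterance history plus a reply, with lm_labels=True and with_eos=True.
    history = [[10, 11, 12], [20, 21]]
    reply = [30, 31, 32]
    # sequence after the two list comprehensions:
    #   [[bos, 10, 11, 12], [speaker2, 20, 21], [speaker1, 30, 31, 32, eos]]
    # input_ids      = [bos, 10, 11, 12, speaker2, 20, 21, speaker1, 30, 31, 32, eos]
    # token_type_ids = [speaker1]*4 + [speaker2]*3 + [speaker1]*5   (one id per token of each segment)
    # mc_token_ids   = 11            (index of the last token, read by the classification head)
    # lm_labels      = [-1]*8 + [30, 31, 32, eos]   (only the reply tokens contribute to the LM loss)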
if config.n_epochs < 1: + trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader)) + if config.eval_before_start: + trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader)) + + # Make sure distributed data samplers split the dataset nicely between the distributed processes + if config.distributed: + trainer.add_event_handler(Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch)) + evaluator.add_event_handler(Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch)) + + # Linearly decrease the learning rate from lr to zero + scheduler = PiecewiseLinear(optimizer, "lr", [(0, config.lr), (config.n_epochs * len(train_loader), 0.0)]) + trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) + + # Prepare metrics - note how we compute distributed metrics + RunningAverage(output_transform=lambda x: x).attach(trainer, "loss") + metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1), output_transform=lambda x: (x[0][0], x[1][0])), + "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))} + metrics.update({"average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], config), + "average_accuracy": MetricsLambda(average_distributed_scalar, metrics["accuracy"], config)}) + metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"]) + for name, metric in metrics.items(): + metric.attach(evaluator, name) + + # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train + if config.local_rank in [-1, 0]: + pbar = ProgressBar(persist=True) + pbar.attach(trainer, metric_names=["loss"]) + evaluator.add_event_handler(Events.COMPLETED, lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics))) + + tb_logger = TensorboardLogger(log_dir=config.log_dir) + tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED) + tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED) + tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()), another_engine=trainer), event_name=Events.EPOCH_COMPLETED) + + checkpoint_handler = ModelCheckpoint(tb_logger.writer.log_dir, 'checkpoint', save_interval=1, n_saved=3) + trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': getattr(model, 'module', model)}) # "getattr" take care of distributed encapsulation + + torch.save(config, tb_logger.writer.log_dir + '/model_training_args.bin') + getattr(model, 'module', model).config.to_json_file(os.path.join(tb_logger.writer.log_dir, CONFIG_NAME)) + tokenizer.save_vocabulary(tb_logger.writer.log_dir) + + # Run the training + trainer.run(train_loader, max_epochs=config.n_epochs) + + # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method) + if config.local_rank in [-1, 0] and config.n_epochs > 0: + os.rename(checkpoint_handler._saved[-1][1][-1], os.path.join(tb_logger.writer.log_dir, WEIGHTS_NAME)) # TODO: PR in ignite to have better access to saved file paths (cleaner) + tb_logger.close() + +if __name__ == "__main__": + train() diff --git a/train_emotion_recognition.py b/train_emotion_recognition.py new file mode 100644 index 0000000..e232066 --- /dev/null +++ b/train_emotion_recognition.py @@ -0,0 +1,286 @@ +# 
Copyright (c) 2019-present, HuggingFace Inc. +# All rights reserved. This source code is licensed under the BSD-style license found in the LICENSE file in the root directory of this source tree. +import os +import math +import logging +from pprint import pformat +from argparse import ArgumentParser +from collections import defaultdict +from itertools import chain + +import torch +from torch.nn.parallel import DistributedDataParallel +from torch.utils.data import DataLoader, TensorDataset +from ignite.engine import Engine, Events +from ignite.handlers import ModelCheckpoint +from ignite.metrics import Accuracy, Recall, Loss, MetricsLambda, RunningAverage, Precision, ConfusionMatrix +from ignite.contrib.handlers import ProgressBar, PiecewiseLinear +from ignite.contrib.handlers.tensorboard_logger import TensorboardLogger, OutputHandler, OptimizerParamsHandler + +from config import Config +from pytorch_pretrained_bert import (OpenAIAdam, OpenAIGPTDoubleHeadLMEmotionRecognitionModel, OpenAIGPTTokenizer, + GPT2DoubleHeadsModel, GPT2Tokenizer, WEIGHTS_NAME, CONFIG_NAME) + +from utils import get_dataset, get_dataset_for_daily_dialog + +SPECIAL_TOKENS = ["", "", "", "", + "", "", "", "", "", "", "", + "", "", "", "", + ""] +MODEL_INPUTS = ["input_ids", "mc_token_ids", "lm_labels", "mc_labels", "token_type_ids", "token_emotion_ids"] +PADDED_INPUTS = ["input_ids", "lm_labels", "token_type_ids", "token_emotion_ids"] + +logger = logging.getLogger(__file__) + +def average_distributed_scalar(scalar, config): + """ Average a scalar over the nodes if we are in distributed training. We use this for distributed evaluation. """ + if config.local_rank == -1: + return scalar + scalar_t = torch.tensor(scalar, dtype=torch.float, device=config.device) / torch.distributed.get_world_size() + torch.distributed.all_reduce(scalar_t, op=torch.distributed.ReduceOp.SUM) + return scalar_t.item() + + +def pad_dataset(dataset, padding=0): + """ Pad the dataset. This could be optimized by defining a Dataset class and padd only batches but this is simpler. 
""" + max_l = max(len(x) for x in dataset["input_ids"]) + for name in PADDED_INPUTS: + dataset[name] = [x + [padding if name != "lm_labels" else -1] * (max_l - len(x)) for x in dataset[name]] + return dataset + + +def get_emotion_label(tokenizer, candidate_emotion): + _, _, _, _, no_emotion_id, happiness_id, surprise_id, sadness_id, disgust_id, anger_id, fear_id, _, _, _, _, _ = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS) + if candidate_emotion == happiness_id: + return 0 + elif candidate_emotion == surprise_id: + return 1 + elif candidate_emotion == sadness_id: + return 2 + elif candidate_emotion == disgust_id: + return 3 + elif candidate_emotion == anger_id: + return 4 + elif candidate_emotion == fear_id: + return 5 + elif candidate_emotion == no_emotion_id: + return 6 + + +def build_input_from_segments(history, emotions, reply, true_emotion, tokenizer, with_eos=True): + """ Build a sequence of input from 3 segments: persona, history and last reply """ + bos, eos, speaker1, speaker2 = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[:4]) + #tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1]) + + instance = {} + # sequence = [[bos] + history[0] + list(chain(*history[1:]))] + [reply + ([eos] if with_eos else [])] #seq = [personas, history, reply] concatenate all persona sentences + sequence = [[bos] + history[0]] + history[1:] + [reply + ([eos] if with_eos else [])] + sequence = [[speaker2 if (len(sequence)-i) % 2 else speaker1] + s for i, s in enumerate(sequence)] + + instance["input_ids"] = list(chain(*sequence)) + instance["token_type_ids"] = [speaker2 if i % 2 else speaker1 for i, s in enumerate(sequence) for _ in s] # the last for is for repeating the speaker1 and speaker2 for all tokens + #instance["token_emotion_ids"] = [emotions[i] for i, s in enumerate(sequence[:-1]) for _ in s] + [true_emotion] * len(sequence[-1]) + instance["token_emotion_ids"] = [emotions[i] for i, s in enumerate(sequence[:-1]) for _ in s] + + instance["mc_token_ids"] = len(instance["input_ids"]) - 1 + instance["mc_labels"] = get_emotion_label(tokenizer, true_emotion) + instance["lm_labels"] = ([-1] * sum(len(s) for s in sequence[:-1])) + [-1] + sequence[-1][1:] #all -1 except for reply, reply is just the ids + return instance, sequence + + +def get_data_loaders(config, tokenizer): + """ Prepare the dataset for training and evaluation """ + personachat = get_dataset_for_daily_dialog(tokenizer, config.dataset_path, config.dataset_cache, SPECIAL_TOKENS) + + # personachat["train"] = personachat["train"][:100] + # personachat["valid"] = personachat["valid"][:10] + + logger.info("Build inputs and labels") + datasets = {"train": defaultdict(list), "valid": defaultdict(list)} + gpu_max_length = 310 + for dataset_name, dataset in personachat.items(): + num_candidates = 2#len(dataset[0]["utterances"][0]["candidates"]) + if config.num_candidates > 0 and dataset_name == 'train': + num_candidates = min(config.num_candidates, num_candidates) + for dialog in dataset: + for utterance in dialog["utterances"]: + history = utterance["history"][-(2 * config.max_history + 1):] + emotions = utterance["emotion"][-(2 * config.max_history + 1):] + reply = utterance["candidates"][-1] + true_emotion = utterance['candidates_emotions'][-1] + if true_emotion == tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS)[4]: + continue + instance, _ = build_input_from_segments(history, + emotions, + reply, + true_emotion, + tokenizer) + + if len(instance["input_ids"]) > gpu_max_length: + truncated_history = [hist[:10] for hist in history] + 
truncated_candidate = reply[:10] + true_emotion = utterance['candidates_emotions'][-1] + instance, _ = build_input_from_segments(truncated_history, + emotions, + truncated_candidate, + true_emotion, + tokenizer) + + + for input_name, input_array in instance.items(): + datasets[dataset_name][input_name].append(input_array) + + datasets[dataset_name]["n_candidates"] = num_candidates + + logger.info("Pad inputs and convert to Tensor") + tensor_datasets = {"train": [], "valid": []} + for dataset_name, dataset in datasets.items(): + dataset = pad_dataset(dataset, padding=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1])) + for input_name in MODEL_INPUTS: + tensor = torch.tensor(dataset[input_name]) + #if input_name != "mc_labels": + # tensor = tensor.view((-1, datasets[dataset_name]["n_candidates"]) + tensor.shape[1:]) + tensor_datasets[dataset_name].append(tensor) + + logger.info("Build train and validation dataloaders") + train_dataset, valid_dataset = TensorDataset(*tensor_datasets["train"]), TensorDataset(*tensor_datasets["valid"]) + train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) if config.distributed else None + valid_sampler = torch.utils.data.distributed.DistributedSampler(valid_dataset) if config.distributed else None + train_loader = DataLoader(train_dataset, sampler=train_sampler, batch_size=config.train_batch_size, shuffle=False) + valid_loader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=config.valid_batch_size, shuffle=False) + + logger.info("Train dataset (Batch, Candidates, Seq length): {}".format(train_dataset.tensors[0].shape)) + logger.info("Valid dataset (Batch, Candidates, Seq length): {}".format(valid_dataset.tensors[0].shape)) + return train_loader, valid_loader, train_sampler, valid_sampler + + +def train(): + config_file = "configs/train_emotion_recognition_config.json" + config = Config.from_json_file(config_file) + + # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. 
logger.info => log main process only, logger.warning => log all processes + logging.basicConfig(level=logging.INFO if config.local_rank in [-1, 0] else logging.WARN) + logger.warning("Running process %d", config.local_rank) # This is a logger.warning: it will be printed by all distributed processes + logger.info("Arguments: %s", pformat(config)) + + # Initialize distributed training if needed + config.distributed = (config.local_rank != -1) + if config.distributed: + torch.cuda.set_device(config.local_rank) + config.device = torch.device("cuda", config.local_rank) + torch.distributed.init_process_group(backend='nccl', init_method='env://') + + logger.info("Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning") + tokenizer_class = GPT2Tokenizer if "gpt2" in config.model_checkpoint else OpenAIGPTTokenizer + tokenizer = tokenizer_class.from_pretrained(config.model_checkpoint) + model_class = OpenAIGPTDoubleHeadLMEmotionRecognitionModel + model = model_class.from_pretrained(config.model_checkpoint) + tokenizer.set_special_tokens(SPECIAL_TOKENS) + model.set_num_special_tokens(len(SPECIAL_TOKENS)) + model.to(config.device) + optimizer = OpenAIAdam(model.parameters(), lr=config.lr) + + # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last) + if config.fp16: + from apex import amp # Apex is only required if we use fp16 training + model, optimizer = amp.initialize(model, optimizer, opt_level=config.fp16) + if config.distributed: + model = DistributedDataParallel(model, device_ids=[config.local_rank], output_device=config.local_rank) + + logger.info("Prepare datasets") + train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(config, tokenizer) + + # Training function and trainer + def update(engine, batch): + model.train() + input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids, token_emotion_ids = tuple(input_tensor.to(config.device) for input_tensor in batch) + #token_emotion_ids = None + lm_loss, mc_loss = model(input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids, token_emotion_ids) + loss = (lm_loss * config.lm_coef + mc_loss * config.mc_coef) / config.gradient_accumulation_steps + if config.fp16: + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), config.max_norm) + else: + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_norm) + if engine.state.iteration % config.gradient_accumulation_steps == 0: + optimizer.step() + optimizer.zero_grad() + return loss.item() + trainer = Engine(update) + + # Evaluation function and evaluator (evaluator output is the input of the metrics) + def inference(engine, batch): + model.eval() + with torch.no_grad(): + batch = tuple(input_tensor.to(config.device) for input_tensor in batch) + input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids, token_emotion_ids = batch + #token_emotion_ids = None + model_outputs = model(input_ids, mc_token_ids, token_type_ids=token_type_ids, token_emotion_ids=token_emotion_ids) + lm_logits, mc_logits = model_outputs[0], model_outputs[1] # So we can also use GPT2 outputs + lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1)) + lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1) + return (lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, mc_labels) + evaluator = Engine(inference) + + # Attach evaluation to trainer: we 
evaluate when we start the training and at the end of each epoch + trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader)) + if config.n_epochs < 1: + trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader)) + if config.eval_before_start: + trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader)) + + # Make sure distributed data samplers split the dataset nicely between the distributed processes + if config.distributed: + trainer.add_event_handler(Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch)) + evaluator.add_event_handler(Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch)) + + # Linearly decrease the learning rate from lr to zero + scheduler = PiecewiseLinear(optimizer, "lr", [(0, config.lr), (config.n_epochs * len(train_loader), 0.0)]) + trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) + + # Prepare metrics - note how we compute distributed metrics + RunningAverage(output_transform=lambda x: x).attach(trainer, "loss") + metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1), output_transform=lambda x: (x[0][0], x[1][0])), + "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))} + + metrics.update({"precision": Precision(output_transform=lambda x: (x[0][1], x[1][1])), + "recall": Recall(output_transform=lambda x: (x[0][1], x[1][1]))}) + + metrics.update({"average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], config), + "average_accuracy": MetricsLambda(average_distributed_scalar, metrics["accuracy"], config)}) + + metrics.update({"confusion_matrix": ConfusionMatrix(num_classes=6, output_transform=lambda x: (x[0][1], x[1][1]))}) + metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"]) + for name, metric in metrics.items(): + metric.attach(evaluator, name) + + # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train + if config.local_rank in [-1, 0]: + pbar = ProgressBar(persist=True) + pbar.attach(trainer, metric_names=["loss"]) + evaluator.add_event_handler(Events.COMPLETED, lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics))) + + tb_logger = TensorboardLogger(log_dir=config.log_dir) + tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED) + tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED) + tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()), another_engine=trainer), event_name=Events.EPOCH_COMPLETED) + + checkpoint_handler = ModelCheckpoint(tb_logger.writer.log_dir, 'checkpoint', save_interval=1, n_saved=3) + trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': getattr(model, 'module', model)}) # "getattr" take care of distributed encapsulation + + torch.save(config, tb_logger.writer.log_dir + '/model_training_args.bin') + getattr(model, 'module', model).config.to_json_file(os.path.join(tb_logger.writer.log_dir, CONFIG_NAME)) + tokenizer.save_vocabulary(tb_logger.writer.log_dir) + + # Run the training + trainer.run(train_loader, max_epochs=config.n_epochs) + + # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method) + if config.local_rank in [-1, 0] and config.n_epochs 
> 0: + os.rename(checkpoint_handler._saved[-1][1][-1], os.path.join(tb_logger.writer.log_dir, WEIGHTS_NAME)) # TODO: PR in ignite to have better access to saved file paths (cleaner) + tb_logger.close() + +if __name__ == "__main__": + train() diff --git a/train_full.py b/train_full.py new file mode 100644 index 0000000..30af959 --- /dev/null +++ b/train_full.py @@ -0,0 +1,253 @@ +# Copyright (c) 2019-present, HuggingFace Inc. +# All rights reserved. This source code is licensed under the BSD-style license found in the LICENSE file in the root directory of this source tree. +import os +import math +import logging +from pprint import pformat +from argparse import ArgumentParser +from collections import defaultdict +from itertools import chain + +import torch +from torch.nn.parallel import DistributedDataParallel +from torch.utils.data import DataLoader, TensorDataset +from ignite.engine import Engine, Events +from ignite.handlers import ModelCheckpoint +from ignite.metrics import Accuracy, Loss, MetricsLambda, RunningAverage +from ignite.contrib.handlers import ProgressBar, PiecewiseLinear +from config import Config +from ignite.contrib.handlers.tensorboard_logger import TensorboardLogger, OutputHandler, OptimizerParamsHandler +from pytorch_pretrained_bert import (OpenAIAdam, OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer, + GPT2DoubleHeadsModel, GPT2Tokenizer, WEIGHTS_NAME, CONFIG_NAME, + BertModel, BertTokenizer) + +from utils import get_dataset, get_dataset_for_daily_dialog + +SPECIAL_TOKENS = ["", "", "", "", + "", "", "", "", "", "", "", + "", "", "", "", + ""] +MODEL_INPUTS = ["input_ids", "mc_token_ids", "lm_labels", "mc_labels", "token_type_ids", "token_emotion_ids"] +PADDED_INPUTS = ["input_ids", "lm_labels", "token_type_ids", "token_emotion_ids"] + +logger = logging.getLogger(__file__) + +def average_distributed_scalar(scalar, config): + """ Average a scalar over the nodes if we are in distributed training. We use this for distributed evaluation. """ + if config.local_rank == -1: + return scalar + scalar_t = torch.tensor(scalar, dtype=torch.float, device=config.device) / torch.distributed.get_world_size() + torch.distributed.all_reduce(scalar_t, op=torch.distributed.ReduceOp.SUM) + return scalar_t.item() + + +def pad_dataset(dataset, padding=0): + """ Pad the dataset. This could be optimized by defining a Dataset class and padd only batches but this is simpler. 
""" + max_l = max(len(x) for x in dataset["input_ids"]) + for name in PADDED_INPUTS: + dataset[name] = [x + [padding if name != "lm_labels" else -1] * (max_l - len(x)) for x in dataset[name]] + return dataset + + +def build_input_from_segments(history, emotions, reply, candidate_emotion, tokenizer, lm_labels=False, with_eos=True): + """ Build a sequence of input from 3 segments: persona, history and last reply """ + bos, eos, speaker1, speaker2 = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[:4]) + + instance = {} + #sequence = [[bos] + history[0] + list(chain(*history[1:]))] + [reply + ([eos] if with_eos else [])] #seq = [personas, history, reply] concatenate all persona sentences + sequence = [[bos] + history[0]] + history[1:] +[reply +([eos] if with_eos else [])] + sequence = [[speaker2 if (len(sequence)-i) % 2 else speaker1] + s for i, s in enumerate(sequence)] + all_emotions = emotions + [candidate_emotion] + sequence = [[all_emotions[i]] + s for i, s in enumerate(sequence)] + + instance["input_ids"] = list(chain(*sequence)) + instance["token_type_ids"] = [speaker2 if i % 2 else speaker1 for i, s in enumerate(sequence) for _ in s] # the last for is for repeating the speaker1 and speaker2 for all tokens + instance["token_emotion_ids"] = [emotions[i] for i, s in enumerate(sequence[:-1]) for _ in s]+[candidate_emotion]*len(sequence[-1]) + + instance["mc_token_ids"] = len(instance["input_ids"]) - 1 + instance["lm_labels"] = [-1] * len(instance["input_ids"]) + if lm_labels: + instance["lm_labels"] = ([-1] * sum(len(s) for s in sequence[:-1])) + [-1] + sequence[-1][1:] #all -1 except for reply, reply is just the ids + return instance, sequence + + +def get_data_loaders(config, tokenizer): + """ Prepare the dataset for training and evaluation """ + personachat = get_dataset_for_daily_dialog(tokenizer, config.dataset_path, config.dataset_cache, SPECIAL_TOKENS) + + # personachat["train"] = personachat["train"][:100] + # personachat["valid"] = personachat["valid"][:10] + + + logger.info("Build inputs and labels") + datasets = {"train": defaultdict(list), "valid": defaultdict(list)} + gpu_max_length = 310 + for dataset_name, dataset in personachat.items(): + num_candidates = len(dataset[0]["utterances"][0]["candidates"]) + if config.num_candidates > 0 and dataset_name == 'train': + num_candidates = min(config.num_candidates, num_candidates) + for dialog in dataset: + for utterance in dialog["utterances"]: + history = utterance["history"][-(2*config.max_history+1):] + emotions = utterance["emotion"][-(2 * config.max_history + 1):] + for j, candidate in enumerate(utterance["candidates"][-num_candidates:]): + lm_labels = bool(j == num_candidates-1) #the true label is always the last one in list of candidates + candidate_emotion = utterance['candidates_emotions'][j] + instance, _ = build_input_from_segments(history, emotions, candidate, candidate_emotion, tokenizer, lm_labels) + #print(len(instance["input_ids"])) + if len(instance["input_ids"]) > gpu_max_length: + truncated_history = [hist[:10] for hist in history] + truncated_candidate = candidate[:10] + instance, _ = build_input_from_segments(truncated_history, emotions, truncated_candidate, candidate_emotion, tokenizer, lm_labels) + + for input_name, input_array in instance.items(): + datasets[dataset_name][input_name].append(input_array) + datasets[dataset_name]["mc_labels"].append(num_candidates - 1) + datasets[dataset_name]["n_candidates"] = num_candidates + + logger.info("Pad inputs and convert to Tensor") + tensor_datasets = {"train": [], 
"valid": []} + for dataset_name, dataset in datasets.items(): + dataset = pad_dataset(dataset, padding=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1])) + for input_name in MODEL_INPUTS: + tensor = torch.tensor(dataset[input_name]) + if input_name != "mc_labels": + tensor = tensor.view((-1, datasets[dataset_name]["n_candidates"]) + tensor.shape[1:]) + tensor_datasets[dataset_name].append(tensor) + + logger.info("Build train and validation dataloaders") + train_dataset, valid_dataset = TensorDataset(*tensor_datasets["train"]), TensorDataset(*tensor_datasets["valid"]) + train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) if config.distributed else None + valid_sampler = torch.utils.data.distributed.DistributedSampler(valid_dataset) if config.distributed else None + train_loader = DataLoader(train_dataset, sampler=train_sampler, batch_size=config.train_batch_size, shuffle=False) + valid_loader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=config.valid_batch_size, shuffle=False) + + logger.info("Train dataset (Batch, Candidates, Seq length): {}".format(train_dataset.tensors[0].shape)) + logger.info("Valid dataset (Batch, Candidates, Seq length): {}".format(valid_dataset.tensors[0].shape)) + return train_loader, valid_loader, train_sampler, valid_sampler + + +def train(): + config_file = "configs/train_full_config.json" + config = Config.from_json_file(config_file) + + + # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. logger.info => log main process only, logger.warning => log all processes + logging.basicConfig(level=logging.INFO if config.local_rank in [-1, 0] else logging.WARN) + logger.warning("Running process %d", config.local_rank) # This is a logger.warning: it will be printed by all distributed processes + logger.info("Arguments: %s", pformat(config)) + + # Initialize distributed training if needed + config.distributed = (config.local_rank != -1) + if config.distributed: + torch.cuda.set_device(config.local_rank) + config.device = torch.device("cuda", config.local_rank) + torch.distributed.init_process_group(backend='nccl', init_method='env://') + + logger.info("Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning") + tokenizer_class = GPT2Tokenizer if "gpt2" in config.model_checkpoint else OpenAIGPTTokenizer + tokenizer = tokenizer_class.from_pretrained(config.model_checkpoint) + model_class = GPT2DoubleHeadsModel if "gpt2" in config.model_checkpoint else OpenAIGPTDoubleHeadsModel + model = model_class.from_pretrained(config.model_checkpoint) + tokenizer.set_special_tokens(SPECIAL_TOKENS) + model.set_num_special_tokens(len(SPECIAL_TOKENS)) + model.to(config.device) + optimizer = OpenAIAdam(model.parameters(), lr=config.lr) + + # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last) + if config.fp16: + from apex import amp # Apex is only required if we use fp16 training + model, optimizer = amp.initialize(model, optimizer, opt_level=config.fp16) + if config.distributed: + model = DistributedDataParallel(model, device_ids=[config.local_rank], output_device=config.local_rank) + + logger.info("Prepare datasets") + train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(config, tokenizer) + + # Training function and trainer + def update(engine, batch): + model.train() + input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids, token_emotion_ids = tuple(input_tensor.to(config.device) for input_tensor in 
batch) + lm_loss, mc_loss = model(input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids, token_emotion_ids) + loss = (lm_loss * config.lm_coef + mc_loss * config.mc_coef) / config.gradient_accumulation_steps + if config.fp16: + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), config.max_norm) + else: + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_norm) + if engine.state.iteration % config.gradient_accumulation_steps == 0: + optimizer.step() + optimizer.zero_grad() + return loss.item() + trainer = Engine(update) + + # Evaluation function and evaluator (evaluator output is the input of the metrics) + def inference(engine, batch): + model.eval() + with torch.no_grad(): + batch = tuple(input_tensor.to(config.device) for input_tensor in batch) + input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids, token_emotion_ids = batch + #logger.info(tokenizer.decode(input_ids[0, -1, :].tolist())) + model_outputs = model(input_ids, mc_token_ids, token_type_ids=token_type_ids, token_emotion_ids=token_emotion_ids) + lm_logits, mc_logits = model_outputs[0], model_outputs[1] # So we can also use GPT2 outputs + lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1)) + lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1) + return (lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, mc_labels) + evaluator = Engine(inference) + + # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch + trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader)) + if config.n_epochs < 1: + trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader)) + if config.eval_before_start: + trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader)) + + # Make sure distributed data samplers split the dataset nicely between the distributed processes + if config.distributed: + trainer.add_event_handler(Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch)) + evaluator.add_event_handler(Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch)) + + # Linearly decrease the learning rate from lr to zero + scheduler = PiecewiseLinear(optimizer, "lr", [(0, config.lr), (config.n_epochs * len(train_loader), 0.0)]) + trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) + + # Prepare metrics - note how we compute distributed metrics + RunningAverage(output_transform=lambda x: x).attach(trainer, "loss") + metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1), output_transform=lambda x: (x[0][0], x[1][0])), + "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))} + metrics.update({"average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], config), + "average_accuracy": MetricsLambda(average_distributed_scalar, metrics["accuracy"], config)}) + metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"]) + for name, metric in metrics.items(): + metric.attach(evaluator, name) + + # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train + if config.local_rank in [-1, 0]: + pbar = ProgressBar(persist=True) + pbar.attach(trainer, metric_names=["loss"]) + evaluator.add_event_handler(Events.COMPLETED, lambda _: pbar.log_message("Validation: %s" % 
pformat(evaluator.state.metrics))) + + tb_logger = TensorboardLogger(log_dir=config.log_dir) + tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED) + tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED) + tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()), another_engine=trainer), event_name=Events.EPOCH_COMPLETED) + + checkpoint_handler = ModelCheckpoint(tb_logger.writer.log_dir, 'checkpoint', save_interval=1, n_saved=3) + trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': getattr(model, 'module', model)}) # "getattr" take care of distributed encapsulation + + torch.save(config, tb_logger.writer.log_dir + '/model_training_args.bin') + getattr(model, 'module', model).config.to_json_file(os.path.join(tb_logger.writer.log_dir, CONFIG_NAME)) + tokenizer.save_vocabulary(tb_logger.writer.log_dir) + + # Run the training + trainer.run(train_loader, max_epochs=config.n_epochs) + + # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method) + if config.local_rank in [-1, 0] and config.n_epochs > 0: + os.rename(checkpoint_handler._saved[-1][1][-1], os.path.join(tb_logger.writer.log_dir, WEIGHTS_NAME)) # TODO: PR in ignite to have better access to saved file paths (cleaner) + tb_logger.close() + +if __name__ == "__main__": + train() diff --git a/train_multihead.py b/train_multihead.py new file mode 100644 index 0000000..950878a --- /dev/null +++ b/train_multihead.py @@ -0,0 +1,297 @@ +# Copyright (c) 2019-present, HuggingFace Inc. +# All rights reserved. This source code is licensed under the BSD-style license found in the LICENSE file in the root directory of this source tree. +import os +import math +import logging +from pprint import pformat +from argparse import ArgumentParser +from collections import defaultdict +from itertools import chain + +import torch +from torch.nn.parallel import DistributedDataParallel +from torch.utils.data import DataLoader, TensorDataset +from ignite.engine import Engine, Events +from ignite.handlers import ModelCheckpoint +from ignite.metrics import Accuracy, Loss, MetricsLambda, RunningAverage +from ignite.contrib.handlers import ProgressBar, PiecewiseLinear +from config import Config +from ignite.contrib.handlers.tensorboard_logger import TensorboardLogger, OutputHandler, OptimizerParamsHandler +from pytorch_pretrained_bert import (OpenAIAdam, OpenAIGPTMultiHeadModel, OpenAIGPTTokenizer, + GPT2DoubleHeadsModel, GPT2Tokenizer, WEIGHTS_NAME, CONFIG_NAME, + BertModel, BertTokenizer) + +from utils import get_dataset, get_dataset_for_daily_dialog + +SPECIAL_TOKENS = ["", "", "", "", + + "", "", "", "", "", "", "", + + "", "", "", "", "", + "", "", "", "", "", + + "", "", "", "", + ""] +MODEL_INPUTS = ["input_ids", "ec_token_ids", "sc_token_ids", "lm_labels", "ec_labels", "sc_labels", + "token_type_ids", "token_emotion_ids", "token_action_ids"] +PADDED_INPUTS = ["input_ids", "lm_labels", "token_type_ids", "token_emotion_ids", "token_action_ids"] + +logger = logging.getLogger(__file__) + +def average_distributed_scalar(scalar, config): + """ Average a scalar over the nodes if we are in distributed training. We use this for distributed evaluation. 
""" + if config.local_rank == -1: + return scalar + scalar_t = torch.tensor(scalar, dtype=torch.float, device=config.device) / torch.distributed.get_world_size() + torch.distributed.all_reduce(scalar_t, op=torch.distributed.ReduceOp.SUM) + return scalar_t.item() + + +def pad_dataset(dataset, padding=0): + """ Pad the dataset. This could be optimized by defining a Dataset class and padd only batches but this is simpler. """ + max_l = max(len(x) for x in dataset["input_ids"]) + for name in PADDED_INPUTS: + dataset[name] = [x + [padding if name != "lm_labels" else -1] * (max_l - len(x)) for x in dataset[name]] + return dataset + + +def get_emotion_label(tokenizer, candidate_emotion): + no_emotion_id, happiness_id, surprise_id, sadness_id, disgust_id, anger_id, fear_id = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[4:11]) + + if candidate_emotion == no_emotion_id: + return 0 + elif candidate_emotion == happiness_id: + return 1 + elif candidate_emotion == surprise_id: + return 2 + elif candidate_emotion == sadness_id: + return 3 + elif candidate_emotion == disgust_id: + return 4 + elif candidate_emotion == anger_id: + return 5 + elif candidate_emotion == fear_id: + return 6 + +def build_input_from_segments(topic, history, emotions, actions, reply, candidate_emotion, canidate_act, tokenizer, lm_labels=False, with_eos=True): + """ Build a sequence of input from 3 segments: persona, history and last reply """ + bos, eos, speaker1, speaker2, no_emotion = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[:5]) + + inform = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-4]) + emotions = [no_emotion] + emotions + actions = [inform] + actions + + instance = {} + #sequence = [[bos] + history[0] + list(chain(*history[1:]))] + [reply + ([eos] if with_eos else [])] #seq = [personas, history, reply] concatenate all persona sentences + sequence = [[bos] + [topic]] + history + [reply + ([eos] if with_eos else [])] + sequence = [[speaker2 if (len(sequence)-i) % 2 else speaker1] + s for i, s in enumerate(sequence)] + + + instance["input_ids"] = list(chain(*sequence)) + instance["token_type_ids"] = [speaker2 if i % 2 else speaker1 for i, s in enumerate(sequence) for _ in s] # the last for is for repeating the speaker1 and speaker2 for all tokens + instance["token_emotion_ids"] = [emotions[i] for i, s in enumerate(sequence[:-1]) for _ in s] + [candidate_emotion]*len(sequence[-1]) + instance["token_action_ids"] = [actions[i] for i, s in enumerate(sequence[:-1]) for _ in s] + [canidate_act]*len(sequence[-1]) + + instance["ec_token_ids"] = len(instance["input_ids"]) - 1 + instance["sc_token_ids"] = len(instance["input_ids"]) - 2 + instance["ec_labels"] = -1 + instance["lm_labels"] = [-1] * len(instance["input_ids"]) + if lm_labels: + instance["lm_labels"] = ([-1] * sum(len(s) for s in sequence[:-1])) + [-1] + sequence[-1][1:] #all -1 except for reply, reply is just the ids + instance["ec_labels"] = get_emotion_label(tokenizer, candidate_emotion) + return instance, sequence + + +def get_data_loaders(config, tokenizer): + """ Prepare the dataset for training and evaluation """ + personachat = get_dataset_for_daily_dialog(tokenizer, config.dataset_path, config.dataset_cache, SPECIAL_TOKENS) + + + logger.info("Build inputs and labels") + datasets = {"train": defaultdict(list), "valid": defaultdict(list)} + gpu_max_length = 310 + for dataset_name, dataset in personachat.items(): + num_candidates = len(dataset[0]["utterances"][0]["candidates"]) + if config.num_candidates > 0 and dataset_name == 'train': + num_candidates = 
+            num_candidates = min(config.num_candidates, num_candidates)
+        for dialog in dataset:
+            topic = dialog["topic"]
+            for utterance in dialog["utterances"]:
+                history = utterance["history"][-(2 * config.max_history + 1):]
+                emotions = utterance["emotion"][-(2 * config.max_history + 1):]
+                actions = utterance["act"][-(2 * config.max_history + 1):]
+                for j, candidate in enumerate(utterance["candidates"][-num_candidates:]):
+                    lm_labels = bool(j == num_candidates - 1)  # the true label is always the last one in the list of candidates
+                    candidate_emotion = utterance['candidates_emotions'][j]
+                    candidate_act = utterance['candidates_acts'][j]
+                    instance, _ = build_input_from_segments(topic, history, emotions, actions, candidate,
+                                                            candidate_emotion, candidate_act, tokenizer, lm_labels)
+
+                    if len(instance["input_ids"]) > gpu_max_length:
+                        truncated_history = [hist[:10] for hist in history]
+                        truncated_candidate = candidate[:10]
+                        instance, _ = build_input_from_segments(topic, truncated_history, emotions, actions, truncated_candidate,
+                                                                candidate_emotion, candidate_act, tokenizer, lm_labels)
+
+
+                    for input_name, input_array in instance.items():
+                        datasets[dataset_name][input_name].append(input_array)
+
+                datasets[dataset_name]["sc_labels"].append(num_candidates - 1)
+        datasets[dataset_name]["n_candidates"] = num_candidates
+
+    logger.info("Pad inputs and convert to Tensor")
+    tensor_datasets = {"train": [], "valid": []}
+    for dataset_name, dataset in datasets.items():
+        dataset = pad_dataset(dataset, padding=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1]))
+        for input_name in MODEL_INPUTS:
+            tensor = torch.tensor(dataset[input_name])
+            if input_name != "sc_labels":
+                tensor = tensor.view((-1, datasets[dataset_name]["n_candidates"]) + tensor.shape[1:])
+            tensor_datasets[dataset_name].append(tensor)
+
+    logger.info("Build train and validation dataloaders")
+    train_dataset, valid_dataset = TensorDataset(*tensor_datasets["train"]), TensorDataset(*tensor_datasets["valid"])
+    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) if config.distributed else None
+    valid_sampler = torch.utils.data.distributed.DistributedSampler(valid_dataset) if config.distributed else None
+    train_loader = DataLoader(train_dataset, sampler=train_sampler, batch_size=config.train_batch_size, shuffle=False)
+    valid_loader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=config.valid_batch_size, shuffle=False)
+
+    logger.info("Train dataset (Batch, Candidates, Seq length): {}".format(train_dataset.tensors[0].shape))
+    logger.info("Valid dataset (Batch, Candidates, Seq length): {}".format(valid_dataset.tensors[0].shape))
+    return train_loader, valid_loader, train_sampler, valid_sampler
+
+
+def train():
+    config_file = "configs/train_multihead_config.json"
+    config = Config.from_json_file(config_file)
+
+    ec_coef = 1
+    sc_coef = 1
+
+    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. logger.info => log main process only, logger.warning => log all processes
+    logging.basicConfig(level=logging.INFO if config.local_rank in [-1, 0] else logging.WARN)
+    logger.warning("Running process %d", config.local_rank)  # This is a logger.warning: it will be printed by all distributed processes
+    logger.info("Arguments: %s", pformat(config))
+
+    # Initialize distributed training if needed
+    config.distributed = (config.local_rank != -1)
+    if config.distributed:
+        torch.cuda.set_device(config.local_rank)
+        config.device = torch.device("cuda", config.local_rank)
+        torch.distributed.init_process_group(backend='nccl', init_method='env://')
+
+    logger.info("Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning")
+    tokenizer_class = OpenAIGPTTokenizer
+    tokenizer = tokenizer_class.from_pretrained(config.model_checkpoint)
+    model_class = OpenAIGPTMultiHeadModel
+    model = model_class.from_pretrained(config.model_checkpoint)
+    tokenizer.set_special_tokens(SPECIAL_TOKENS)
+    model.set_num_special_tokens(len(SPECIAL_TOKENS))
+    model.to(config.device)
+    optimizer = OpenAIAdam(model.parameters(), lr=config.lr)
+
+    # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
+    if config.fp16:
+        from apex import amp  # Apex is only required if we use fp16 training
+        model, optimizer = amp.initialize(model, optimizer, opt_level=config.fp16)
+    if config.distributed:
+        model = DistributedDataParallel(model, device_ids=[config.local_rank], output_device=config.local_rank)
+
+    logger.info("Prepare datasets")
+    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(config, tokenizer)
+
+    # Training function and trainer
+    def update(engine, batch):
+        model.train()
+        #input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids, token_emotion_ids, token_action_ids = tuple(input_tensor.to(config.device) for input_tensor in batch)
+        input_ids, ec_token_ids, sc_token_ids, lm_labels, ec_labels, sc_labels, token_type_ids, token_emotion_ids, token_action_ids = tuple(input_tensor.to(config.device) for input_tensor in batch)
+
+        lm_loss, emotion_loss, sentence_loss = model(input_ids, ec_token_ids, sc_token_ids,
+                                                     lm_labels, ec_labels, sc_labels, token_type_ids,
+                                                     token_emotion_ids, token_action_ids)
+        loss = (lm_loss * config.lm_coef + emotion_loss * ec_coef + sentence_loss * sc_coef) / config.gradient_accumulation_steps
+        if config.fp16:
+            with amp.scale_loss(loss, optimizer) as scaled_loss:
+                scaled_loss.backward()
+            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), config.max_norm)
+        else:
+            loss.backward()
+            torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_norm)
+        if engine.state.iteration % config.gradient_accumulation_steps == 0:
+            optimizer.step()
+            optimizer.zero_grad()
+        return loss.item()
+    trainer = Engine(update)
+
+    # Evaluation function and evaluator (evaluator output is the input of the metrics)
+    def inference(engine, batch):
+        model.eval()
+        with torch.no_grad():
+            batch = tuple(input_tensor.to(config.device) for input_tensor in batch)
+            input_ids, ec_token_ids, sc_token_ids, lm_labels, ec_labels, \
+                sc_labels, token_type_ids, token_emotion_ids, token_action_ids = batch
+            #logger.info(tokenizer.decode(input_ids[0, -1, :].tolist()))
+            model_outputs = model(input_ids, ec_token_ids, sc_token_ids, token_type_ids=token_type_ids,
+                                  token_emotion_ids=token_emotion_ids,
+                                  token_action_ids=token_action_ids)
+            lm_logits, mc_logits = model_outputs[0], model_outputs[2]  # So we can also use GPT2 outputs
+            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1))
+            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
+            return (lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, sc_labels)
+    evaluator = Engine(inference)
+
+    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
+    trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader))
+    if config.n_epochs < 1:
+        trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader))
+    if config.eval_before_start:
+        trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader))
+
+    # Make sure distributed data samplers split the dataset nicely between the distributed processes
+    if config.distributed:
+        trainer.add_event_handler(Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch))
+        evaluator.add_event_handler(Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch))
+
+    # Linearly decrease the learning rate from lr to zero
+    scheduler = PiecewiseLinear(optimizer, "lr", [(0, config.lr), (config.n_epochs * len(train_loader), 0.0)])
+    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)
+
+    # Prepare metrics - note how we compute distributed metrics
+    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
+    metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1), output_transform=lambda x: (x[0][0], x[1][0])),
+               "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))}
+    metrics.update({"average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], config),
+                    "average_accuracy": MetricsLambda(average_distributed_scalar, metrics["accuracy"], config)})
+    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
+    for name, metric in metrics.items():
+        metric.attach(evaluator, name)
+
+    # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train
+    if config.local_rank in [-1, 0]:
+        pbar = ProgressBar(persist=True)
+        pbar.attach(trainer, metric_names=["loss"])
+        evaluator.add_event_handler(Events.COMPLETED, lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))
+
+        tb_logger = TensorboardLogger(log_dir=config.log_dir)
+        tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED)
+        tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED)
+        tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()), another_engine=trainer), event_name=Events.EPOCH_COMPLETED)
+
+        checkpoint_handler = ModelCheckpoint(tb_logger.writer.log_dir, 'checkpoint', save_interval=1, n_saved=3)
+        trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': getattr(model, 'module', model)})  # "getattr" takes care of distributed encapsulation
+
+        torch.save(config, tb_logger.writer.log_dir + '/model_training_args.bin')
+        getattr(model, 'module', model).config.to_json_file(os.path.join(tb_logger.writer.log_dir, CONFIG_NAME))
+        tokenizer.save_vocabulary(tb_logger.writer.log_dir)
+
+    # Run the training
+    trainer.run(train_loader, max_epochs=config.n_epochs)
+
+    # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method)
+    if config.local_rank in [-1, 0] and config.n_epochs > 0:
+        os.rename(checkpoint_handler._saved[-1][1][-1], os.path.join(tb_logger.writer.log_dir, WEIGHTS_NAME))  # TODO: PR in ignite to have better access to saved file paths (cleaner)
+        tb_logger.close()
+
+if __name__ == "__main__":
+    train()
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000..c4dd9f8
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,225 @@
+# Copyright (c) 2019-present, HuggingFace Inc.
+# All rights reserved. This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import json
+import logging
+import os
+import tarfile
+import tempfile
+import re
+
+import torch
+
+from pytorch_pretrained_bert import cached_path
+from collections import Counter
+
+try:
+    from nltk.translate import bleu_score as nltkbleu
+except ImportError:
+    # User doesn't have nltk installed, so we can't use it for bleu
+    # We'll just turn off things, but we might want to warn the user
+    nltkbleu = None
+
+
+PERSONACHAT_URL = "https://s3.amazonaws.com/datasets.huggingface.co/personachat/personachat_self_original.json"
+HF_FINETUNED_MODEL = "https://s3.amazonaws.com/models.huggingface.co/transfer-learning-chatbot/finetuned_chatbot_gpt.tar.gz"
+
+logger = logging.getLogger(__file__)
+
+
+re_art = re.compile(r'\b(a|an|the)\b')
+re_punc = re.compile(r'[!"#$%&()*+,-./:;<=>?@\[\]\\^`{|}~_\']')
+
+
+def normalize_answer(s):
+    """Lower text and remove punctuation, articles and extra whitespace."""
+    def remove_articles(text):
+        return re_art.sub(' ', text)
+
+    def white_space_fix(text):
+        return ' '.join(text.split())
+
+    def remove_punc(text):
+        return re_punc.sub(' ', text)  # convert punctuation to spaces
+
+    def lower(text):
+        return text.lower()
+
+    return white_space_fix(remove_articles(remove_punc(lower(s))))
+
+
+def download_pretrained_model():
+    """ Download and extract finetuned model from S3 """
+    resolved_archive_file = cached_path(HF_FINETUNED_MODEL)
+    tempdir = tempfile.mkdtemp()
+
+    logger.info("extracting archive file {} to temp dir {}".format(resolved_archive_file, tempdir))
+    with tarfile.open(resolved_archive_file, 'r:gz') as archive:
+        archive.extractall(tempdir)
+    return tempdir
+
+
+def get_dataset(tokenizer, dataset_path, dataset_cache=None):
+    """ Get PERSONACHAT from S3 """
+    dataset_path = dataset_path or PERSONACHAT_URL
+    dataset_cache = dataset_cache + '_' + type(tokenizer).__name__  # To avoid using GPT cache for GPT-2 and vice-versa
+    if dataset_cache and os.path.isfile(dataset_cache):
+        logger.info("Load tokenized dataset from cache at %s", dataset_cache)
+        dataset = torch.load(dataset_cache)
+    else:
+        logger.info("Download dataset from %s", dataset_path)
+        personachat_file = cached_path(dataset_path)
+        with open(personachat_file, "r", encoding="utf-8") as f:
+            dataset = json.loads(f.read())
+
+        logger.info("Tokenize and encode the dataset")
+        def tokenize(obj):
+            if isinstance(obj, str):
+                return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
+            if isinstance(obj, dict):
+                return dict((n, tokenize(o)) for n, o in obj.items())
+            return list(tokenize(o) for o in obj)
+        dataset = tokenize(dataset)
+        if dataset_cache:
+            torch.save(dataset, dataset_cache)
+    return dataset
+
+
+def get_dataset_for_daily_dialog(tokenizer, dataset_path, dataset_cache=None, special_tokens=None):
+    """ Get the DailyDialog-style dataset; special tokens are mapped directly to single token ids """
+    dataset_path = dataset_path or PERSONACHAT_URL
+    dataset_cache = dataset_cache + '_' + type(tokenizer).__name__  # To avoid using GPT cache for GPT-2 and vice-versa
+    if dataset_cache and os.path.isfile(dataset_cache):
+        logger.info("Load tokenized dataset from cache at %s", dataset_cache)
+        dataset = torch.load(dataset_cache)
+    else:
+        logger.info("Download dataset from %s", dataset_path)
+        personachat_file = cached_path(dataset_path)
+        with open(personachat_file, "r", encoding="utf-8") as f:
+            dataset = json.loads(f.read())
+
+        logger.info("Tokenize and encode the dataset")
+        def tokenize(obj):
+            if isinstance(obj, str):
+                if obj in special_tokens:
+                    return tokenizer.convert_tokens_to_ids(obj)
+                else:
+                    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
+            if isinstance(obj, dict):
+                return dict((n, tokenize(o)) for n, o in obj.items())
+            return list(tokenize(o) for o in obj)
+        dataset = tokenize(dataset)
+        if dataset_cache:
+            torch.save(dataset, dataset_cache)
+    return dataset
+
+
+def get_dataset_personalities(tokenizer, dataset_path, dataset_cache=None):
+    """ Get personalities from PERSONACHAT """
+    dataset_path = dataset_path or PERSONACHAT_URL
+    dataset_cache = dataset_cache + '_' + type(tokenizer).__name__  # To avoid using GPT cache for GPT-2 and vice-versa
+    if os.path.isfile(dataset_cache):
+        logger.info("Load tokenized dataset from cache at %s", dataset_cache)
+        personachat = torch.load(dataset_cache)
+    else:
+        logger.info("Download PERSONACHAT dataset from %s", dataset_path)
+        personachat_file = cached_path(dataset_path)
+        with open(personachat_file, "r", encoding="utf-8") as f:
+            personachat = json.loads(f.read())
+
+        logger.info("Tokenize and encode the dataset")
+        def tokenize(obj):
+            if isinstance(obj, str):
+                return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
+            if isinstance(obj, dict):
+                return dict((n, tokenize(o)) for n, o in obj.items())
+            return list(tokenize(o) for o in obj)
+        personachat = tokenize(personachat)
+        torch.save(personachat, dataset_cache)
+
+    logger.info("Filter personalities")
+    personalities = []
+    for dataset in personachat.values():
+        for dialog in dataset:
+            personalities.append(dialog["personality"])
+
+    logger.info("Gathered {} personalities".format(len(personalities)))
+    return personalities
+
+
+def _prec_recall_f1_score(pred_items, gold_items):
+    """
+    Compute precision, recall and f1 given a set of gold and prediction items.
+
+    :param pred_items: iterable of predicted values
+    :param gold_items: iterable of gold values
+
+    :return: tuple (p, r, f1) for precision, recall, f1
+    """
+    common = Counter(gold_items) & Counter(pred_items)
+    num_same = sum(common.values())
+    if num_same == 0:
+        return 0, 0, 0
+    precision = 1.0 * num_same / len(pred_items)
+    recall = 1.0 * num_same / len(gold_items)
+    f1 = (2 * precision * recall) / (precision + recall)
+    return precision, recall, f1
+
+
+def _f1_score(guess, answers):
+    """Return the max F1 score between the guess and *any* answer."""
+    if guess is None or answers is None:
+        return 0
+    g_tokens = normalize_answer(guess).split()
+    scores = [
+        _prec_recall_f1_score(g_tokens, normalize_answer(a).split()) for a in answers
+    ]
+    return max(f1 for p, r, f1 in scores)
+
+
+def _bleu(guess, answers, method=None):
+    """Compute approximate BLEU score between guess and a set of answers."""
+    if nltkbleu is None:
+        # bleu library not installed, just return a default value
+        return None
+    # Warning: BLEU calculation *should* include proper tokenization and
+    # punctuation etc. We're using the normalize_answer for everything though,
+    # so we're over-estimating our BLEU scores. Also note that NLTK's bleu is
+    # going to be slower than fairseq's (which is written in C), but fairseq's
+    # requires that everything be in arrays of ints (i.e. as tensors). NLTK's
+    # works with strings, which is better suited for this module.
+    # Pick the requested smoothing method; anything unrecognized (including None) falls back to method3.
+    smoother = nltkbleu.SmoothingFunction(epsilon=1e-12)
+    valid_methods = {"method0", "method1", "method2", "method3", "method4", "method5", "method6", "method7"}
+    smoothing_func = getattr(smoother, method if method in valid_methods else "method3")
+
+    return nltkbleu.sentence_bleu(
+        [normalize_answer(a).split(" ") for a in answers],
+        normalize_answer(guess).split(" "),
+        smoothing_function=smoothing_func,
+    )
+
+
+class AttrDict(dict):
+    """ Dictionary whose keys can also be read and written as attributes. """
+    def __init__(self, *args, **kwargs):
+        super(AttrDict, self).__init__(*args, **kwargs)
+        self.__dict__ = self
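+
+
+# Illustrative usage sketch (not referenced anywhere else in this patch): the metric
+# helpers above take a raw hypothesis string plus a list of reference strings, and
+# AttrDict simply mirrors dict keys as attributes. Running this module directly
+# sanity-checks them; nltk is optional, so _bleu may return None when it is missing.
+if __name__ == "__main__":
+    _hyp = "i am going to watch a movie tonight"
+    _refs = ["I am going to watch a movie tonight.", "Tonight I will watch a film."]
+    print("F1  :", _f1_score(_hyp, _refs))
+    print("BLEU:", _bleu(_hyp, _refs, method="method3"))
+    _cfg = AttrDict({"lr": 6.25e-5, "train_batch_size": 4})
+    print("AttrDict:", _cfg.lr, _cfg["train_batch_size"])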