diff --git a/.gitignore b/.gitignore
index 0b69c9ad..1c7a4d84 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,6 @@
+# Wandb
+wandb/
+
 # Outputs from examples
 **/cache_dir
 **/runs
diff --git a/CHANGELOG.md b/CHANGELOG.md
index be80b234..2a5912fa 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,23 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.18.9] - 2020-01-20
+### Fixed
+- Fixed a bug with importing certain pre-trained models in `MultiLabelClassificationModel`.
+
+## [0.18.8] - 2020-01-20
+### Added
+- Added `**kwargs` to the init methods of `ClassificationModel`, `MultiLabelClassificationModel`, `QuestionAnsweringModel`, and `NERModel`. These will be passed to the `from_pretrained()` method of the underlying model class.
+
+## [0.18.6] - 2020-01-18
+### Changed
+- Reverted the change made in 0.18.4 (the model checkpoint was no longer saved at the end of the last epoch, since it is identical to the model saved in `output_dir` at the end of training).
+  Model checkpoints are now saved for all epochs again.
+
+## [0.18.5] - 2020-01-18
+### Fixed
+- Fixed a bug when using `sliding_window`.
+
 ## [0.18.4] - 2020-01-17
 ### Fixed
 - Typo in `classification_utils.py`.
@@ -179,7 +196,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - This CHANGELOG file to hopefully serve as an evolving example of a standardized open source project CHANGELOG.
 
-[0.18.3]: https://github.com/ThilinaRajapakse/simpletransformers/compare/0aa88e4...HEAD
+[0.18.9]: https://github.com/ThilinaRajapakse/simpletransformers/compare/8ade0f4...HEAD
+
+[0.18.8]: https://github.com/ThilinaRajapakse/simpletransformers/compare/44afa70...8ade0f4
+
+[0.18.6]: https://github.com/ThilinaRajapakse/simpletransformers/compare/aa7f650...44afa70
+
+[0.18.5]: https://github.com/ThilinaRajapakse/simpletransformers/compare/ebef6c4...aa7f650
+
+[0.18.4]: https://github.com/ThilinaRajapakse/simpletransformers/compare/0aa88e4...ebef6c4
 
 [0.18.3]: https://github.com/ThilinaRajapakse/simpletransformers/compare/52a488e...0aa88e4
diff --git a/setup.py b/setup.py
index cab9b40d..b6bee198 100755
--- a/setup.py
+++ b/setup.py
@@ -6,7 +6,7 @@
 setup(
     name="simpletransformers",
-    version="0.18.4",
+    version="0.18.9",
     author="Thilina Rajapakse",
     author_email="chaturangarajapakshe@gmail.com",
     description="An easy-to-use wrapper library for the Transformers library.",
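Taken together with the per-model diffs below, the 0.18.8 entry means any option accepted by Hugging Face's `from_pretrained()` can now be supplied at model construction time. A minimal usage sketch, assuming simpletransformers >= 0.18.8; the checkpoint name is a real Hugging Face model, while the `cache_dir` path and proxy URL are illustrative placeholders:

```python
from simpletransformers.classification import ClassificationModel

# Extra keyword arguments are forwarded verbatim to config_class.from_pretrained(),
# model_class.from_pretrained(), and tokenizer_class.from_pretrained().
model = ClassificationModel(
    "bert",
    "bert-base-cased",
    use_cuda=False,                                  # CPU-only for this sketch
    cache_dir="/tmp/transformers_cache",             # illustrative path
    proxies={"https": "http://proxy.example:3128"},  # illustrative proxy
)
```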
diff --git a/simpletransformers/classification/classification_model.py b/simpletransformers/classification/classification_model.py
index 9067cf70..937f279c 100755
--- a/simpletransformers/classification/classification_model.py
+++ b/simpletransformers/classification/classification_model.py
@@ -86,6 +86,7 @@
 
 
 class ClassificationModel:
+
     def __init__(
         self,
         model_type,
@@ -95,7 +96,9 @@ def __init__(
         args=None,
         use_cuda=True,
         cuda_device=-1,
+        **kwargs,
     ):
+
         """
         Initializes a ClassificationModel model.
@@ -107,7 +110,9 @@ def __init__(
             args (optional): Default args will be used if this parameter is not provided. If provided, it should be a dict containing the args that should be changed in the default args.
             use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only.
             cuda_device (optional): Specific GPU that should be used. Will use the first available GPU by default.
-        """ # noqa: ignore flake8"
+            **kwargs (optional): Additional options (e.g. proxies, force_download, resume_download, cache_dir) passed through to the underlying 'from_pretrained' calls.
+        """ # noqa: ignore flake8"
+
 
         MODEL_CLASSES = {
             "bert": (BertConfig, BertForSequenceClassification, BertTokenizer),
@@ -138,12 +143,10 @@ def __init__(
         config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]
 
         if num_labels:
-            self.config = config_class.from_pretrained(
-                model_name, num_labels=num_labels
-            )
+            self.config = config_class.from_pretrained(model_name, num_labels=num_labels, **kwargs)
             self.num_labels = num_labels
         else:
-            self.config = config_class.from_pretrained(model_name)
+            self.config = config_class.from_pretrained(model_name, **kwargs)
             self.num_labels = self.config.num_labels
         self.weight = weight
@@ -162,13 +165,10 @@ def __init__(
             self.device = "cpu"
 
         if self.weight:
-            self.model = model_class.from_pretrained(
-                model_name,
-                config=self.config,
-                weight=torch.Tensor(self.weight).to(self.device),
-            )
+
+            self.model = model_class.from_pretrained(model_name, config=self.config, weight=torch.Tensor(self.weight).to(self.device), **kwargs)
         else:
-            self.model = model_class.from_pretrained(model_name, config=self.config)
+            self.model = model_class.from_pretrained(model_name, config=self.config, **kwargs)
 
         self.results = {}
@@ -187,9 +187,15 @@ def __init__(
         if args:
             self.args.update(args)
 
-        self.tokenizer = tokenizer_class.from_pretrained(
-            model_name, do_lower_case=self.args["do_lower_case"]
-        )
+
+        self.tokenizer = tokenizer_class.from_pretrained(model_name, do_lower_case=self.args['do_lower_case'], **kwargs)
+
+        self.args['model_name'] = model_name
+        self.args['model_type'] = model_type
+
+        if model_type in ['camembert', 'xlmroberta']:
+            warnings.warn(f"use_multiprocessing automatically disabled as {model_type} fails when using multiprocessing for feature conversion.")
+            self.args['use_multiprocessing'] = False
 
         self.args["model_name"] = model_name
         self.args["model_type"] = model_type
@@ -576,10 +582,8 @@ def train(
         ) and not os.path.exists(output_dir_current):
             os.makedirs(output_dir_current)
 
-        if (
-            args["save_model_every_epoch"]
-            and epoch_number != args["num_train_epochs"]
-        ):
+        if args['save_model_every_epoch']:
+
             model_to_save = model.module if hasattr(model, "module") else model
             model_to_save.save_pretrained(output_dir_current)
             self.tokenizer.save_pretrained(output_dir_current)
@@ -742,7 +746,7 @@ def evaluate(self, eval_df, output_dir, multi_label=False, prefix="", **kwargs):
 
             model_outputs = preds
 
-            preds = [np.argmax(pred, axis=1)[0] for pred in preds]
+            preds = [np.argmax(pred, axis=1) for pred in preds]
             final_preds = []
             for pred_row in preds:
                 mode_pred, counts = mode(pred_row)
@@ -1015,7 +1019,7 @@ def predict(self, to_predict, multi_label=False):
 
             model_outputs = preds
 
-            preds = [np.argmax(pred, axis=1)[0] for pred in preds]
+            preds = [np.argmax(pred, axis=1) for pred in preds]
             final_preds = []
             for pred_row in preds:
                 mode_pred, counts = mode(pred_row)
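The matching one-line changes in `evaluate()` and `predict()` are the `sliding_window` fix recorded under 0.18.5: with the trailing `[0]`, only the first window's prediction survived for each example, so the majority vote that follows had nothing to vote over. A self-contained sketch of the difference, using made-up logits for one example split into three windows (the array layout is illustrative, not the library's internal representation):

```python
import numpy as np
from scipy.stats import mode

# Hypothetical logits for one example split into 3 sliding windows, 2 classes.
window_logits = np.array([[0.1, 0.9], [0.8, 0.2], [0.2, 0.8]])

buggy = np.argmax(window_logits, axis=1)[0]  # kept only the first window: 1
fixed = np.argmax(window_logits, axis=1)     # every window: [1 0 1]

# The mode in evaluate()/predict() now sees all windows for the example.
mode_pred, counts = mode(fixed)
print(buggy, fixed, mode_pred)
```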
diff --git a/simpletransformers/classification/multi_label_classification_model.py b/simpletransformers/classification/multi_label_classification_model.py
index f96e7b13..8e508fb0 100755
--- a/simpletransformers/classification/multi_label_classification_model.py
+++ b/simpletransformers/classification/multi_label_classification_model.py
@@ -31,15 +31,9 @@
 
 
 class MultiLabelClassificationModel(ClassificationModel):
-    def __init__(
-        self,
-        model_type,
-        model_name,
-        num_labels=None,
-        pos_weight=None,
-        args=None,
-        use_cuda=True,
-    ):
+
+    def __init__(self, model_type, model_name, num_labels=None, pos_weight=None, args=None, use_cuda=True, **kwargs):
+
         """
         Initializes a MultiLabelClassification model.
@@ -50,7 +44,9 @@ def __init__(
             pos_weight (optional): A list of length num_labels containing the weights to assign to each label for loss calculation.
             args (optional): Default args will be used if this parameter is not provided. If provided, it should be a dict containing the args that should be changed in the default args.
             use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only.
-        """ # noqa: ignore flake8"
+            **kwargs (optional): Additional options (e.g. proxies, force_download, resume_download, cache_dir) passed through to the underlying 'from_pretrained' calls.
+        """ # noqa: ignore flake8"
+
 
         MODEL_CLASSES = {
             "bert": (
                 BertConfig,
@@ -82,12 +78,12 @@ def __init__(
         config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]
 
         if num_labels:
-            self.config = config_class.from_pretrained(
-                model_name, num_labels=num_labels
-            )
+
+            self.config = config_class.from_pretrained(model_name, num_labels=num_labels, **kwargs)
+
             self.num_labels = num_labels
         else:
-            self.config = config_class.from_pretrained(model_name)
+            self.config = config_class.from_pretrained(model_name, **kwargs)
             self.num_labels = self.config.num_labels
         self.pos_weight = pos_weight
@@ -103,13 +99,9 @@ def __init__(
             self.device = "cpu"
 
         if self.pos_weight:
-            self.model = model_class.from_pretrained(
-                model_name,
-                config=self.config,
-                pos_weight=torch.Tensor(self.pos_weight).to(self.device),
-            )
+            self.model = model_class.from_pretrained(model_name, config=self.config, pos_weight=torch.Tensor(self.pos_weight).to(self.device), **kwargs)
         else:
-            self.model = model_class.from_pretrained(model_name, config=self.config)
+            self.model = model_class.from_pretrained(model_name, config=self.config, **kwargs)
 
         self.results = {}
@@ -128,9 +120,9 @@ def __init__(
         if args:
             self.args.update(args)
 
-        self.tokenizer = tokenizer_class.from_pretrained(
-            model_name, do_lower_case=self.args["do_lower_case"]
-        )
+
+        self.tokenizer = tokenizer_class.from_pretrained(model_name, do_lower_case=self.args['do_lower_case'], **kwargs)
+
         self.args["model_name"] = model_name
         self.args["model_type"] = model_type
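For context on `pos_weight`: the constructor forwards it into the custom model classes, where per-label weights scale the positive term of the multi-label loss. The snippet below illustrates that mechanism with `torch.nn.BCEWithLogitsLoss` directly; it mirrors standard PyTorch behaviour under the assumption that the custom heads use this loss, and the weights and batch are made up:

```python
import torch

num_labels = 3
# Hypothetical weights: count positives for label 0 three times as heavily,
# e.g. because that label is rare in the training data.
pos_weight = torch.tensor([3.0, 1.0, 1.0])
loss_fct = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)

logits = torch.randn(4, num_labels)                    # batch of 4 examples
labels = torch.randint(0, 2, (4, num_labels)).float()  # multi-hot targets
print(loss_fct(logits, labels).item())
```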
diff --git a/simpletransformers/custom_models/models.py b/simpletransformers/custom_models/models.py
index 16cf468e..a1fce45f 100755
--- a/simpletransformers/custom_models/models.py
+++ b/simpletransformers/custom_models/models.py
@@ -16,20 +16,8 @@
     AlbertModel,
 )
 
-from torch.nn import BCEWithLogitsLoss
-
-ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    "roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-pytorch_model.bin",
-    "roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-pytorch_model.bin",
-    "roberta-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-pytorch_model.bin",
-    "distilroberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-pytorch_model.bin",
-}
-
-DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    "distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-pytorch_model.bin",  # noqa: ignore flake8"
-    "distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-pytorch_model.bin",  # noqa: ignore flake8"
-}
-
+from transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
+from transformers.modeling_distilbert import DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
 
 
 class BertForMultiLabelSequenceClassification(BertPreTrainedModel):
     """
diff --git a/simpletransformers/ner/ner_model.py b/simpletransformers/ner/ner_model.py
index 056d2a9b..62d92a18 100755
--- a/simpletransformers/ner/ner_model.py
+++ b/simpletransformers/ner/ner_model.py
@@ -56,15 +56,7 @@
 
 
 class NERModel:
-    def __init__(
-        self,
-        model_type,
-        model_name,
-        labels=None,
-        args=None,
-        use_cuda=True,
-        cuda_device=-1,
-    ):
+    def __init__(self, model_type, model_name, labels=None, args=None, use_cuda=True, cuda_device=-1, **kwargs):
         """
         Initializes a NERModel
@@ -115,7 +107,7 @@ def __init__(
 
         config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]
 
-        self.model = model_class.from_pretrained(model_name, num_labels=self.num_labels)
+        self.model = model_class.from_pretrained(model_name, num_labels=self.num_labels, **kwargs)
 
         if use_cuda:
             if torch.cuda.is_available():
@@ -143,9 +135,9 @@ def __init__(
         if args:
             self.args.update(args)
 
-        self.tokenizer = tokenizer_class.from_pretrained(
-            model_name, do_lower_case=self.args["do_lower_case"]
-        )
+
+        self.tokenizer = tokenizer_class.from_pretrained(model_name, do_lower_case=self.args['do_lower_case'], **kwargs)
+
         self.args["model_name"] = model_name
         self.args["model_type"] = model_type
@@ -474,10 +466,8 @@ def train(self, train_dataset, output_dir, show_running_loss=True, eval_df=None)
         ) and not os.path.exists(output_dir_current):
             os.makedirs(output_dir_current)
 
-        if (
-            args["save_model_every_epoch"]
-            and epoch_number != args["num_train_epochs"]
-        ):
+        if args['save_model_every_epoch']:
+
             model_to_save = model.module if hasattr(model, "module") else model
             model_to_save.save_pretrained(output_dir_current)
             self.tokenizer.save_pretrained(output_dir_current)
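The `save_model_every_epoch` blocks here, in `classification_model.py` above, and in `question_answering_model.py` below all drop the `epoch_number != args["num_train_epochs"]` guard, restoring (per the 0.18.6 entry) a checkpoint after every epoch, including the last. A condensed sketch of the restored behaviour; `save_epoch_checkpoint` and its arguments are hypothetical stand-ins for the loop-local variables, not library API:

```python
import os

def save_epoch_checkpoint(model, tokenizer, output_dir_current):
    """Save a checkpoint after an epoch, mirroring the logic in the diffs above."""
    os.makedirs(output_dir_current, exist_ok=True)
    # Unwrap DataParallel/DistributedDataParallel wrappers before saving.
    model_to_save = model.module if hasattr(model, "module") else model
    model_to_save.save_pretrained(output_dir_current)
    tokenizer.save_pretrained(output_dir_current)
```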
diff --git a/simpletransformers/question_answering/question_answering_model.py b/simpletransformers/question_answering/question_answering_model.py
index 7f6edb9e..cd343cb9 100755
--- a/simpletransformers/question_answering/question_answering_model.py
+++ b/simpletransformers/question_answering/question_answering_model.py
@@ -62,9 +62,9 @@
 
 
 class QuestionAnsweringModel:
-    def __init__(
-        self, model_type, model_name, args=None, use_cuda=True, cuda_device=-1
-    ):
+
+    def __init__(self, model_type, model_name, args=None, use_cuda=True, cuda_device=-1, **kwargs):
+
         """
         Initializes a QuestionAnsweringModel model.
@@ -89,7 +89,7 @@ def __init__(
         }
 
         config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]
 
-        self.model = model_class.from_pretrained(model_name)
+        self.model = model_class.from_pretrained(model_name, **kwargs)
 
         if use_cuda:
             if torch.cuda.is_available():
@@ -125,9 +125,9 @@ def __init__(
         if args:
             self.args.update(args)
 
-        self.tokenizer = tokenizer_class.from_pretrained(
-            model_name, do_lower_case=self.args["do_lower_case"]
-        )
+
+        self.tokenizer = tokenizer_class.from_pretrained(model_name, do_lower_case=self.args['do_lower_case'], **kwargs)
+
         self.args["model_name"] = model_name
         self.args["model_type"] = model_type
@@ -542,10 +542,8 @@ def train(self, train_dataset, output_dir, show_running_loss=True, eval_data=None)
         ) and not os.path.exists(output_dir_current):
             os.makedirs(output_dir_current)
 
-        if (
-            args["save_model_every_epoch"]
-            and epoch_number != args["num_train_epochs"]
-        ):
+        if args['save_model_every_epoch']:
+
             model_to_save = model.module if hasattr(model, "module") else model
             model_to_save.save_pretrained(output_dir_current)
             self.tokenizer.save_pretrained(output_dir_current)