diff --git a/simpletransformers/__init__.py b/simpletransformers/__init__.py index e9343088..a2e35fe1 100755 --- a/simpletransformers/__init__.py +++ b/simpletransformers/__init__.py @@ -1 +1 @@ -name = "simpletransformers" \ No newline at end of file +name = "simpletransformers" diff --git a/simpletransformers/classification/__init__.py b/simpletransformers/classification/__init__.py index 6520c360..69bcdda6 100755 --- a/simpletransformers/classification/__init__.py +++ b/simpletransformers/classification/__init__.py @@ -1,2 +1,4 @@ from simpletransformers.classification.classification_model import ClassificationModel -from simpletransformers.classification.multi_label_classification_model import MultiLabelClassificationModel \ No newline at end of file +from simpletransformers.classification.multi_label_classification_model import ( + MultiLabelClassificationModel, +) diff --git a/simpletransformers/classification/classification_model.py b/simpletransformers/classification/classification_model.py index b104840b..5583807e 100755 --- a/simpletransformers/classification/classification_model.py +++ b/simpletransformers/classification/classification_model.py @@ -6,55 +6,73 @@ import os import math -import json -import random import warnings -from multiprocessing import cpu_count import torch import numpy as np import pandas as pd -from scipy.stats import pearsonr, mode -from sklearn.metrics import mean_squared_error, matthews_corrcoef, confusion_matrix, label_ranking_average_precision_score +from scipy.stats import mode +from sklearn.metrics import ( + matthews_corrcoef, + confusion_matrix, + label_ranking_average_precision_score, +) from tensorboardX import SummaryWriter from tqdm.auto import trange, tqdm -from torch.utils.data.distributed import DistributedSampler -from torch.utils.data import ( - DataLoader, - RandomSampler, - SequentialSampler, - TensorDataset -) +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset from transformers import AdamW, get_linear_schedule_with_warmup from transformers import ( - WEIGHTS_NAME, - BertConfig, BertTokenizer, - XLNetConfig, XLNetTokenizer, - XLMConfig, XLMTokenizer, - RobertaConfig, RobertaTokenizer, - DistilBertConfig, DistilBertTokenizer, - AlbertConfig, AlbertTokenizer, - CamembertConfig, CamembertTokenizer, - XLMRobertaConfig, XLMRobertaTokenizer, + BertConfig, + BertTokenizer, + XLNetConfig, + XLNetTokenizer, + XLMConfig, + XLMTokenizer, + RobertaConfig, + RobertaTokenizer, + DistilBertConfig, + DistilBertTokenizer, + AlbertConfig, + AlbertTokenizer, + CamembertConfig, + CamembertTokenizer, + XLMRobertaConfig, + XLMRobertaTokenizer, ) from simpletransformers.classification.classification_utils import ( InputExample, - convert_examples_to_features + convert_examples_to_features, ) -from simpletransformers.classification.transformer_models.bert_model import BertForSequenceClassification -from simpletransformers.classification.transformer_models.roberta_model import RobertaForSequenceClassification -from simpletransformers.classification.transformer_models.xlm_model import XLMForSequenceClassification -from simpletransformers.classification.transformer_models.xlnet_model import XLNetForSequenceClassification -from simpletransformers.classification.transformer_models.distilbert_model import DistilBertForSequenceClassification -from simpletransformers.classification.transformer_models.albert_model import AlbertForSequenceClassification -from simpletransformers.classification.transformer_models.camembert_model 
import CamembertForSequenceClassification -from simpletransformers.classification.transformer_models.xlm_roberta_model import XLMRobertaForSequenceClassification +from simpletransformers.classification.transformer_models.bert_model import ( + BertForSequenceClassification, +) +from simpletransformers.classification.transformer_models.roberta_model import ( + RobertaForSequenceClassification, +) +from simpletransformers.classification.transformer_models.xlm_model import ( + XLMForSequenceClassification, +) +from simpletransformers.classification.transformer_models.xlnet_model import ( + XLNetForSequenceClassification, +) +from simpletransformers.classification.transformer_models.distilbert_model import ( + DistilBertForSequenceClassification, +) +from simpletransformers.classification.transformer_models.albert_model import ( + AlbertForSequenceClassification, +) +from simpletransformers.classification.transformer_models.camembert_model import ( + CamembertForSequenceClassification, +) +from simpletransformers.classification.transformer_models.xlm_roberta_model import ( + XLMRobertaForSequenceClassification, +) from simpletransformers.config.global_args import global_args @@ -62,7 +80,16 @@ class ClassificationModel: - def __init__(self, model_type, model_name, num_labels=None, weight=None, args=None, use_cuda=True, cuda_device=-1): + def __init__( + self, + model_type, + model_name, + num_labels=None, + weight=None, + args=None, + use_cuda=True, + cuda_device=-1, + ): """ Initializes a ClassificationModel model. @@ -74,22 +101,40 @@ def __init__(self, model_type, model_name, num_labels=None, weight=None, args=No args (optional): Default args will be used if this parameter is not provided. If provided, it should be a dict containing the args that should be changed in the default args. use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only. cuda_device (optional): Specific GPU that should be used. Will use the first available GPU by default. 
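# Hedged usage sketch, not part of the patch: it only exercises the constructor
# arguments and the train_model()/predict() entry points documented in the
# docstring above. The "bert-base-cased" checkpoint and the toy DataFrame are
# assumptions made for illustration.
import pandas as pd
from simpletransformers.classification import ClassificationModel

train_df = pd.DataFrame(
    [["best movie ever", 1], ["utterly boring", 0]], columns=["text", "labels"]
)

model = ClassificationModel(
    "bert",
    "bert-base-cased",
    num_labels=2,
    use_cuda=False,  # force CPU; fp16 is then disabled automatically (see below)
    args={"num_train_epochs": 1, "overwrite_output_dir": True},
)
model.train_model(train_df)  # expects 'text'/'labels' columns, as handled below
predictions, raw_outputs = model.predict(["an arbitrary sentence to classify"])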
- """ + """ # noqa: ignore flake8 MODEL_CLASSES = { - 'bert': (BertConfig, BertForSequenceClassification, BertTokenizer), - 'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer), - 'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer), - 'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer), - 'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer), - 'albert': (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer), - 'camembert': (CamembertConfig, CamembertForSequenceClassification, CamembertTokenizer), - 'xlmroberta': (XLMRobertaConfig, XLMRobertaForSequenceClassification, XLMRobertaTokenizer), + "bert": (BertConfig, BertForSequenceClassification, BertTokenizer), + "xlnet": (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer), + "xlm": (XLMConfig, XLMForSequenceClassification, XLMTokenizer), + "roberta": ( + RobertaConfig, + RobertaForSequenceClassification, + RobertaTokenizer, + ), + "distilbert": ( + DistilBertConfig, + DistilBertForSequenceClassification, + DistilBertTokenizer, + ), + "albert": (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer), + "camembert": ( + CamembertConfig, + CamembertForSequenceClassification, + CamembertTokenizer, + ), + "xlmroberta": ( + XLMRobertaConfig, + XLMRobertaForSequenceClassification, + XLMRobertaTokenizer, + ), } config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type] if num_labels: - self.config = config_class.from_pretrained(model_name, num_labels=num_labels) + self.config = config_class.from_pretrained( + model_name, num_labels=num_labels + ) self.num_labels = num_labels else: self.config = config_class.from_pretrained(model_name) @@ -103,44 +148,63 @@ def __init__(self, model_type, model_name, num_labels=None, weight=None, args=No else: self.device = torch.device(f"cuda:{cuda_device}") else: - raise ValueError("'use_cuda' set to True when cuda is unavailable. Make sure CUDA is available or set use_cuda=False.") + raise ValueError( + "'use_cuda' set to True when cuda is unavailable. Make sure CUDA is" + "available or set use_cuda=False." 
+ ) else: self.device = "cpu" if self.weight: - self.model = model_class.from_pretrained(model_name, config=self.config, weight=torch.Tensor(self.weight).to(self.device)) + self.model = model_class.from_pretrained( + model_name, + config=self.config, + weight=torch.Tensor(self.weight).to(self.device), + ) else: self.model = model_class.from_pretrained(model_name, config=self.config) self.results = {} self.args = { - 'sliding_window': False, - 'tie_value': 1, - 'stride': 0.8, - - 'regression': False, + "sliding_window": False, + "tie_value": 1, + "stride": 0.8, + "regression": False, } self.args.update(global_args) if not use_cuda: - self.args['fp16'] = False + self.args["fp16"] = False if args: self.args.update(args) - self.tokenizer = tokenizer_class.from_pretrained(model_name, do_lower_case=self.args['do_lower_case']) - - self.args['model_name'] = model_name - self.args['model_type'] = model_type - - if model_type in ['camembert', 'xlmroberta']: - warnings.warn(f"use_multiprocessing automatically disabled as {model_type} fails when using multiprocessing for feature conversion.") - self.args['use_multiprocessing'] = False + self.tokenizer = tokenizer_class.from_pretrained( + model_name, do_lower_case=self.args["do_lower_case"] + ) + self.args["model_name"] = model_name + self.args["model_type"] = model_type - def train_model(self, train_df, multi_label=False, output_dir=None, show_running_loss=True, args=None, eval_df=None, **kwargs): + if model_type in ["camembert", "xlmroberta"]: + warnings.warn( + f"use_multiprocessing automatically disabled as {model_type} fails" + " when using multiprocessing for feature conversion." + ) + self.args["use_multiprocessing"] = False + + def train_model( + self, + train_df, + multi_label=False, + output_dir=None, + show_running_loss=True, + args=None, + eval_df=None, + **kwargs, + ): """ Trains the model using 'train_df' @@ -156,86 +220,167 @@ def train_model(self, train_df, multi_label=False, output_dir=None, show_running Returns: None - """ + """ # noqa: ignore flake8 if args: self.args.update(args) - if self.args['silent']: + if self.args["silent"]: show_running_loss = False - if self.args['evaluate_during_training'] and eval_df is None: - raise ValueError("evaluate_during_training is enabled but eval_df is not specified. Pass eval_df to model.train_model() if using evaluate_during_training.") + if self.args["evaluate_during_training"] and eval_df is None: + raise ValueError( + "evaluate_during_training is enabled but eval_df is not specified." + " Pass eval_df to model.train_model() if using" + " evaluate_during_training." + ) if not output_dir: - output_dir = self.args['output_dir'] + output_dir = self.args["output_dir"] - if os.path.exists(output_dir) and os.listdir(output_dir) and not self.args["overwrite_output_dir"]: - raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(output_dir)) + if ( + os.path.exists(output_dir) + and os.listdir(output_dir) + and not self.args["overwrite_output_dir"] + ): + raise ValueError( + "Output directory ({}) already exists and is not empty."
+ "Use --overwrite_output_dir to overcome.".format(output_dir) + ) self._move_model_to_device() - if 'text' in train_df.columns and 'labels' in train_df.columns: - train_examples = [InputExample(i, text, None, label) for i, (text, label) in enumerate(zip(train_df['text'], train_df['labels']))] - elif 'text_a' in train_df.columns and 'text_b' in train_df.columns: - train_examples = [InputExample(i, text_a, text_b, label) for i, (text_a, text_b, label) in enumerate(zip(train_df['text_a'], train_df['text_b'], train_df['labels']))] + if "text" in train_df.columns and "labels" in train_df.columns: + train_examples = [ + InputExample(i, text, None, label) + for i, (text, label) in enumerate( + zip(train_df["text"], train_df["labels"]) + ) + ] + elif "text_a" in train_df.columns and "text_b" in train_df.columns: + train_examples = [ + InputExample(i, text_a, text_b, label) + for i, (text_a, text_b, label) in enumerate( + zip(train_df["text_a"], train_df["text_b"], train_df["labels"]) + ) + ] else: - warnings.warn("Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels.") - train_examples = [InputExample(i, text, None, label) for i, (text, label) in enumerate(zip(train_df.iloc[:, 0], train_df.iloc[:, 1]))] + warnings.warn( + "Dataframe headers not specified. Falling back to using column" + " 0 as text and column 1 as labels." + ) + train_examples = [ + InputExample(i, text, None, label) + for i, (text, label) in enumerate( + zip(train_df.iloc[:, 0], train_df.iloc[:, 1]) + ) + ] train_dataset = self.load_and_cache_examples(train_examples) if not os.path.exists(output_dir): os.makedirs(output_dir) - global_step, tr_loss = self.train(train_dataset, output_dir, multi_label=multi_label, show_running_loss=show_running_loss, eval_df=eval_df, **kwargs) - - model_to_save = self.model.module if hasattr(self.model, "module") else self.model + global_step, tr_loss = self.train( + train_dataset, + output_dir, + multi_label=multi_label, + show_running_loss=show_running_loss, + eval_df=eval_df, + **kwargs, + ) + + model_to_save = ( + self.model.module if hasattr(self.model, "module") else self.model + ) model_to_save.save_pretrained(output_dir) self.tokenizer.save_pretrained(output_dir) torch.save(self.args, os.path.join(output_dir, "training_args.bin")) - print("Training of {} model complete. Saved to {}.".format(self.args["model_type"], output_dir)) - - def train(self, train_dataset, output_dir, multi_label=False, show_running_loss=True, eval_df=None, **kwargs): + print( + "Training of {} model complete. Saved to {}.".format( + self.args["model_type"], output_dir + ) + ) + + def train( + self, + train_dataset, + output_dir, + multi_label=False, + show_running_loss=True, + eval_df=None, + **kwargs, + ): """ Trains the model on train_dataset. - Utility function to be used by the train_model() method. Not intended to be used directly. + Utility function to be used by the train_model() method. Not intended" + "to be used directly. 
""" - tokenizer = self.tokenizer device = self.device model = self.model args = self.args tb_writer = SummaryWriter(logdir=args["tensorboard_dir"]) train_sampler = RandomSampler(train_dataset) - train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args["train_batch_size"]) + train_dataloader = DataLoader( + train_dataset, sampler=train_sampler, batch_size=args["train_batch_size"] + ) - t_total = len(train_dataloader) // args["gradient_accumulation_steps"] * args["num_train_epochs"] + t_total = ( + len(train_dataloader) + // args["gradient_accumulation_steps"] + * args["num_train_epochs"] + ) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ - {"params": [p for n, p in model.named_parameters() if not any( - nd in n for nd in no_decay)], "weight_decay": args["weight_decay"]}, - {"params": [p for n, p in model.named_parameters() if any( - nd in n for nd in no_decay)], "weight_decay": 0.0} + { + "params": [ + p + for n, p in model.named_parameters() + if not any(nd in n for nd in no_decay) + ], + "weight_decay": args["weight_decay"], + }, + { + "params": [ + p + for n, p in model.named_parameters() + if any(nd in n for nd in no_decay) + ], + "weight_decay": 0.0, + }, ] warmup_steps = math.ceil(t_total * args["warmup_ratio"]) - args["warmup_steps"] = warmup_steps if args["warmup_steps"] == 0 else args["warmup_steps"] - - optimizer = AdamW(optimizer_grouped_parameters, lr=args["learning_rate"], eps=args["adam_epsilon"]) - scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args["warmup_steps"], num_training_steps=t_total) + args["warmup_steps"] = ( + warmup_steps if args["warmup_steps"] == 0 else args["warmup_steps"] + ) + + optimizer = AdamW( + optimizer_grouped_parameters, + lr=args["learning_rate"], + eps=args["adam_epsilon"], + ) + scheduler = get_linear_schedule_with_warmup( + optimizer, num_warmup_steps=args["warmup_steps"], num_training_steps=t_total + ) if args["fp16"]: try: from apex import amp except ImportError: - raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") + raise ImportError( + "Please install apex from https://www.github.com/nvidia/apex " + "to use fp16 training." 
+ ) - model, optimizer = amp.initialize(model, optimizer, opt_level=args["fp16_opt_level"]) + model, optimizer = amp.initialize( + model, optimizer, opt_level=args["fp16_opt_level"] + ) if args["n_gpu"] > 1: model = torch.nn.DataParallel(model) @@ -243,55 +388,61 @@ def train(self, train_dataset, output_dir, multi_label=False, show_running_loss= global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() - train_iterator = trange(int(args["num_train_epochs"]), desc="Epoch", disable=args['silent']) + train_iterator = trange( + int(args["num_train_epochs"]), desc="Epoch", disable=args["silent"] + ) epoch_number = 0 - if args['evaluate_during_training']: + if args["evaluate_during_training"]: extra_metrics = {key: [] for key in kwargs} if multi_label: training_progress_scores = { - 'global_step': [], - 'LRAP': [], - 'train_loss': [], - 'eval_loss': [], - **extra_metrics + "global_step": [], + "LRAP": [], + "train_loss": [], + "eval_loss": [], + **extra_metrics, } else: if self.model.num_labels == 2: training_progress_scores = { - 'global_step': [], - 'tp': [], - 'tn': [], - 'fp': [], - 'fn': [], - 'mcc': [], - 'train_loss': [], - 'eval_loss': [], - **extra_metrics + "global_step": [], + "tp": [], + "tn": [], + "fp": [], + "fn": [], + "mcc": [], + "train_loss": [], + "eval_loss": [], + **extra_metrics, } elif self.model.num_labels == 1: - training_progress_scores = { - 'global_step': [], - 'train_loss': [], - 'eval_loss': [], - **extra_metrics + training_progress_scores = { + "global_step": [], + "train_loss": [], + "eval_loss": [], + **extra_metrics, } else: training_progress_scores = { - 'global_step': [], - 'mcc': [], - 'train_loss': [], - 'eval_loss': [], - **extra_metrics + "global_step": [], + "mcc": [], + "train_loss": [], + "eval_loss": [], + **extra_metrics, } - if args['wandb_project']: - wandb.init(project=args['wandb_project'], config={**args}, **args['wandb_kwargs']) + if args["wandb_project"]: + wandb.init( + project=args["wandb_project"], config={**args}, **args["wandb_kwargs"] + ) wandb.watch(self.model) model.train() for _ in train_iterator: # epoch_iterator = tqdm(train_dataloader, desc="Iteration") - for step, batch in enumerate(tqdm(train_dataloader, desc="Current iteration", disable=args['silent'])): + for step, batch in enumerate( + tqdm(train_dataloader, desc="Current iteration", disable=args["silent"]) + ): batch = tuple(t.to(device) for t in batch) inputs = self._get_inputs_dict(batch) @@ -299,8 +450,10 @@ def train(self, train_dataset, output_dir, multi_label=False, show_running_loss= # model outputs are always tuple in pytorch-transformers (see doc) loss = outputs[0] - if args['n_gpu'] > 1: - loss = loss.mean() # mean() to average on multi-gpu parallel training + if args["n_gpu"] > 1: + loss = ( + loss.mean() + ) # mean() to average on multi-gpu parallel training current_loss = loss.item() @@ -313,10 +466,14 @@ def train(self, train_dataset, output_dir, multi_label=False, show_running_loss= if args["fp16"]: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() - torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args["max_grad_norm"]) + torch.nn.utils.clip_grad_norm_( + amp.master_params(optimizer), args["max_grad_norm"] + ) else: loss.backward() - torch.nn.utils.clip_grad_norm_(model.parameters(), args["max_grad_norm"]) + torch.nn.utils.clip_grad_norm_( + model.parameters(), args["max_grad_norm"] + ) tr_loss += loss.item() if (step + 1) % args["gradient_accumulation_steps"] == 0: @@ -325,69 +482,108 @@ def train(self, 
train_dataset, output_dir, multi_label=False, show_running_loss= model.zero_grad() global_step += 1 - if args["logging_steps"] > 0 and global_step % args["logging_steps"] == 0: + if ( + args["logging_steps"] > 0 + and global_step % args["logging_steps"] == 0 + ): # Log metrics tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) - tb_writer.add_scalar("loss", (tr_loss - logging_loss)/args["logging_steps"], global_step) + tb_writer.add_scalar( + "loss", + (tr_loss - logging_loss) / args["logging_steps"], + global_step, + ) logging_loss = tr_loss - if args['wandb_project']: - wandb.log({'Training loss': current_loss, 'lr': scheduler.get_lr()[0], 'global_step': global_step}) + if args["wandb_project"]: + wandb.log( + { + "Training loss": current_loss, + "lr": scheduler.get_lr()[0], + "global_step": global_step, + } + ) if args["save_steps"] > 0 and global_step % args["save_steps"] == 0: # Save model checkpoint - output_dir_current = os.path.join(output_dir, "checkpoint-{}".format(global_step)) + output_dir_current = os.path.join( + output_dir, "checkpoint-{}".format(global_step) + ) if not os.path.exists(output_dir_current): os.makedirs(output_dir_current) # Take care of distributed/parallel training - model_to_save = model.module if hasattr(model, "module") else model + model_to_save = ( + model.module if hasattr(model, "module") else model + ) model_to_save.save_pretrained(output_dir_current) self.tokenizer.save_pretrained(output_dir_current) - if args['evaluate_during_training'] and (args["evaluate_during_training_steps"] > 0 and global_step % args["evaluate_during_training_steps"] == 0): - # Only evaluate when single GPU otherwise metrics may not average well + if args["evaluate_during_training"] and ( + args["evaluate_during_training_steps"] > 0 + and global_step % args["evaluate_during_training_steps"] == 0 + ): + # Only evaluate when single GPU otherwise + # metrics may not average well results, _, _ = self.eval_model(eval_df, verbose=True, **kwargs) for key, value in results.items(): - tb_writer.add_scalar('eval_{}'.format(key), value, global_step) + tb_writer.add_scalar( + "eval_{}".format(key), value, global_step + ) - output_dir_current = os.path.join(output_dir, "checkpoint-{}".format(global_step)) + output_dir_current = os.path.join( + output_dir, "checkpoint-{}".format(global_step) + ) if not os.path.exists(output_dir_current): os.makedirs(output_dir_current) - if args['save_eval_checkpoints']: - model_to_save = model.module if hasattr(model, "module") else model + if args["save_eval_checkpoints"]: + model_to_save = ( + model.module if hasattr(model, "module") else model + ) model_to_save.save_pretrained(output_dir_current) self.tokenizer.save_pretrained(output_dir_current) - output_eval_file = os.path.join(output_dir_current, "eval_results.txt") + output_eval_file = os.path.join( + output_dir_current, "eval_results.txt" + ) with open(output_eval_file, "w") as writer: for key in sorted(results.keys()): writer.write("{} = {}\n".format(key, str(results[key]))) - training_progress_scores['global_step'].append(global_step) - training_progress_scores['train_loss'].append(current_loss) + training_progress_scores["global_step"].append(global_step) + training_progress_scores["train_loss"].append(current_loss) for key in results: training_progress_scores[key].append(results[key]) report = pd.DataFrame(training_progress_scores) - report.to_csv(args['output_dir'] + 'training_progress_scores.csv', index=False) + report.to_csv( + args["output_dir"] + "training_progress_scores.csv", 
+ index=False, + ) - if args['wandb_project']: + if args["wandb_project"]: wandb.log(self._get_last_metrics(training_progress_scores)) epoch_number += 1 - output_dir_current = os.path.join(output_dir, "checkpoint-{}-epoch-{}".format(global_step, epoch_number)) + output_dir_current = os.path.join( + output_dir, "checkpoint-{}-epoch-{}".format(global_step, epoch_number) + ) - if (args['save_model_every_epoch'] or args['evaluate_during_training']) and not os.path.exists(output_dir_current): + if ( + args["save_model_every_epoch"] or args["evaluate_during_training"] + ) and not os.path.exists(output_dir_current): os.makedirs(output_dir_current) - if args['save_model_every_epoch'] and epoch_number != args['num_train_epochs']: + if ( + args["save_model_every_epoch"] + and epoch_number != args["num_train_epochs"] + ): model_to_save = model.module if hasattr(model, "module") else model model_to_save.save_pretrained(output_dir_current) self.tokenizer.save_pretrained(output_dir_current) - if args['evaluate_during_training']: + if args["evaluate_during_training"]: results, _, _ = self.eval_model(eval_df, verbose=True, **kwargs) output_eval_file = os.path.join(output_dir_current, "eval_results.txt") @@ -395,16 +591,20 @@ def train(self, train_dataset, output_dir, multi_label=False, show_running_loss= for key in sorted(results.keys()): writer.write("{} = {}\n".format(key, str(results[key]))) - training_progress_scores['global_step'].append(global_step) - training_progress_scores['train_loss'].append(current_loss) + training_progress_scores["global_step"].append(global_step) + training_progress_scores["train_loss"].append(current_loss) for key in results: training_progress_scores[key].append(results[key]) report = pd.DataFrame(training_progress_scores) - report.to_csv(args['output_dir'] + 'training_progress_scores.csv', index=False) + report.to_csv( + args["output_dir"] + "training_progress_scores.csv", index=False + ) return global_step, tr_loss / global_step - def eval_model(self, eval_df, multi_label=False, output_dir=None, verbose=False, **kwargs): + def eval_model( + self, eval_df, multi_label=False, output_dir=None, verbose=False, **kwargs + ): """ Evaluates the model on eval_df. Saves results to output_dir. @@ -420,14 +620,16 @@ def eval_model(self, eval_df, multi_label=False, output_dir=None, verbose=False, result: Dictionary containing evaluation results. (Matthews correlation coefficient, tp, tn, fp, fn) model_outputs: List of model outputs for each row in eval_df wrong_preds: List of InputExample objects corresponding to each incorrect prediction by the model - """ + """ # noqa: ignore flake8 if not output_dir: output_dir = self.args["output_dir"] self._move_model_to_device() - result, model_outputs, wrong_preds = self.evaluate(eval_df, output_dir, multi_label=multi_label, **kwargs) + result, model_outputs, wrong_preds = self.evaluate( + eval_df, output_dir, multi_label=multi_label, **kwargs + ) self.results.update(result) if verbose: @@ -439,10 +641,10 @@ def evaluate(self, eval_df, output_dir, multi_label=False, prefix="", **kwargs): """ Evaluates the model on eval_df. - Utility function to be used by the eval_model() method. Not intended to be used directly. + Utility function to be used by the eval_model() method. Not intended to + be used directly. 
""" - tokenizer = self.tokenizer device = self.device model = self.model args = self.args @@ -450,23 +652,45 @@ def evaluate(self, eval_df, output_dir, multi_label=False, prefix="", **kwargs): results = {} - if 'text' in eval_df.columns and 'labels' in eval_df.columns: - eval_examples = [InputExample(i, text, None, label) for i, (text, label) in enumerate(zip(eval_df['text'], eval_df['labels']))] - elif 'text_a' in eval_df.columns and 'text_b' in eval_df.columns: - eval_examples = [InputExample(i, text_a, text_b, label) for i, (text_a, text_b, label) in enumerate(zip(eval_df['text_a'], eval_df['text_b'], eval_df['labels']))] + if "text" in eval_df.columns and "labels" in eval_df.columns: + eval_examples = [ + InputExample(i, text, None, label) + for i, (text, label) in enumerate( + zip(eval_df["text"], eval_df["labels"]) + ) + ] + elif "text_a" in eval_df.columns and "text_b" in eval_df.columns: + eval_examples = [ + InputExample(i, text_a, text_b, label) + for i, (text_a, text_b, label) in enumerate( + zip(eval_df["text_a"], eval_df["text_b"], eval_df["labels"]) + ) + ] else: - warnings.warn("Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels.") - eval_examples = [InputExample(i, text, None, label) for i, (text, label) in enumerate(zip(eval_df.iloc[:, 0], eval_df.iloc[:, 1]))] - - if args['sliding_window']: - eval_dataset, window_counts = self.load_and_cache_examples(eval_examples, evaluate=True) + warnings.warn( + "Dataframe headers not specified. Falling back to using column 0 as" + "text and column 1 as labels." + ) + eval_examples = [ + InputExample(i, text, None, label) + for i, (text, label) in enumerate( + zip(eval_df.iloc[:, 0], eval_df.iloc[:, 1]) + ) + ] + + if args["sliding_window"]: + eval_dataset, window_counts = self.load_and_cache_examples( + eval_examples, evaluate=True + ) else: eval_dataset = self.load_and_cache_examples(eval_examples, evaluate=True) if not os.path.exists(eval_output_dir): os.makedirs(eval_output_dir) eval_sampler = SequentialSampler(eval_dataset) - eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args["eval_batch_size"]) + eval_dataloader = DataLoader( + eval_dataset, sampler=eval_sampler, batch_size=args["eval_batch_size"] + ) eval_loss = 0.0 nb_eval_steps = 0 @@ -474,7 +698,7 @@ def evaluate(self, eval_df, output_dir, multi_label=False, prefix="", **kwargs): out_label_ids = None model.eval() - for batch in tqdm(eval_dataloader, disable=args['silent']): + for batch in tqdm(eval_dataloader, disable=args["silent"]): batch = tuple(t.to(device) for t in batch) with torch.no_grad(): @@ -495,19 +719,27 @@ def evaluate(self, eval_df, output_dir, multi_label=False, prefix="", **kwargs): else: preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) out_label_ids = np.append( - out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0) + out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0 + ) eval_loss = eval_loss / nb_eval_steps - if args['sliding_window']: + if args["sliding_window"]: count = 0 window_ranges = [] for n_windows in window_counts: window_ranges.append([count, count + n_windows]) count += n_windows - preds = [preds[window_range[0]: window_range[1]] for window_range in window_ranges] - out_label_ids = [out_label_ids[i] for i in range(len(out_label_ids)) if i in [window[0] for window in window_ranges]] + preds = [ + preds[window_range[0] : window_range[1]] + for window_range in window_ranges + ] + out_label_ids = [ + out_label_ids[i] + for i in 
range(len(out_label_ids)) + if i in [window[0] for window in window_ranges] + ] model_outputs = preds @@ -516,11 +748,11 @@ def evaluate(self, eval_df, output_dir, multi_label=False, prefix="", **kwargs): for pred_row in preds: mode_pred, counts = mode(pred_row) if len(counts) > 1 and counts[0] == counts[1]: - final_preds.append(args['tie_value']) + final_preds.append(args["tie_value"]) else: final_preds.append(mode_pred[0]) preds = np.array(final_preds) - elif not multi_label and args['regression'] == True: + elif not multi_label and args["regression"] is True: preds = np.squeeze(preds) model_outputs = preds else: @@ -529,8 +761,10 @@ def evaluate(self, eval_df, output_dir, multi_label=False, prefix="", **kwargs): if not multi_label: preds = np.argmax(preds, axis=1) - result, wrong = self.compute_metrics(preds, out_label_ids, eval_examples, **kwargs) - result['eval_loss'] = eval_loss + result, wrong = self.compute_metrics( + preds, out_label_ids, eval_examples, **kwargs + ) + result["eval_loss"] = eval_loss results.update(result) output_eval_file = os.path.join(eval_output_dir, "eval_results.txt") @@ -540,20 +774,24 @@ def evaluate(self, eval_df, output_dir, multi_label=False, prefix="", **kwargs): return results, model_outputs, wrong - def load_and_cache_examples(self, examples, evaluate=False, no_cache=False, multi_label=False): + def load_and_cache_examples( + self, examples, evaluate=False, no_cache=False, multi_label=False + ): """ - Converts a list of InputExample objects to a TensorDataset containing InputFeatures. Caches the InputFeatures. + Converts a list of InputExample objects to a TensorDataset containing + InputFeatures. Caches the InputFeatures. - Utility function for train() and eval() methods. Not intended to be used directly. + Utility function for train() and eval() methods. Not intended to be + used directly. """ process_count = self.args["process_count"] tokenizer = self.tokenizer args = self.args - - if not multi_label and args['regression']: - output_mode = 'regression' + + if not multi_label and args["regression"]: + output_mode = "regression" else: output_mode = "classification" @@ -561,9 +799,21 @@ def load_and_cache_examples(self, examples, evaluate=False, no_cache=False, mult os.mkdir(self.args["cache_dir"]) mode = "dev" if evaluate else "train" - cached_features_file = os.path.join(args["cache_dir"], "cached_{}_{}_{}_{}_{}".format(mode, args["model_type"], args["max_seq_length"], self.num_labels, len(examples))) - - if os.path.exists(cached_features_file) and ((not args["reprocess_input_data"] and not no_cache) or (mode == "dev" and args['use_cached_eval_features'])): + cached_features_file = os.path.join( + args["cache_dir"], + "cached_{}_{}_{}_{}_{}".format( + mode, + args["model_type"], + args["max_seq_length"], + self.num_labels, + len(examples), + ), + ) + + if os.path.exists(cached_features_file) and ( + (not args["reprocess_input_data"] and not no_cache) + or (mode == "dev" and args["use_cached_eval_features"]) + ): features = torch.load(cached_features_file) print(f"Features loaded from cache at {cached_features_file}") else: @@ -578,7 +828,8 @@ def load_and_cache_examples(self, examples, evaluate=False, no_cache=False, mult cls_token=tokenizer.cls_token, cls_token_segment_id=2 if args["model_type"] in ["xlnet"] else 0, sep_token=tokenizer.sep_token, - # RoBERTa uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 + # RoBERTa uses an extra separator b/w pairs of sentences, cf. 
+ # github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 # noqa: ignore flake8 sep_token_extra=bool(args["model_type"] in ["roberta"]), # PAD on the left for XLNet pad_on_left=bool(args["model_type"] in ["xlnet"]), @@ -586,37 +837,49 @@ def load_and_cache_examples(self, examples, evaluate=False, no_cache=False, mult pad_token_segment_id=4 if args["model_type"] in ["xlnet"] else 0, process_count=process_count, multi_label=multi_label, - silent=args['silent'], - use_multiprocessing=args['use_multiprocessing'], - sliding_window=args['sliding_window'], + silent=args["silent"], + use_multiprocessing=args["use_multiprocessing"], + sliding_window=args["sliding_window"], flatten=not evaluate, - stride=args['stride'] + stride=args["stride"], ) if not no_cache: torch.save(features, cached_features_file) - if args['sliding_window'] and evaluate: + if args["sliding_window"] and evaluate: window_counts = [len(sample) for sample in features] features = [feature for feature_set in features for feature in feature_set] all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) - all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) - all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long) + all_input_mask = torch.tensor( + [f.input_mask for f in features], dtype=torch.long + ) + all_segment_ids = torch.tensor( + [f.segment_ids for f in features], dtype=torch.long + ) if output_mode == "classification": - all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long) + all_label_ids = torch.tensor( + [f.label_id for f in features], dtype=torch.long + ) elif output_mode == "regression": - all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float) + all_label_ids = torch.tensor( + [f.label_id for f in features], dtype=torch.float + ) - dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) + dataset = TensorDataset( + all_input_ids, all_input_mask, all_segment_ids, all_label_ids + ) - if args['sliding_window'] and evaluate: + if args["sliding_window"] and evaluate: return dataset, window_counts else: return dataset - def compute_metrics(self, preds, labels, eval_examples, multi_label=False, **kwargs): + def compute_metrics( + self, preds, labels, eval_examples, multi_label=False, **kwargs + ): """ Computes the evaluation metrics for the model predictions. @@ -630,7 +893,7 @@ def compute_metrics(self, preds, labels, eval_examples, multi_label=False, **kwa Returns: result: Dictionary containing evaluation results. 
(Matthews correlation coefficient, tp, tn, fp, fn) wrong: List of InputExample objects corresponding to each incorrect prediction by the model - """ + """ # noqa: ignore flake8 assert len(preds) == len(labels) @@ -645,20 +908,20 @@ def compute_metrics(self, preds, labels, eval_examples, multi_label=False, **kwa if multi_label: label_ranking_score = label_ranking_average_precision_score(labels, preds) return {**{"LRAP": label_ranking_score}, **extra_metrics}, wrong - elif self.args['regression']: + elif self.args["regression"]: return {**extra_metrics}, wrong - + mcc = matthews_corrcoef(labels, preds) if self.model.num_labels == 2: tn, fp, fn, tp = confusion_matrix(labels, preds).ravel() - return {**{ - "mcc": mcc, - "tp": tp, - "tn": tn, - "fp": fp, - "fn": fn - }, **extra_metrics}, wrong + return ( + { + **{"mcc": mcc, "tp": tp, "tn": tn, "fp": fp, "fn": fn}, + **extra_metrics, + }, + wrong, + ) else: return {**{"mcc": mcc}, **extra_metrics}, wrong @@ -667,14 +930,14 @@ def predict(self, to_predict, multi_label=False): Performs predictions on a list of text. Args: - to_predict: A python list of text (str) to be sent to the model for prediction. + to_predict: A python list of text (str) to be sent to the model + for prediction. Returns: preds: A python list of the predictions (0 or 1) for each text. model_outputs: A python list of the raw model outputs for each text. """ - tokenizer = self.tokenizer device = self.device model = self.model args = self.args @@ -682,26 +945,40 @@ def predict(self, to_predict, multi_label=False): self._move_model_to_device() if multi_label: - eval_examples = [InputExample(i, text, None, [0 for i in range(self.num_labels)]) for i, text in enumerate(to_predict)] + eval_examples = [ + InputExample(i, text, None, [0 for i in range(self.num_labels)]) + for i, text in enumerate(to_predict) + ] else: if isinstance(to_predict[0], list): - eval_examples = [InputExample(i, text[0], text[1], 0) for i, text in enumerate(to_predict)] + eval_examples = [ + InputExample(i, text[0], text[1], 0) + for i, text in enumerate(to_predict) + ] else: - eval_examples = [InputExample(i, text, None, 0) for i, text in enumerate(to_predict)] - if args['sliding_window']: - eval_dataset, window_counts = self.load_and_cache_examples(eval_examples, evaluate=True, no_cache=True) + eval_examples = [ + InputExample(i, text, None, 0) for i, text in enumerate(to_predict) + ] + if args["sliding_window"]: + eval_dataset, window_counts = self.load_and_cache_examples( + eval_examples, evaluate=True, no_cache=True + ) else: - eval_dataset = self.load_and_cache_examples(eval_examples, evaluate=True, multi_label=multi_label, no_cache=True) + eval_dataset = self.load_and_cache_examples( + eval_examples, evaluate=True, multi_label=multi_label, no_cache=True + ) eval_sampler = SequentialSampler(eval_dataset) - eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args["eval_batch_size"]) + eval_dataloader = DataLoader( + eval_dataset, sampler=eval_sampler, batch_size=args["eval_batch_size"] + ) eval_loss = 0.0 nb_eval_steps = 0 preds = None out_label_ids = None - for batch in tqdm(eval_dataloader, disable=args['silent']): + for batch in tqdm(eval_dataloader, disable=args["silent"]): model.eval() batch = tuple(t.to(device) for t in batch) @@ -722,18 +999,23 @@ def predict(self, to_predict, multi_label=False): out_label_ids = inputs["labels"].detach().cpu().numpy() else: preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) - out_label_ids = np.append(out_label_ids, 
inputs["labels"].detach().cpu().numpy(), axis=0) + out_label_ids = np.append( + out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0 + ) eval_loss = eval_loss / nb_eval_steps - if args['sliding_window']: + if args["sliding_window"]: count = 0 window_ranges = [] for n_windows in window_counts: window_ranges.append([count, count + n_windows]) count += n_windows - preds = [preds[window_range[0]: window_range[1]] for window_range in window_ranges] + preds = [ + preds[window_range[0] : window_range[1]] + for window_range in window_ranges + ] model_outputs = preds @@ -742,21 +1024,30 @@ def predict(self, to_predict, multi_label=False): for pred_row in preds: mode_pred, counts = mode(pred_row) if len(counts) > 1 and counts[0] == counts[1]: - final_preds.append(args['tie_value']) + final_preds.append(args["tie_value"]) else: final_preds.append(mode_pred[0]) preds = np.array(final_preds) - elif not multi_label and args['regression'] == True: + elif not multi_label and args["regression"] is True: preds = np.squeeze(preds) model_outputs = preds else: model_outputs = preds if multi_label: - if isinstance(args['threshold'], list): - threshold_values = args['threshold'] - preds = [[self._threshold(pred, threshold_values[i]) for i, pred in enumerate(example)] for example in preds] + if isinstance(args["threshold"], list): + threshold_values = args["threshold"] + preds = [ + [ + self._threshold(pred, threshold_values[i]) + for i, pred in enumerate(example) + ] + for example in preds + ] else: - preds = [[self._threshold(pred, args['threshold']) for pred in example] for example in preds] + preds = [ + [self._threshold(pred, args["threshold"]) for pred in example] + for example in preds + ] else: preds = np.argmax(preds, axis=1) @@ -771,15 +1062,13 @@ def _move_model_to_device(self): self.model.to(self.device) def _get_inputs_dict(self, batch): - inputs = { - "input_ids": batch[0], - "attention_mask": batch[1], - "labels": batch[3] - } + inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]} # XLM, DistilBERT and RoBERTa don't use segment_ids if self.args["model_type"] != "distilbert": - inputs["token_type_ids"] = batch[2] if self.args["model_type"] in ["bert", "xlnet"] else None + inputs["token_type_ids"] = ( + batch[2] if self.args["model_type"] in ["bert", "xlnet"] else None + ) return inputs diff --git a/simpletransformers/classification/classification_utils.py b/simpletransformers/classification/classification_utils.py index 85a4d15f..44c7ea3a 100755 --- a/simpletransformers/classification/classification_utils.py +++ b/simpletransformers/classification/classification_utils.py @@ -16,17 +16,10 @@ """ BERT classification fine-tuning: utilities to work with GLUE tasks """ from __future__ import absolute_import, division, print_function - -import os -import sys import csv - -from io import open from multiprocessing import Pool, cpu_count from tqdm.auto import tqdm -from scipy.stats import pearsonr, spearmanr -from sklearn.metrics import matthews_corrcoef, f1_score csv.field_size_limit(2147483647) @@ -72,9 +65,23 @@ def convert_example_to_feature( cls_token_segment_id=1, pad_token_segment_id=0, mask_padding_with_zero=True, - sep_token_extra=False + sep_token_extra=False, ): - example, max_seq_length, tokenizer, output_mode, cls_token_at_end, cls_token, sep_token, cls_token_segment_id, pad_on_left, pad_token_segment_id, sep_token_extra, multi_label, stride = example_row + ( + example, + max_seq_length, + tokenizer, + output_mode, + cls_token_at_end, + cls_token, + 
sep_token, + cls_token_segment_id, + pad_on_left, + pad_token_segment_id, + sep_token_extra, + multi_label, + stride, + ) = example_row tokens_a = tokenizer.tokenize(example.text_a) @@ -90,7 +97,7 @@ def convert_example_to_feature( # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa. special_tokens_count = 3 if sep_token_extra else 2 if len(tokens_a) > max_seq_length - special_tokens_count: - tokens_a = tokens_a[:(max_seq_length - special_tokens_count)] + tokens_a = tokens_a[: (max_seq_length - special_tokens_count)] # The convention in BERT is: # (a) For sequence pairs: @@ -134,11 +141,15 @@ def convert_example_to_feature( padding_length = max_seq_length - len(input_ids) if pad_on_left: input_ids = ([pad_token] * padding_length) + input_ids - input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask + input_mask = ( + [0 if mask_padding_with_zero else 1] * padding_length + ) + input_mask segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids else: input_ids = input_ids + ([pad_token] * padding_length) - input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask = input_mask + ( + [0 if mask_padding_with_zero else 1] * padding_length + ) segment_ids = segment_ids + ([pad_token_segment_id] * padding_length) assert len(input_ids) == max_seq_length @@ -152,14 +163,14 @@ def convert_example_to_feature( # else: # raise KeyError(output_mode) - if output_mode == 'regression': - label_id = float(example.label) + if output_mode == "regression": + label_id = float(example.label) # noqa: ignore flake8 return InputFeatures( input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, - label_id=example.label + label_id=example.label, ) @@ -173,7 +184,21 @@ def convert_example_to_feature_sliding_window( mask_padding_with_zero=True, sep_token_extra=False, ): - example, max_seq_length, tokenizer, output_mode, cls_token_at_end, cls_token, sep_token, cls_token_segment_id, pad_on_left, pad_token_segment_id, sep_token_extra, multi_label, stride = example_row + ( + example, + max_seq_length, + tokenizer, + output_mode, + cls_token_at_end, + cls_token, + sep_token, + cls_token_segment_id, + pad_on_left, + pad_token_segment_id, + sep_token_extra, + multi_label, + stride, + ) = example_row if stride < 1: stride = int(max_seq_length * stride) @@ -183,14 +208,17 @@ def convert_example_to_feature_sliding_window( tokens_a = tokenizer.tokenize(example.text_a) - special_tokens_count = 3 if sep_token_extra else 2 if len(tokens_a) > bucket_size: - token_sets = [tokens_a[i:i + bucket_size] for i in range(0, len(tokens_a), stride)] + token_sets = [ + tokens_a[i : i + bucket_size] for i in range(0, len(tokens_a), stride) + ] else: token_sets.append(tokens_a) if example.text_b: - raise ValueError("Sequence pair tasks not implemented for sliding window tokenization.") + raise ValueError( + "Sequence pair tasks not implemented for sliding window tokenization." 
+ ) # The convention in BERT is: # (a) For sequence pairs: @@ -233,11 +261,15 @@ def convert_example_to_feature_sliding_window( padding_length = max_seq_length - len(input_ids) if pad_on_left: input_ids = ([pad_token] * padding_length) + input_ids - input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask + input_mask = ( + [0 if mask_padding_with_zero else 1] * padding_length + ) + input_mask segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids else: input_ids = input_ids + ([pad_token] * padding_length) - input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask = input_mask + ( + [0 if mask_padding_with_zero else 1] * padding_length + ) segment_ids = segment_ids + ([pad_token_segment_id] * padding_length) assert len(input_ids) == max_seq_length @@ -256,7 +288,7 @@ def convert_example_to_feature_sliding_window( input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, - label_id=example.label + label_id=example.label, ) ) @@ -285,37 +317,81 @@ def convert_examples_to_features( use_multiprocessing=True, sliding_window=False, flatten=False, - stride=None + stride=None, ): """ Loads a data file into a list of `InputBatch`s `cls_token_at_end` define the location of the CLS token: - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP] - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS] - `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet) + `cls_token_segment_id` define the segment id associated to the CLS token + (0 for BERT, 2 for XLNet) """ - examples = [(example, max_seq_length, tokenizer, output_mode, cls_token_at_end, cls_token, sep_token, cls_token_segment_id, pad_on_left, pad_token_segment_id, sep_token_extra, multi_label, stride) for example in examples] + examples = [ + ( + example, + max_seq_length, + tokenizer, + output_mode, + cls_token_at_end, + cls_token, + sep_token, + cls_token_segment_id, + pad_on_left, + pad_token_segment_id, + sep_token_extra, + multi_label, + stride, + ) + for example in examples + ] if use_multiprocessing: if sliding_window: - print('sliding_window enabled') + print("sliding_window enabled") with Pool(process_count) as p: - features = list(tqdm(p.imap(convert_example_to_feature_sliding_window, examples, chunksize=500), total=len(examples), disable=silent)) + features = list( + tqdm( + p.imap( + convert_example_to_feature_sliding_window, + examples, + chunksize=500, + ), + total=len(examples), + disable=silent, + ) + ) if flatten: - features = [feature for feature_set in features for feature in feature_set] - print(f'{len(features)} features created from {len(examples)} samples.') + features = [ + feature for feature_set in features for feature in feature_set + ] + print(f"{len(features)} features created from {len(examples)} samples.") else: with Pool(process_count) as p: - features = list(tqdm(p.imap(convert_example_to_feature, examples, chunksize=500), total=len(examples), disable=silent)) + features = list( + tqdm( + p.imap(convert_example_to_feature, examples, chunksize=500), + total=len(examples), + disable=silent, + ) + ) else: if sliding_window: - print('sliding_window enabled') - features = [convert_example_to_feature_sliding_window(example) for example in tqdm(examples, disable=silent)] + print("sliding_window enabled") + features = [ + convert_example_to_feature_sliding_window(example) + for example in tqdm(examples, disable=silent) + ] if flatten: - features = [feature for feature_set in 
features for feature in feature_set] - print(f'{len(features)} features created from {len(examples)} samples.') + features = [ + feature for feature_set in features for feature in feature_set + ] + print(f"{len(features)} features created from {len(examples)} samples.") else: - features = [convert_example_to_feature(example) for example in tqdm(examples, disable=silent)] + features = [ + convert_example_to_feature(example) + for example in tqdm(examples, disable=silent) + ] return features diff --git a/simpletransformers/classification/multi_label_classification_model.py b/simpletransformers/classification/multi_label_classification_model.py index 268f6435..2c07c28a 100755 --- a/simpletransformers/classification/multi_label_classification_model.py +++ b/simpletransformers/classification/multi_label_classification_model.py @@ -1,30 +1,42 @@ import torch -from multiprocessing import cpu_count - from simpletransformers.classification import ClassificationModel -from simpletransformers.custom_models.models import (BertForMultiLabelSequenceClassification, - RobertaForMultiLabelSequenceClassification, - XLNetForMultiLabelSequenceClassification, - XLMForMultiLabelSequenceClassification, - DistilBertForMultiLabelSequenceClassification, - AlbertForMultiLabelSequenceClassification - ) +from simpletransformers.custom_models.models import ( + BertForMultiLabelSequenceClassification, + RobertaForMultiLabelSequenceClassification, + XLNetForMultiLabelSequenceClassification, + XLMForMultiLabelSequenceClassification, + DistilBertForMultiLabelSequenceClassification, + AlbertForMultiLabelSequenceClassification, +) from simpletransformers.config.global_args import global_args from transformers import ( - WEIGHTS_NAME, - BertConfig, BertTokenizer, - XLNetConfig, XLNetTokenizer, - XLMConfig, XLMTokenizer, - RobertaConfig, RobertaTokenizer, - DistilBertConfig, DistilBertTokenizer, - AlbertConfig, AlbertTokenizer + BertConfig, + BertTokenizer, + XLNetConfig, + XLNetTokenizer, + XLMConfig, + XLMTokenizer, + RobertaConfig, + RobertaTokenizer, + DistilBertConfig, + DistilBertTokenizer, + AlbertConfig, + AlbertTokenizer, ) class MultiLabelClassificationModel(ClassificationModel): - def __init__(self, model_type, model_name, num_labels=None, pos_weight=None, args=None, use_cuda=True): + def __init__( + self, + model_type, + model_name, + num_labels=None, + pos_weight=None, + args=None, + use_cuda=True, + ): """ Initializes a MultiLabelClassification model. @@ -35,19 +47,41 @@ def __init__(self, model_type, model_name, num_labels=None, pos_weight=None, arg pos_weight (optional): A list of length num_labels containing the weights to assign to each label for loss calculation. args (optional): Default args will be used if this parameter is not provided. If provided, it should be a dict containing the args that should be changed in the default args. use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only. 
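# Hedged sketch of the multi-label API documented above, not part of the patch.
# The 'labels' column holds 0/1 lists of length num_labels, which is what the
# LRAP metric and the per-label 'threshold' handling in predict() operate on;
# the checkpoint and data are assumptions made for illustration.
import pandas as pd
from simpletransformers.classification import MultiLabelClassificationModel

train_df = pd.DataFrame(
    [["great plot, poor acting", [1, 1, 0]], ["flawless", [0, 0, 1]]],
    columns=["text", "labels"],
)

model = MultiLabelClassificationModel(
    "roberta",
    "roberta-base",
    num_labels=3,
    use_cuda=False,
    args={"threshold": [0.5, 0.5, 0.7]},  # a list applies one cutoff per label
)
model.train_model(train_df)
preds, raw_outputs = model.predict(["another review"])  # preds are 0/1 per label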
- """ + """ # noqa: ignore flake8 MODEL_CLASSES = { - 'bert': (BertConfig, BertForMultiLabelSequenceClassification, BertTokenizer), - 'roberta': (RobertaConfig, RobertaForMultiLabelSequenceClassification, RobertaTokenizer), - 'xlnet': (XLNetConfig, XLNetForMultiLabelSequenceClassification, XLNetTokenizer), - 'xlm': (XLMConfig, XLMForMultiLabelSequenceClassification, XLMTokenizer), - 'distilbert': (DistilBertConfig, DistilBertForMultiLabelSequenceClassification, DistilBertTokenizer), - 'albert': (AlbertConfig, AlbertForMultiLabelSequenceClassification, AlbertTokenizer) + "bert": ( + BertConfig, + BertForMultiLabelSequenceClassification, + BertTokenizer, + ), + "roberta": ( + RobertaConfig, + RobertaForMultiLabelSequenceClassification, + RobertaTokenizer, + ), + "xlnet": ( + XLNetConfig, + XLNetForMultiLabelSequenceClassification, + XLNetTokenizer, + ), + "xlm": (XLMConfig, XLMForMultiLabelSequenceClassification, XLMTokenizer), + "distilbert": ( + DistilBertConfig, + DistilBertForMultiLabelSequenceClassification, + DistilBertTokenizer, + ), + "albert": ( + AlbertConfig, + AlbertForMultiLabelSequenceClassification, + AlbertTokenizer, + ), } config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type] if num_labels: - self.config = config_class.from_pretrained(model_name, num_labels=num_labels) + self.config = config_class.from_pretrained( + model_name, num_labels=num_labels + ) self.num_labels = num_labels else: self.config = config_class.from_pretrained(model_name) @@ -58,52 +92,92 @@ def __init__(self, model_type, model_name, num_labels=None, pos_weight=None, arg if torch.cuda.is_available(): self.device = torch.device("cuda") else: - raise ValueError("'use_cuda' set to True when cuda is unavailable. Make sure CUDA is available or set use_cuda=False.") + raise ValueError( + "'use_cuda' set to True when cuda is unavailable." + "Make sure CUDA is available or set use_cuda=False." 
+ ) else: self.device = "cpu" if self.pos_weight: - self.model = model_class.from_pretrained(model_name, config=self.config, pos_weight=torch.Tensor(self.pos_weight).to(self.device)) + self.model = model_class.from_pretrained( + model_name, + config=self.config, + pos_weight=torch.Tensor(self.pos_weight).to(self.device), + ) else: self.model = model_class.from_pretrained(model_name, config=self.config) self.results = {} self.args = { - 'threshold': 0.5, - - 'sliding_window': False, - 'tie_value': 1, - 'stride': False, + "threshold": 0.5, + "sliding_window": False, + "tie_value": 1, + "stride": False, } self.args.update(global_args) if not use_cuda: - self.args['fp16'] = False + self.args["fp16"] = False if args: self.args.update(args) - self.tokenizer = tokenizer_class.from_pretrained(model_name, do_lower_case=self.args['do_lower_case']) + self.tokenizer = tokenizer_class.from_pretrained( + model_name, do_lower_case=self.args["do_lower_case"] + ) self.args["model_name"] = model_name self.args["model_type"] = model_type - def train_model(self, train_df, multi_label=True, eval_df=None, output_dir=None, show_running_loss=True, args=None, **kwargs): - return super().train_model(train_df, multi_label=multi_label, eval_df=eval_df, output_dir=output_dir, show_running_loss=show_running_loss, args=args) - - def eval_model(self, eval_df, multi_label=True, output_dir=None, verbose=False, **kwargs): - return super().eval_model(eval_df, output_dir=output_dir, multi_label=multi_label, verbose=verbose, **kwargs) - - def evaluate(self, eval_df, output_dir, multi_label=True, prefix='', **kwargs): - return super().evaluate(eval_df, output_dir, multi_label=multi_label, prefix=prefix, **kwargs) - - def load_and_cache_examples(self, examples, evaluate=False, no_cache=False, multi_label=True): - return super().load_and_cache_examples(examples, evaluate=evaluate, no_cache=no_cache, multi_label=multi_label) + def train_model( + self, + train_df, + multi_label=True, + eval_df=None, + output_dir=None, + show_running_loss=True, + args=None, + **kwargs + ): + return super().train_model( + train_df, + multi_label=multi_label, + eval_df=eval_df, + output_dir=output_dir, + show_running_loss=show_running_loss, + args=args, + ) + + def eval_model( + self, eval_df, multi_label=True, output_dir=None, verbose=False, **kwargs + ): + return super().eval_model( + eval_df, + output_dir=output_dir, + multi_label=multi_label, + verbose=verbose, + **kwargs + ) + + def evaluate(self, eval_df, output_dir, multi_label=True, prefix="", **kwargs): + return super().evaluate( + eval_df, output_dir, multi_label=multi_label, prefix=prefix, **kwargs + ) + + def load_and_cache_examples( + self, examples, evaluate=False, no_cache=False, multi_label=True + ): + return super().load_and_cache_examples( + examples, evaluate=evaluate, no_cache=no_cache, multi_label=multi_label + ) def compute_metrics(self, preds, labels, eval_examples, multi_label=True, **kwargs): - return super().compute_metrics(preds, labels, eval_examples, multi_label=multi_label, **kwargs) + return super().compute_metrics( + preds, labels, eval_examples, multi_label=multi_label, **kwargs + ) def predict(self, to_predict, multi_label=True): - return super().predict(to_predict, multi_label=multi_label) \ No newline at end of file + return super().predict(to_predict, multi_label=multi_label) diff --git a/simpletransformers/config/global_args.py b/simpletransformers/config/global_args.py index 77e85098..fe9999c6 100644 --- a/simpletransformers/config/global_args.py +++ 
b/simpletransformers/config/global_args.py @@ -2,41 +2,36 @@ global_args = { - 'output_dir': 'outputs/', - 'cache_dir': 'cache_dir/', - - 'fp16': True, - 'fp16_opt_level': 'O1', - 'max_seq_length': 128, - 'train_batch_size': 8, - 'gradient_accumulation_steps': 1, - 'eval_batch_size': 8, - 'num_train_epochs': 1, - 'weight_decay': 0, - 'learning_rate': 4e-5, - 'adam_epsilon': 1e-8, - 'warmup_ratio': 0.06, - 'warmup_steps': 0, - 'max_grad_norm': 1.0, - 'do_lower_case': False, - - 'logging_steps': 50, - 'save_steps': 2000, - 'save_model_every_epoch': True, - 'evaluate_during_training': False, - 'evaluate_during_training_steps': 2000, - 'use_cached_eval_features': True, - 'save_eval_checkpoints': True, - 'tensorboard_dir': None, - - 'overwrite_output_dir': False, - 'reprocess_input_data': False, - - 'process_count': cpu_count() - 2 if cpu_count() > 2 else 1, - 'n_gpu': 1, - 'use_multiprocessing': True, - 'silent': False, - - 'wandb_project': None, - 'wandb_kwargs': {}, -} \ No newline at end of file + "output_dir": "outputs/", + "cache_dir": "cache_dir/", + "fp16": True, + "fp16_opt_level": "O1", + "max_seq_length": 128, + "train_batch_size": 8, + "gradient_accumulation_steps": 1, + "eval_batch_size": 8, + "num_train_epochs": 1, + "weight_decay": 0, + "learning_rate": 4e-5, + "adam_epsilon": 1e-8, + "warmup_ratio": 0.06, + "warmup_steps": 0, + "max_grad_norm": 1.0, + "do_lower_case": False, + "logging_steps": 50, + "save_steps": 2000, + "save_model_every_epoch": True, + "evaluate_during_training": False, + "evaluate_during_training_steps": 2000, + "use_cached_eval_features": True, + "save_eval_checkpoints": True, + "tensorboard_dir": None, + "overwrite_output_dir": False, + "reprocess_input_data": False, + "process_count": cpu_count() - 2 if cpu_count() > 2 else 1, + "n_gpu": 1, + "use_multiprocessing": True, + "silent": False, + "wandb_project": None, + "wandb_kwargs": {}, +} diff --git a/simpletransformers/ner/__init__.py b/simpletransformers/ner/__init__.py index c597ea4c..bae54d6e 100755 --- a/simpletransformers/ner/__init__.py +++ b/simpletransformers/ner/__init__.py @@ -1 +1 @@ -from simpletransformers.ner.ner_model import NERModel \ No newline at end of file +from simpletransformers.ner.ner_model import NERModel diff --git a/simpletransformers/ner/ner_model.py b/simpletransformers/ner/ner_model.py index e510373c..39c629b7 100755 --- a/simpletransformers/ner/ner_model.py +++ b/simpletransformers/ner/ner_model.py @@ -17,28 +17,54 @@ from tqdm.auto import trange, tqdm from torch.nn import CrossEntropyLoss -from torch.utils.data import ( - DataLoader, - RandomSampler, - SequentialSampler, - TensorDataset -) +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset from transformers import AdamW, get_linear_schedule_with_warmup -from transformers import WEIGHTS_NAME, BertConfig, BertForTokenClassification, BertTokenizer -from transformers import DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer +from transformers import ( + WEIGHTS_NAME, + BertConfig, + BertForTokenClassification, + BertTokenizer, +) +from transformers import ( + DistilBertConfig, + DistilBertForTokenClassification, + DistilBertTokenizer, +) from transformers import RobertaConfig, RobertaForTokenClassification, RobertaTokenizer -from transformers import XLMRobertaConfig, XLMRobertaForTokenClassification, XLMRobertaTokenizer +from transformers import ( + XLMRobertaConfig, + XLMRobertaForTokenClassification, + XLMRobertaTokenizer, +) -from simpletransformers.ner.ner_utils 
import InputExample, convert_examples_to_features, get_labels, read_examples_from_file, get_examples_from_df -from transformers import CamembertConfig, CamembertForTokenClassification, CamembertTokenizer +from simpletransformers.ner.ner_utils import ( + InputExample, + convert_examples_to_features, + get_labels, + read_examples_from_file, + get_examples_from_df, +) +from transformers import ( + CamembertConfig, + CamembertForTokenClassification, + CamembertTokenizer, +) from simpletransformers.config.global_args import global_args import wandb class NERModel: - def __init__(self, model_type, model_name, labels=None, args=None, use_cuda=True, cuda_device=-1): + def __init__( + self, + model_type, + model_name, + labels=None, + args=None, + use_cuda=True, + cuda_device=-1, + ): """ Initializes a NERModel @@ -54,15 +80,37 @@ def __init__(self, model_type, model_name, labels=None, args=None, use_cuda=True if labels: self.labels = labels else: - self.labels = ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"] + self.labels = [ + "O", + "B-MISC", + "I-MISC", + "B-PER", + "I-PER", + "B-ORG", + "I-ORG", + "B-LOC", + "I-LOC", + ] self.num_labels = len(self.labels) MODEL_CLASSES = { - 'bert': (BertConfig, BertForTokenClassification, BertTokenizer), - 'roberta': (RobertaConfig, RobertaForTokenClassification, RobertaTokenizer), - 'distilbert': (DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer), - 'camembert': (CamembertConfig, CamembertForTokenClassification, CamembertTokenizer), - 'xlmroberta': (XLMRobertaConfig, XLMRobertaForTokenClassification, XLMRobertaTokenizer), + "bert": (BertConfig, BertForTokenClassification, BertTokenizer), + "roberta": (RobertaConfig, RobertaForTokenClassification, RobertaTokenizer), + "distilbert": ( + DistilBertConfig, + DistilBertForTokenClassification, + DistilBertTokenizer, + ), + "camembert": ( + CamembertConfig, + CamembertForTokenClassification, + CamembertTokenizer, + ), + "xlmroberta": ( + XLMRobertaConfig, + XLMRobertaForTokenClassification, + XLMRobertaTokenizer, + ), } config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type] @@ -76,7 +124,9 @@ def __init__(self, model_type, model_name, labels=None, args=None, use_cuda=True else: self.device = torch.device(f"cuda:{cuda_device}") else: - raise ValueError("'use_cuda' set to True when cuda is unavailable. Make sure CUDA is available or set use_cuda=False.") + raise ValueError( + "'use_cuda' set to True when cuda is unavailable. Make sure CUDA is available or set use_cuda=False." 
+ ) else: self.device = "cpu" @@ -87,23 +137,34 @@ def __init__(self, model_type, model_name, labels=None, args=None, use_cuda=True self.args.update(global_args) if not use_cuda: - self.args['fp16'] = False + self.args["fp16"] = False if args: self.args.update(args) - self.tokenizer = tokenizer_class.from_pretrained(model_name, do_lower_case=self.args['do_lower_case']) + self.tokenizer = tokenizer_class.from_pretrained( + model_name, do_lower_case=self.args["do_lower_case"] + ) - self.args['model_name'] = model_name - self.args['model_type'] = model_type + self.args["model_name"] = model_name + self.args["model_type"] = model_type self.pad_token_label_id = CrossEntropyLoss().ignore_index - if model_type == 'camembert': - warnings.warn("use_multiprocessing automatically disabled as CamemBERT fails when using multiprocessing for feature conversion.") - self.args['use_multiprocessing'] = False - - def train_model(self, train_data, output_dir=None, show_running_loss=True, args=None, eval_df=None): + if model_type == "camembert": + warnings.warn( + "use_multiprocessing automatically disabled as CamemBERT fails when using multiprocessing for feature conversion." + ) + self.args["use_multiprocessing"] = False + + def train_model( + self, + train_data, + output_dir=None, + show_running_loss=True, + args=None, + eval_df=None, + ): """ Trains the model using 'train_data' @@ -124,17 +185,27 @@ def train_model(self, train_data, output_dir=None, show_running_loss=True, args= if args: self.args.update(args) - if self.args['silent']: + if self.args["silent"]: show_running_loss = False - if self.args['evaluate_during_training'] and eval_df is None: - raise ValueError("evaluate_during_training is enabled but eval_df is not specified. Pass eval_df to model.train_model() if using evaluate_during_training.") + if self.args["evaluate_during_training"] and eval_df is None: + raise ValueError( + "evaluate_during_training is enabled but eval_df is not specified. Pass eval_df to model.train_model() if using evaluate_during_training." + ) if not output_dir: - output_dir = self.args['output_dir'] + output_dir = self.args["output_dir"] - if os.path.exists(output_dir) and os.listdir(output_dir) and not self.args["overwrite_output_dir"]: - raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(output_dir)) + if ( + os.path.exists(output_dir) + and os.listdir(output_dir) + and not self.args["overwrite_output_dir"] + ): + raise ValueError( + "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format( + output_dir + ) + ) self._move_model_to_device() @@ -143,14 +214,25 @@ def train_model(self, train_data, output_dir=None, show_running_loss=True, args= if not os.path.exists(output_dir): os.makedirs(output_dir) - global_step, tr_loss = self.train(train_dataset, output_dir, show_running_loss=show_running_loss, eval_df=eval_df) + global_step, tr_loss = self.train( + train_dataset, + output_dir, + show_running_loss=show_running_loss, + eval_df=eval_df, + ) - model_to_save = self.model.module if hasattr(self.model, "module") else self.model + model_to_save = ( + self.model.module if hasattr(self.model, "module") else self.model + ) model_to_save.save_pretrained(output_dir) self.tokenizer.save_pretrained(output_dir) torch.save(self.args, os.path.join(output_dir, "training_args.bin")) - print("Training of {} model complete. 
Saved to {}.".format(self.args["model_type"], output_dir)) + print( + "Training of {} model complete. Saved to {}.".format( + self.args["model_type"], output_dir + ) + ) def train(self, train_dataset, output_dir, show_running_loss=True, eval_df=None): """ @@ -166,31 +248,61 @@ def train(self, train_dataset, output_dir, show_running_loss=True, eval_df=None) tb_writer = SummaryWriter(logdir=args["tensorboard_dir"]) train_sampler = RandomSampler(train_dataset) - train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args["train_batch_size"]) + train_dataloader = DataLoader( + train_dataset, sampler=train_sampler, batch_size=args["train_batch_size"] + ) - t_total = len(train_dataloader) // args["gradient_accumulation_steps"] * args["num_train_epochs"] + t_total = ( + len(train_dataloader) + // args["gradient_accumulation_steps"] + * args["num_train_epochs"] + ) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ - {"params": [p for n, p in model.named_parameters() if not any( - nd in n for nd in no_decay)], "weight_decay": args["weight_decay"]}, - {"params": [p for n, p in model.named_parameters() if any( - nd in n for nd in no_decay)], "weight_decay": 0.0} + { + "params": [ + p + for n, p in model.named_parameters() + if not any(nd in n for nd in no_decay) + ], + "weight_decay": args["weight_decay"], + }, + { + "params": [ + p + for n, p in model.named_parameters() + if any(nd in n for nd in no_decay) + ], + "weight_decay": 0.0, + }, ] warmup_steps = math.ceil(t_total * args["warmup_ratio"]) - args["warmup_steps"] = warmup_steps if args["warmup_steps"] == 0 else args["warmup_steps"] - - optimizer = AdamW(optimizer_grouped_parameters, lr=args["learning_rate"], eps=args["adam_epsilon"]) - scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args["warmup_steps"], num_training_steps=t_total) + args["warmup_steps"] = ( + warmup_steps if args["warmup_steps"] == 0 else args["warmup_steps"] + ) + + optimizer = AdamW( + optimizer_grouped_parameters, + lr=args["learning_rate"], + eps=args["adam_epsilon"], + ) + scheduler = get_linear_schedule_with_warmup( + optimizer, num_warmup_steps=args["warmup_steps"], num_training_steps=t_total + ) if args["fp16"]: try: from apex import amp except ImportError: - raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") + raise ImportError( + "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." 
+ ) - model, optimizer = amp.initialize(model, optimizer, opt_level=args["fp16_opt_level"]) + model, optimizer = amp.initialize( + model, optimizer, opt_level=args["fp16_opt_level"] + ) if args["n_gpu"] > 1: model = torch.nn.DataParallel(model) @@ -198,41 +310,49 @@ def train(self, train_dataset, output_dir, show_running_loss=True, eval_df=None) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() - train_iterator = trange(int(args["num_train_epochs"]), desc="Epoch", disable=args['silent']) + train_iterator = trange( + int(args["num_train_epochs"]), desc="Epoch", disable=args["silent"] + ) epoch_number = 0 - if args['evaluate_during_training']: + if args["evaluate_during_training"]: training_progress_scores = { - 'global_step': [], - 'precision': [], - 'recall': [], - 'f1_score': [], - 'train_loss': [], - 'eval_loss': [], + "global_step": [], + "precision": [], + "recall": [], + "f1_score": [], + "train_loss": [], + "eval_loss": [], } - if args['wandb_project']: - wandb.init(project=args['wandb_project'], config={**args}) + if args["wandb_project"]: + wandb.init(project=args["wandb_project"], config={**args}) wandb.watch(self.model) model.train() for _ in train_iterator: # epoch_iterator = tqdm(train_dataloader, desc="Iteration") - for step, batch in enumerate(tqdm(train_dataloader, desc="Current iteration", disable=args['silent'])): + for step, batch in enumerate( + tqdm(train_dataloader, desc="Current iteration", disable=args["silent"]) + ): batch = tuple(t.to(device) for t in batch) - inputs = {"input_ids": batch[0], - "attention_mask": batch[1], - "labels": batch[3]} + inputs = { + "input_ids": batch[0], + "attention_mask": batch[1], + "labels": batch[3], + } # XLM and RoBERTa don"t use segment_ids - if args['model_type'] in ["bert", "xlnet"]: + if args["model_type"] in ["bert", "xlnet"]: inputs["token_type_ids"] = batch[2] outputs = model(**inputs) # model outputs are always tuple in pytorch-transformers (see doc) loss = outputs[0] - if args['n_gpu'] > 1: - loss = loss.mean() # mean() to average on multi-gpu parallel training + if args["n_gpu"] > 1: + loss = ( + loss.mean() + ) # mean() to average on multi-gpu parallel training current_loss = loss.item() @@ -245,10 +365,14 @@ def train(self, train_dataset, output_dir, show_running_loss=True, eval_df=None) if args["fp16"]: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() - torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args["max_grad_norm"]) + torch.nn.utils.clip_grad_norm_( + amp.master_params(optimizer), args["max_grad_norm"] + ) else: loss.backward() - torch.nn.utils.clip_grad_norm_(model.parameters(), args["max_grad_norm"]) + torch.nn.utils.clip_grad_norm_( + model.parameters(), args["max_grad_norm"] + ) tr_loss += loss.item() if (step + 1) % args["gradient_accumulation_steps"] == 0: @@ -257,69 +381,107 @@ def train(self, train_dataset, output_dir, show_running_loss=True, eval_df=None) model.zero_grad() global_step += 1 - if args["logging_steps"] > 0 and global_step % args["logging_steps"] == 0: + if ( + args["logging_steps"] > 0 + and global_step % args["logging_steps"] == 0 + ): # Log metrics tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) - tb_writer.add_scalar("loss", (tr_loss - logging_loss)/args["logging_steps"], global_step) + tb_writer.add_scalar( + "loss", + (tr_loss - logging_loss) / args["logging_steps"], + global_step, + ) logging_loss = tr_loss - if args['wandb_project']: - wandb.log({'Training loss': current_loss, 'lr': scheduler.get_lr()[0], 
'global_step': global_step}) + if args["wandb_project"]: + wandb.log( + { + "Training loss": current_loss, + "lr": scheduler.get_lr()[0], + "global_step": global_step, + } + ) if args["save_steps"] > 0 and global_step % args["save_steps"] == 0: # Save model checkpoint - output_dir_current = os.path.join(output_dir, "checkpoint-{}".format(global_step)) + output_dir_current = os.path.join( + output_dir, "checkpoint-{}".format(global_step) + ) if not os.path.exists(output_dir_current): os.makedirs(output_dir_current) # Take care of distributed/parallel training - model_to_save = model.module if hasattr(model, "module") else model + model_to_save = ( + model.module if hasattr(model, "module") else model + ) model_to_save.save_pretrained(output_dir_current) self.tokenizer.save_pretrained(output_dir_current) - if args['evaluate_during_training'] and (args["evaluate_during_training_steps"] > 0 and global_step % args["evaluate_during_training_steps"] == 0): + if args["evaluate_during_training"] and ( + args["evaluate_during_training_steps"] > 0 + and global_step % args["evaluate_during_training_steps"] == 0 + ): # Only evaluate when single GPU otherwise metrics may not average well results, _, _ = self.eval_model(eval_df, verbose=True) for key, value in results.items(): - tb_writer.add_scalar('eval_{}'.format(key), value, global_step) + tb_writer.add_scalar( + "eval_{}".format(key), value, global_step + ) - output_dir_current = os.path.join(output_dir, "checkpoint-{}".format(global_step)) + output_dir_current = os.path.join( + output_dir, "checkpoint-{}".format(global_step) + ) if not os.path.exists(output_dir_current): os.makedirs(output_dir_current) - if args['save_eval_checkpoints']: - model_to_save = model.module if hasattr(model, "module") else model + if args["save_eval_checkpoints"]: + model_to_save = ( + model.module if hasattr(model, "module") else model + ) model_to_save.save_pretrained(output_dir_current) self.tokenizer.save_pretrained(output_dir_current) - output_eval_file = os.path.join(output_dir_current, "eval_results.txt") + output_eval_file = os.path.join( + output_dir_current, "eval_results.txt" + ) with open(output_eval_file, "w") as writer: for key in sorted(results.keys()): writer.write("{} = {}\n".format(key, str(results[key]))) - training_progress_scores['global_step'].append(global_step) - training_progress_scores['train_loss'].append(current_loss) + training_progress_scores["global_step"].append(global_step) + training_progress_scores["train_loss"].append(current_loss) for key in results: training_progress_scores[key].append(results[key]) report = pd.DataFrame(training_progress_scores) - report.to_csv(args['output_dir'] + 'training_progress_scores.csv', index=False) + report.to_csv( + args["output_dir"] + "training_progress_scores.csv", + index=False, + ) - if args['wandb_project']: + if args["wandb_project"]: wandb.log(self._get_last_metrics(training_progress_scores)) epoch_number += 1 - output_dir_current = os.path.join(output_dir, "checkpoint-{}-epoch-{}".format(global_step, epoch_number)) + output_dir_current = os.path.join( + output_dir, "checkpoint-{}-epoch-{}".format(global_step, epoch_number) + ) - if (args['save_model_every_epoch'] or args['evaluate_during_training']) and not os.path.exists(output_dir_current): + if ( + args["save_model_every_epoch"] or args["evaluate_during_training"] + ) and not os.path.exists(output_dir_current): os.makedirs(output_dir_current) - if args['save_model_every_epoch'] and epoch_number != args['num_train_epochs']: + if ( + 
args["save_model_every_epoch"] + and epoch_number != args["num_train_epochs"] + ): model_to_save = model.module if hasattr(model, "module") else model model_to_save.save_pretrained(output_dir_current) self.tokenizer.save_pretrained(output_dir_current) - if args['evaluate_during_training']: + if args["evaluate_during_training"]: results, _, _ = self.eval_model(eval_df, verbose=True) output_eval_file = os.path.join(output_dir_current, "eval_results.txt") @@ -379,7 +541,9 @@ def evaluate(self, eval_dataset, output_dir): results = {} eval_sampler = SequentialSampler(eval_dataset) - eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args["eval_batch_size"]) + eval_dataloader = DataLoader( + eval_dataset, sampler=eval_sampler, batch_size=args["eval_batch_size"] + ) eval_loss = 0.0 nb_eval_steps = 0 @@ -387,15 +551,17 @@ def evaluate(self, eval_dataset, output_dir): out_label_ids = None model.eval() - for batch in tqdm(eval_dataloader, disable=args['silent']): + for batch in tqdm(eval_dataloader, disable=args["silent"]): batch = tuple(t.to(device) for t in batch) with torch.no_grad(): - inputs = {"input_ids": batch[0], - "attention_mask": batch[1], - "labels": batch[3]} + inputs = { + "input_ids": batch[0], + "attention_mask": batch[1], + "labels": batch[3], + } # XLM and RoBERTa don"t use segment_ids - if args['model_type'] in ["bert", "xlnet"]: + if args["model_type"] in ["bert", "xlnet"]: inputs["token_type_ids"] = batch[2] outputs = model(**inputs) tmp_eval_loss, logits = outputs[:2] @@ -409,7 +575,9 @@ def evaluate(self, eval_dataset, output_dir): out_label_ids = inputs["labels"].detach().cpu().numpy() else: preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) - out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0) + out_label_ids = np.append( + out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0 + ) eval_loss = eval_loss / nb_eval_steps model_outputs = preds @@ -430,7 +598,7 @@ def evaluate(self, eval_dataset, output_dir): "eval_loss": eval_loss, "precision": precision_score(out_label_list, preds_list), "recall": recall_score(out_label_list, preds_list), - "f1_score": f1_score(out_label_list, preds_list) + "f1_score": f1_score(out_label_list, preds_list), } results.update(result) @@ -462,12 +630,17 @@ def predict(self, to_predict): self._move_model_to_device() - predict_examples = [InputExample(i, sentence.split(), ["O" for word in sentence.split()]) for i, sentence in enumerate(to_predict)] + predict_examples = [ + InputExample(i, sentence.split(), ["O" for word in sentence.split()]) + for i, sentence in enumerate(to_predict) + ] eval_dataset = self.load_and_cache_examples(None, to_predict=predict_examples) eval_sampler = SequentialSampler(eval_dataset) - eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args["eval_batch_size"]) + eval_dataloader = DataLoader( + eval_dataset, sampler=eval_sampler, batch_size=args["eval_batch_size"] + ) eval_loss = 0.0 nb_eval_steps = 0 @@ -475,15 +648,17 @@ def predict(self, to_predict): out_label_ids = None model.eval() - for batch in tqdm(eval_dataloader, disable=args['silent']): + for batch in tqdm(eval_dataloader, disable=args["silent"]): batch = tuple(t.to(device) for t in batch) with torch.no_grad(): - inputs = {"input_ids": batch[0], - "attention_mask": batch[1], - "labels": batch[3]} + inputs = { + "input_ids": batch[0], + "attention_mask": batch[1], + "labels": batch[3], + } # XLM and RoBERTa don"t use segment_ids - if args['model_type'] in 
["bert", "xlnet"]: + if args["model_type"] in ["bert", "xlnet"]: inputs["token_type_ids"] = batch[2] outputs = model(**inputs) tmp_eval_loss, logits = outputs[:2] @@ -497,7 +672,9 @@ def predict(self, to_predict): out_label_ids = inputs["labels"].detach().cpu().numpy() else: preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) - out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0) + out_label_ids = np.append( + out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0 + ) eval_loss = eval_loss / nb_eval_steps model_outputs = preds @@ -514,11 +691,19 @@ def predict(self, to_predict): out_label_list[i].append(label_map[out_label_ids[i][j]]) preds_list[i].append(label_map[preds[i][j]]) - preds = [[{word: preds_list[i][j]} for j, word in enumerate(sentence.split()[:len(preds_list[i])])] for i, sentence in enumerate(to_predict)] + preds = [ + [ + {word: preds_list[i][j]} + for j, word in enumerate(sentence.split()[: len(preds_list[i])]) + ] + for i, sentence in enumerate(to_predict) + ] return preds, model_outputs - def load_and_cache_examples(self, data, evaluate=False, no_cache=False, to_predict=None): + def load_and_cache_examples( + self, data, evaluate=False, no_cache=False, to_predict=None + ): """ Reads data_file and generates a TensorDataset containing InputFeatures. Caches the InputFeatures. Utility function for train() and eval() methods. Not intended to be used directly. @@ -548,12 +733,24 @@ def load_and_cache_examples(self, data, evaluate=False, no_cache=False, to_predi examples = to_predict no_cache = True - cached_features_file = os.path.join(args["cache_dir"], "cached_{}_{}_{}_{}_{}".format(mode, args["model_type"], args["max_seq_length"], self.num_labels, len(examples))) + cached_features_file = os.path.join( + args["cache_dir"], + "cached_{}_{}_{}_{}_{}".format( + mode, + args["model_type"], + args["max_seq_length"], + self.num_labels, + len(examples), + ), + ) if not os.path.isdir(self.args["cache_dir"]): os.mkdir(self.args["cache_dir"]) - if os.path.exists(cached_features_file) and ((not args["reprocess_input_data"] and not no_cache) or (mode == "dev" and args['use_cached_eval_features'])): + if os.path.exists(cached_features_file) and ( + (not args["reprocess_input_data"] and not no_cache) + or (mode == "dev" and args["use_cached_eval_features"]) + ): features = torch.load(cached_features_file) print(f"Features loaded from cache at {cached_features_file}") else: @@ -561,7 +758,7 @@ def load_and_cache_examples(self, data, evaluate=False, no_cache=False, to_predi features = convert_examples_to_features( examples, self.labels, - self.args['max_seq_length'], + self.args["max_seq_length"], self.tokenizer, # XLNet has a CLS token at the end cls_token_at_end=bool(args["model_type"] in ["xlnet"]), @@ -576,19 +773,25 @@ def load_and_cache_examples(self, data, evaluate=False, no_cache=False, to_predi pad_token_segment_id=4 if args["model_type"] in ["xlnet"] else 0, pad_token_label_id=self.pad_token_label_id, process_count=process_count, - silent=args['silent'], - use_multiprocessing=args['use_multiprocessing'] + silent=args["silent"], + use_multiprocessing=args["use_multiprocessing"], ) if not no_cache: torch.save(features, cached_features_file) all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) - all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) - all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long) + all_input_mask = torch.tensor( + 
[f.input_mask for f in features], dtype=torch.long + ) + all_segment_ids = torch.tensor( + [f.segment_ids for f in features], dtype=torch.long + ) all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long) - dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) + dataset = TensorDataset( + all_input_ids, all_input_mask, all_segment_ids, all_label_ids + ) return dataset diff --git a/simpletransformers/ner/ner_utils.py b/simpletransformers/ner/ner_utils.py index 99845411..a9483843 100755 --- a/simpletransformers/ner/ner_utils.py +++ b/simpletransformers/ner/ner_utils.py @@ -17,12 +17,9 @@ from __future__ import absolute_import, division, print_function -import logging -import os from io import open from multiprocessing import Pool, cpu_count from tqdm.auto import tqdm -import pandas as pd class InputExample(object): @@ -61,9 +58,13 @@ def read_examples_from_file(data_file, mode): for line in f: if line.startswith("-DOCSTART-") or line == "" or line == "\n": if words: - examples.append(InputExample(guid="{}-{}".format(mode, guid_index), - words=words, - labels=labels)) + examples.append( + InputExample( + guid="{}-{}".format(mode, guid_index), + words=words, + labels=labels, + ) + ) guid_index += 1 words = [] labels = [] @@ -76,18 +77,43 @@ def read_examples_from_file(data_file, mode): # Examples could have no label for mode = "test" labels.append("O") if words: - examples.append(InputExample(guid="%s-%d".format(mode, guid_index), - words=words, - labels=labels)) + examples.append( + InputExample( + guid="%s-%d".format(mode, guid_index), words=words, labels=labels + ) + ) return examples def get_examples_from_df(data): - return [InputExample(guid=sentence_id, words=sentence_df['words'].tolist(), labels=sentence_df['labels'].tolist()) for sentence_id, sentence_df in data.groupby(['sentence_id'])] + return [ + InputExample( + guid=sentence_id, + words=sentence_df["words"].tolist(), + labels=sentence_df["labels"].tolist(), + ) + for sentence_id, sentence_df in data.groupby(["sentence_id"]) + ] def convert_example_to_feature(example_row): - example, label_map, max_seq_length, tokenizer, cls_token_at_end, cls_token, cls_token_segment_id, sep_token, sep_token_extra, pad_on_left, pad_token, pad_token_segment_id, pad_token_label_id, sequence_a_segment_id, mask_padding_with_zero = example_row + ( + example, + label_map, + max_seq_length, + tokenizer, + cls_token_at_end, + cls_token, + cls_token_segment_id, + sep_token, + sep_token_extra, + pad_on_left, + pad_token, + pad_token_segment_id, + pad_token_label_id, + sequence_a_segment_id, + mask_padding_with_zero, + ) = example_row tokens = [] label_ids = [] @@ -95,13 +121,15 @@ def convert_example_to_feature(example_row): word_tokens = tokenizer.tokenize(word) tokens.extend(word_tokens) # Use the real label id for the first token of the word, and padding ids for the remaining tokens - label_ids.extend([label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1)) + label_ids.extend( + [label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1) + ) # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa. 
special_tokens_count = 3 if sep_token_extra else 2 if len(tokens) > max_seq_length - special_tokens_count: - tokens = tokens[:(max_seq_length - special_tokens_count)] - label_ids = label_ids[:(max_seq_length - special_tokens_count)] + tokens = tokens[: (max_seq_length - special_tokens_count)] + label_ids = label_ids[: (max_seq_length - special_tokens_count)] # The convention in BERT is: # (a) For sequence pairs: @@ -148,14 +176,16 @@ def convert_example_to_feature(example_row): padding_length = max_seq_length - len(input_ids) if pad_on_left: input_ids = ([pad_token] * padding_length) + input_ids - input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask + input_mask = ( + [0 if mask_padding_with_zero else 1] * padding_length + ) + input_mask segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids label_ids = ([pad_token_label_id] * padding_length) + label_ids else: - input_ids += ([pad_token] * padding_length) - input_mask += ([0 if mask_padding_with_zero else 1] * padding_length) - segment_ids += ([pad_token_segment_id] * padding_length) - label_ids += ([pad_token_label_id] * padding_length) + input_ids += [pad_token] * padding_length + input_mask += [0 if mask_padding_with_zero else 1] * padding_length + segment_ids += [pad_token_segment_id] * padding_length + label_ids += [pad_token_label_id] * padding_length assert len(input_ids) == max_seq_length assert len(input_mask) == max_seq_length @@ -166,30 +196,31 @@ def convert_example_to_feature(example_row): input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, - label_ids=label_ids + label_ids=label_ids, ) + def convert_examples_to_features( - examples, - label_list, - max_seq_length, - tokenizer, - cls_token_at_end=False, - cls_token="[CLS]", - cls_token_segment_id=1, - sep_token="[SEP]", - sep_token_extra=False, - pad_on_left=False, - pad_token=0, - pad_token_segment_id=0, - pad_token_label_id=-1, - sequence_a_segment_id=0, - mask_padding_with_zero=True, - process_count=cpu_count() - 2, - chunksize=500, - silent=False, - use_multiprocessing=True - ): + examples, + label_list, + max_seq_length, + tokenizer, + cls_token_at_end=False, + cls_token="[CLS]", + cls_token_segment_id=1, + sep_token="[SEP]", + sep_token_extra=False, + pad_on_left=False, + pad_token=0, + pad_token_segment_id=0, + pad_token_label_id=-1, + sequence_a_segment_id=0, + mask_padding_with_zero=True, + process_count=cpu_count() - 2, + chunksize=500, + silent=False, + use_multiprocessing=True, +): """ Loads a data file into a list of `InputBatch`s `cls_token_at_end` define the location of the CLS token: - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP] @@ -199,26 +230,36 @@ def convert_examples_to_features( label_map = {label: i for i, label in enumerate(label_list)} - examples = [( - example, - label_map, - max_seq_length, - tokenizer, - cls_token_at_end, - cls_token, - cls_token_segment_id, - sep_token, - sep_token_extra, - pad_on_left, - pad_token, - pad_token_segment_id, - pad_token_label_id, - sequence_a_segment_id, - mask_padding_with_zero) for example in examples] + examples = [ + ( + example, + label_map, + max_seq_length, + tokenizer, + cls_token_at_end, + cls_token, + cls_token_segment_id, + sep_token, + sep_token_extra, + pad_on_left, + pad_token, + pad_token_segment_id, + pad_token_label_id, + sequence_a_segment_id, + mask_padding_with_zero, + ) + for example in examples + ] if use_multiprocessing: with Pool(process_count) as p: - features = list(tqdm(p.imap(convert_example_to_feature, 
examples, chunksize=chunksize), total=len(examples), disable=silent)) + features = list( + tqdm( + p.imap(convert_example_to_feature, examples, chunksize=chunksize), + total=len(examples), + disable=silent, + ) + ) else: features = [] for example in tqdm(examples): @@ -234,4 +275,14 @@ def get_labels(path): labels = ["O"] + labels return labels else: - return ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"] \ No newline at end of file + return [ + "O", + "B-MISC", + "I-MISC", + "B-PER", + "I-PER", + "B-ORG", + "I-ORG", + "B-LOC", + "I-LOC", + ] diff --git a/simpletransformers/question_answering/__init__.py b/simpletransformers/question_answering/__init__.py index a8745088..c0c73d44 100755 --- a/simpletransformers/question_answering/__init__.py +++ b/simpletransformers/question_answering/__init__.py @@ -1 +1,3 @@ -from simpletransformers.question_answering.question_answering_model import QuestionAnsweringModel \ No newline at end of file +from simpletransformers.question_answering.question_answering_model import ( + QuestionAnsweringModel, +) diff --git a/simpletransformers/question_answering/question_answering_model.py b/simpletransformers/question_answering/question_answering_model.py index 1c6f4072..a6a9dc39 100755 --- a/simpletransformers/question_answering/question_answering_model.py +++ b/simpletransformers/question_answering/question_answering_model.py @@ -12,27 +12,36 @@ import pandas as pd from scipy.stats import pearsonr -from sklearn.metrics import mean_squared_error, matthews_corrcoef, confusion_matrix, label_ranking_average_precision_score +from sklearn.metrics import ( + mean_squared_error, + matthews_corrcoef, + confusion_matrix, + label_ranking_average_precision_score, +) from tensorboardX import SummaryWriter from tqdm.auto import trange, tqdm from torch.utils.data.distributed import DistributedSampler -from torch.utils.data import ( - DataLoader, - RandomSampler, - SequentialSampler, - TensorDataset -) +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset from transformers import AdamW, get_linear_schedule_with_warmup from transformers import ( WEIGHTS_NAME, BertConfig, - BertForQuestionAnswering, BertTokenizer, - XLMConfig, XLMForQuestionAnswering, XLMTokenizer, - XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer, - DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer, - AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer + BertForQuestionAnswering, + BertTokenizer, + XLMConfig, + XLMForQuestionAnswering, + XLMTokenizer, + XLNetConfig, + XLNetForQuestionAnswering, + XLNetTokenizer, + DistilBertConfig, + DistilBertForQuestionAnswering, + DistilBertTokenizer, + AlbertConfig, + AlbertForQuestionAnswering, + AlbertTokenizer, ) from simpletransformers.question_answering.question_answering_utils import ( @@ -45,7 +54,7 @@ to_list, build_examples, get_best_predictions, - get_best_predictions_extended + get_best_predictions_extended, ) from simpletransformers.config.global_args import global_args @@ -53,7 +62,9 @@ class QuestionAnsweringModel: - def __init__(self, model_type, model_name, args=None, use_cuda=True, cuda_device=-1): + def __init__( + self, model_type, model_name, args=None, use_cuda=True, cuda_device=-1 + ): """ Initializes a QuestionAnsweringModel model. 
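As a point of reference for the QuestionAnsweringModel hunks that follow, here is a minimal construction sketch. It relies only on the constructor signature and MODEL_CLASSES keys visible in this diff; the checkpoint name is an illustrative Hugging Face model id, not something this patch introduces or requires.

    from simpletransformers.question_answering import QuestionAnsweringModel

    # "distilbert" is one of the MODEL_CLASSES keys in the hunk below; the
    # checkpoint name is an illustrative example, not prescribed by this patch.
    model = QuestionAnsweringModel(
        "distilbert",
        "distilbert-base-uncased-distilled-squad",
        use_cuda=False,  # on CPU the constructor also forces args["fp16"] = False
    )
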
@@ -66,11 +77,15 @@ def __init__(self, model_type, model_name, args=None, use_cuda=True, cuda_device """ MODEL_CLASSES = { - 'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer), - 'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer), - 'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer), - 'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer), - 'albert': (AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer), + "bert": (BertConfig, BertForQuestionAnswering, BertTokenizer), + "xlnet": (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer), + "xlm": (XLMConfig, XLMForQuestionAnswering, XLMTokenizer), + "distilbert": ( + DistilBertConfig, + DistilBertForQuestionAnswering, + DistilBertTokenizer, + ), + "albert": (AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer), } config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type] @@ -83,37 +98,42 @@ def __init__(self, model_type, model_name, args=None, use_cuda=True, cuda_device else: self.device = torch.device(f"cuda:{cuda_device}") else: - raise ValueError("'use_cuda' set to True when cuda is unavailable. Make sure CUDA is available or set use_cuda=False.") + raise ValueError( + "'use_cuda' set to True when cuda is unavailable. Make sure CUDA is available or set use_cuda=False." + ) else: self.device = "cpu" self.results = {} self.args = { - 'doc_stride': 384, - 'max_query_length': 64, - 'n_best_size': 20, - 'max_answer_length': 100, - 'null_score_diff_threshold': 0.0, - - 'wandb_project': False, - 'wandb_kwargs': {}, + "doc_stride": 384, + "max_query_length": 64, + "n_best_size": 20, + "max_answer_length": 100, + "null_score_diff_threshold": 0.0, + "wandb_project": False, + "wandb_kwargs": {}, } self.args.update(global_args) if not use_cuda: - self.args['fp16'] = False + self.args["fp16"] = False if args: self.args.update(args) - self.tokenizer = tokenizer_class.from_pretrained(model_name, do_lower_case=self.args['do_lower_case']) + self.tokenizer = tokenizer_class.from_pretrained( + model_name, do_lower_case=self.args["do_lower_case"] + ) - self.args['model_name'] = model_name - self.args['model_type'] = model_type + self.args["model_name"] = model_name + self.args["model_type"] = model_type - def load_and_cache_examples(self, examples, evaluate=False, no_cache=False, output_examples=False): + def load_and_cache_examples( + self, examples, evaluate=False, no_cache=False, output_examples=False + ): """ Converts a list of examples to a TensorDataset containing InputFeatures. Caches the InputFeatures. 
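The QA-specific defaults reformatted above (doc_stride, max_query_length, n_best_size, max_answer_length, null_score_diff_threshold) are merged with global_args and can be overridden through the constructor's args dict. A short sketch with illustrative values, not recommendations:

    from simpletransformers.question_answering import QuestionAnsweringModel

    qa_args = {
        "doc_stride": 128,        # default shown above is 384
        "max_query_length": 64,
        "n_best_size": 10,
        "max_answer_length": 50,
        "max_seq_length": 256,    # inherited from global_args
        "fp16": False,            # also forced off whenever use_cuda=False
    }

    model = QuestionAnsweringModel(
        "bert", "bert-base-cased", args=qa_args, use_cuda=False
    )
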
@@ -129,50 +149,86 @@ def load_and_cache_examples(self, examples, evaluate=False, no_cache=False, outp examples = get_examples(examples, is_training=not evaluate) mode = "dev" if evaluate else "train" - cached_features_file = os.path.join(args["cache_dir"], "cached_{}_{}_{}_{}".format(mode, args["model_type"], args["max_seq_length"], len(examples))) - - if os.path.exists(cached_features_file) and ((not args["reprocess_input_data"] and not no_cache) or (mode == "dev" and args['use_cached_eval_features'])): + cached_features_file = os.path.join( + args["cache_dir"], + "cached_{}_{}_{}_{}".format( + mode, args["model_type"], args["max_seq_length"], len(examples) + ), + ) + + if os.path.exists(cached_features_file) and ( + (not args["reprocess_input_data"] and not no_cache) + or (mode == "dev" and args["use_cached_eval_features"]) + ): features = torch.load(cached_features_file) print(f"Features loaded from cache at {cached_features_file}") else: print(f"Converting to features started.") - features = convert_examples_to_features(examples=examples, - tokenizer=tokenizer, - max_seq_length=args['max_seq_length'], - doc_stride=args['doc_stride'], - max_query_length=args['max_query_length'], - is_training=not evaluate, - cls_token_segment_id=2 if args['model_type'] in ['xlnet'] else 0, - pad_token_segment_id=3 if args['model_type'] in ['xlnet'] else 0, - cls_token_at_end=True if args['model_type'] in ['xlnet'] else False, - sequence_a_is_doc=True if args['model_type'] in ['xlnet'] else False, - silent=args['silent'] - ) + features = convert_examples_to_features( + examples=examples, + tokenizer=tokenizer, + max_seq_length=args["max_seq_length"], + doc_stride=args["doc_stride"], + max_query_length=args["max_query_length"], + is_training=not evaluate, + cls_token_segment_id=2 if args["model_type"] in ["xlnet"] else 0, + pad_token_segment_id=3 if args["model_type"] in ["xlnet"] else 0, + cls_token_at_end=True if args["model_type"] in ["xlnet"] else False, + sequence_a_is_doc=True if args["model_type"] in ["xlnet"] else False, + silent=args["silent"], + ) if not no_cache: torch.save(features, cached_features_file) all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) - all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) - all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long) + all_input_mask = torch.tensor( + [f.input_mask for f in features], dtype=torch.long + ) + all_segment_ids = torch.tensor( + [f.segment_ids for f in features], dtype=torch.long + ) all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long) all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) if evaluate: - dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, - all_example_index, all_cls_index, all_p_mask) + dataset = TensorDataset( + all_input_ids, + all_input_mask, + all_segment_ids, + all_example_index, + all_cls_index, + all_p_mask, + ) else: - all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long) - all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long) - dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, - all_start_positions, all_end_positions, - all_cls_index, all_p_mask) + all_start_positions = torch.tensor( + [f.start_position for f in features], dtype=torch.long + ) + all_end_positions = torch.tensor( + 
[f.end_position for f in features], dtype=torch.long + ) + dataset = TensorDataset( + all_input_ids, + all_input_mask, + all_segment_ids, + all_start_positions, + all_end_positions, + all_cls_index, + all_p_mask, + ) if output_examples: return dataset, examples, features return dataset - def train_model(self, train_data, output_dir=False, show_running_loss=True, args=None, eval_data=None): + def train_model( + self, + train_data, + output_dir=False, + show_running_loss=True, + args=None, + eval_data=None, + ): """ Trains the model using 'train_data' @@ -189,22 +245,32 @@ def train_model(self, train_data, output_dir=False, show_running_loss=True, args if args: self.args.update(args) - if self.args['silent']: + if self.args["silent"]: show_running_loss = False - if self.args['evaluate_during_training'] and eval_data is None: - raise ValueError("evaluate_during_training is enabled but eval_data is not specified. Pass eval_data to model.train_model() if using evaluate_during_training.") + if self.args["evaluate_during_training"] and eval_data is None: + raise ValueError( + "evaluate_during_training is enabled but eval_data is not specified. Pass eval_data to model.train_model() if using evaluate_during_training." + ) if not output_dir: - output_dir = self.args['output_dir'] + output_dir = self.args["output_dir"] - if os.path.exists(output_dir) and os.listdir(output_dir) and not self.args["overwrite_output_dir"]: - raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(output_dir)) + if ( + os.path.exists(output_dir) + and os.listdir(output_dir) + and not self.args["overwrite_output_dir"] + ): + raise ValueError( + "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format( + output_dir + ) + ) self._move_model_to_device() if isinstance(train_data, str): - with open(train_data, 'r') as f: + with open(train_data, "r") as f: train_examples = json.load(f) else: train_examples = train_data @@ -214,14 +280,25 @@ def train_model(self, train_data, output_dir=False, show_running_loss=True, args if not os.path.exists(output_dir): os.makedirs(output_dir) - global_step, tr_loss = self.train(train_dataset, output_dir, show_running_loss=show_running_loss, eval_data=eval_data) + global_step, tr_loss = self.train( + train_dataset, + output_dir, + show_running_loss=show_running_loss, + eval_data=eval_data, + ) - model_to_save = self.model.module if hasattr(self.model, "module") else self.model + model_to_save = ( + self.model.module if hasattr(self.model, "module") else self.model + ) model_to_save.save_pretrained(output_dir) self.tokenizer.save_pretrained(output_dir) torch.save(self.args, os.path.join(output_dir, "training_args.bin")) - print("Training of {} model complete. Saved to {}.".format(self.args["model_type"], output_dir)) + print( + "Training of {} model complete. 
Saved to {}.".format( + self.args["model_type"], output_dir + ) + ) def train(self, train_dataset, output_dir, show_running_loss=True, eval_data=None): """ @@ -237,32 +314,61 @@ def train(self, train_dataset, output_dir, show_running_loss=True, eval_data=Non tb_writer = SummaryWriter(logdir=args["tensorboard_dir"]) train_sampler = RandomSampler(train_dataset) - train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args["train_batch_size"]) + train_dataloader = DataLoader( + train_dataset, sampler=train_sampler, batch_size=args["train_batch_size"] + ) - t_total = len(train_dataloader) // args["gradient_accumulation_steps"] * args["num_train_epochs"] + t_total = ( + len(train_dataloader) + // args["gradient_accumulation_steps"] + * args["num_train_epochs"] + ) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ - {"params": [p for n, p in model.named_parameters() if not any( - nd in n for nd in no_decay)], "weight_decay": args["weight_decay"]}, - {"params": [p for n, p in model.named_parameters() if any( - nd in n for nd in no_decay)], "weight_decay": 0.0} + { + "params": [ + p + for n, p in model.named_parameters() + if not any(nd in n for nd in no_decay) + ], + "weight_decay": args["weight_decay"], + }, + { + "params": [ + p + for n, p in model.named_parameters() + if any(nd in n for nd in no_decay) + ], + "weight_decay": 0.0, + }, ] warmup_steps = math.ceil(t_total * args["warmup_ratio"]) - args["warmup_steps"] = warmup_steps if args["warmup_steps"] == 0 else args["warmup_steps"] - - optimizer = AdamW(optimizer_grouped_parameters, lr=args["learning_rate"], eps=args["adam_epsilon"]) - scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args["warmup_steps"], num_training_steps=t_total) + args["warmup_steps"] = ( + warmup_steps if args["warmup_steps"] == 0 else args["warmup_steps"] + ) + + optimizer = AdamW( + optimizer_grouped_parameters, + lr=args["learning_rate"], + eps=args["adam_epsilon"], + ) + scheduler = get_linear_schedule_with_warmup( + optimizer, num_warmup_steps=args["warmup_steps"], num_training_steps=t_total + ) if args["fp16"]: try: from apex import amp except ImportError: raise ImportError( - "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") + "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." 
+ ) - model, optimizer = amp.initialize(model, optimizer, opt_level=args["fp16_opt_level"]) + model, optimizer = amp.initialize( + model, optimizer, opt_level=args["fp16_opt_level"] + ) if args["n_gpu"] > 1: model = torch.nn.DataParallel(model) @@ -270,45 +376,53 @@ def train(self, train_dataset, output_dir, show_running_loss=True, eval_data=Non global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() - train_iterator = trange(int(args["num_train_epochs"]), desc="Epoch", disable=args['silent']) + train_iterator = trange( + int(args["num_train_epochs"]), desc="Epoch", disable=args["silent"] + ) epoch_number = 0 - if args['evaluate_during_training']: + if args["evaluate_during_training"]: training_progress_scores = { - 'global_step': [], - 'correct': [], - 'similar': [], - 'incorrect': [], - 'train_loss': [], + "global_step": [], + "correct": [], + "similar": [], + "incorrect": [], + "train_loss": [], } - if args['wandb_project']: - wandb.init(project=args['wandb_project'], config={**args}) + if args["wandb_project"]: + wandb.init(project=args["wandb_project"], config={**args}) wandb.watch(self.model) model.train() for _ in train_iterator: # epoch_iterator = tqdm(train_dataloader, desc="Iteration") - for step, batch in enumerate(tqdm(train_dataloader, desc="Current iteration", disable=args['silent'])): + for step, batch in enumerate( + tqdm(train_dataloader, desc="Current iteration", disable=args["silent"]) + ): batch = tuple(t.to(device) for t in batch) - inputs = {'input_ids': batch[0], - 'attention_mask': batch[1], - 'start_positions': batch[3], - 'end_positions': batch[4] - } + inputs = { + "input_ids": batch[0], + "attention_mask": batch[1], + "start_positions": batch[3], + "end_positions": batch[4], + } - if args['model_type'] != 'distilbert': - inputs['token_type_ids'] = None if args['model_type'] == 'xlm' else batch[2] - if args['model_type'] in ['xlnet', 'xlm']: - inputs.update({'cls_index': batch[5], - 'p_mask': batch[6]}) + if args["model_type"] != "distilbert": + inputs["token_type_ids"] = ( + None if args["model_type"] == "xlm" else batch[2] + ) + if args["model_type"] in ["xlnet", "xlm"]: + inputs.update({"cls_index": batch[5], "p_mask": batch[6]}) outputs = model(**inputs) # model outputs are always tuple in pytorch-transformers (see doc) loss = outputs[0] - if args['n_gpu'] > 1: - loss = loss.mean() # mean() to average on multi-gpu parallel training + if args["n_gpu"] > 1: + loss = ( + loss.mean() + ) # mean() to average on multi-gpu parallel training current_loss = loss.item() @@ -321,10 +435,14 @@ def train(self, train_dataset, output_dir, show_running_loss=True, eval_data=Non if args["fp16"]: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() - torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args["max_grad_norm"]) + torch.nn.utils.clip_grad_norm_( + amp.master_params(optimizer), args["max_grad_norm"] + ) else: loss.backward() - torch.nn.utils.clip_grad_norm_(model.parameters(), args["max_grad_norm"]) + torch.nn.utils.clip_grad_norm_( + model.parameters(), args["max_grad_norm"] + ) tr_loss += loss.item() if (step + 1) % args["gradient_accumulation_steps"] == 0: @@ -333,68 +451,106 @@ def train(self, train_dataset, output_dir, show_running_loss=True, eval_data=Non model.zero_grad() global_step += 1 - if args["logging_steps"] > 0 and global_step % args["logging_steps"] == 0: + if ( + args["logging_steps"] > 0 + and global_step % args["logging_steps"] == 0 + ): # Log metrics tb_writer.add_scalar("lr", scheduler.get_lr()[0], 
global_step) - tb_writer.add_scalar("loss", (tr_loss - logging_loss)/args["logging_steps"], global_step) + tb_writer.add_scalar( + "loss", + (tr_loss - logging_loss) / args["logging_steps"], + global_step, + ) logging_loss = tr_loss - if args['wandb_project']: - wandb.log({'Training loss': current_loss, 'lr': scheduler.get_lr()[0], 'global_step': global_step}) + if args["wandb_project"]: + wandb.log( + { + "Training loss": current_loss, + "lr": scheduler.get_lr()[0], + "global_step": global_step, + } + ) if args["save_steps"] > 0 and global_step % args["save_steps"] == 0: # Save model checkpoint - output_dir_current = os.path.join(output_dir, "checkpoint-{}".format(global_step)) + output_dir_current = os.path.join( + output_dir, "checkpoint-{}".format(global_step) + ) if not os.path.exists(output_dir_current): os.makedirs(output_dir_current) - model_to_save = model.module if hasattr(model, "module") else model + model_to_save = ( + model.module if hasattr(model, "module") else model + ) model_to_save.save_pretrained(output_dir_current) self.tokenizer.save_pretrained(output_dir_current) - if args['evaluate_during_training'] and (args["evaluate_during_training_steps"] > 0 and global_step % args["evaluate_during_training_steps"] == 0): + if args["evaluate_during_training"] and ( + args["evaluate_during_training_steps"] > 0 + and global_step % args["evaluate_during_training_steps"] == 0 + ): # Only evaluate when single GPU otherwise metrics may not average well results, _ = self.eval_model(eval_data, verbose=True) for key, value in results.items(): - tb_writer.add_scalar('eval_{}'.format(key), value, global_step) + tb_writer.add_scalar( + "eval_{}".format(key), value, global_step + ) - output_dir_current = os.path.join(output_dir, "checkpoint-{}".format(global_step)) + output_dir_current = os.path.join( + output_dir, "checkpoint-{}".format(global_step) + ) if not os.path.exists(output_dir_current): os.makedirs(output_dir_current) - if args['save_eval_checkpoints']: - model_to_save = model.module if hasattr(model, "module") else model + if args["save_eval_checkpoints"]: + model_to_save = ( + model.module if hasattr(model, "module") else model + ) model_to_save.save_pretrained(output_dir_current) self.tokenizer.save_pretrained(output_dir_current) - output_eval_file = os.path.join(output_dir_current, "eval_results.txt") + output_eval_file = os.path.join( + output_dir_current, "eval_results.txt" + ) with open(output_eval_file, "w") as writer: for key in sorted(results.keys()): writer.write("{} = {}\n".format(key, str(results[key]))) - training_progress_scores['global_step'].append(global_step) - training_progress_scores['train_loss'].append(current_loss) + training_progress_scores["global_step"].append(global_step) + training_progress_scores["train_loss"].append(current_loss) for key in results: training_progress_scores[key].append(results[key]) report = pd.DataFrame(training_progress_scores) - report.to_csv(args['output_dir'] + 'training_progress_scores.csv', index=False) + report.to_csv( + args["output_dir"] + "training_progress_scores.csv", + index=False, + ) - if args['wandb_project']: + if args["wandb_project"]: wandb.log(self._get_last_metrics(training_progress_scores)) epoch_number += 1 - output_dir_current = os.path.join(output_dir, "checkpoint-{}-epoch-{}".format(global_step, epoch_number)) + output_dir_current = os.path.join( + output_dir, "checkpoint-{}-epoch-{}".format(global_step, epoch_number) + ) - if (args['save_model_every_epoch'] or args['evaluate_during_training']) and not 
os.path.exists(output_dir_current): + if ( + args["save_model_every_epoch"] or args["evaluate_during_training"] + ) and not os.path.exists(output_dir_current): os.makedirs(output_dir_current) - if args['save_model_every_epoch'] and epoch_number != args['num_train_epochs']: + if ( + args["save_model_every_epoch"] + and epoch_number != args["num_train_epochs"] + ): model_to_save = model.module if hasattr(model, "module") else model model_to_save.save_pretrained(output_dir_current) self.tokenizer.save_pretrained(output_dir_current) - if args['evaluate_during_training']: + if args["evaluate_during_training"]: results, _ = self.eval_model(eval_data, verbose=True) output_eval_file = os.path.join(output_dir_current, "eval_results.txt") @@ -423,10 +579,12 @@ def eval_model(self, eval_data, output_dir=None, verbose=False): self._move_model_to_device() - all_predictions, all_nbest_json, scores_diff_json = self.evaluate(eval_data, output_dir) + all_predictions, all_nbest_json, scores_diff_json = self.evaluate( + eval_data, output_dir + ) if isinstance(eval_data, str): - with open(eval_data, 'r') as f: + with open(eval_data, "r") as f: truth = json.load(f) else: truth = eval_data @@ -455,15 +613,19 @@ def evaluate(self, eval_data, output_dir): results = {} if isinstance(eval_data, str): - with open(eval_data, 'r') as f: + with open(eval_data, "r") as f: eval_examples = json.load(f) else: eval_examples = eval_data - eval_dataset, examples, features = self.load_and_cache_examples(eval_examples, evaluate=True, output_examples=True) + eval_dataset, examples, features = self.load_and_cache_examples( + eval_examples, evaluate=True, output_examples=True + ) eval_sampler = SequentialSampler(eval_dataset) - eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args["eval_batch_size"]) + eval_dataloader = DataLoader( + eval_dataset, sampler=eval_sampler, batch_size=args["eval_batch_size"] + ) eval_loss = 0.0 nb_eval_steps = 0 @@ -472,62 +634,99 @@ def evaluate(self, eval_data, output_dir): model.eval() all_results = [] - for batch in tqdm(eval_dataloader, disable=args['silent']): + for batch in tqdm(eval_dataloader, disable=args["silent"]): batch = tuple(t.to(device) for t in batch) with torch.no_grad(): - inputs = {'input_ids': batch[0], - 'attention_mask': batch[1], - } + inputs = { + "input_ids": batch[0], + "attention_mask": batch[1], + } - if args['model_type'] != 'distilbert': - inputs['token_type_ids'] = None if args['model_type'] == 'xlm' else batch[2] + if args["model_type"] != "distilbert": + inputs["token_type_ids"] = ( + None if args["model_type"] == "xlm" else batch[2] + ) example_indices = batch[3] - if args['model_type'] in ['xlnet', 'xlm']: - inputs.update({'cls_index': batch[4], - 'p_mask': batch[5]}) + if args["model_type"] in ["xlnet", "xlm"]: + inputs.update({"cls_index": batch[4], "p_mask": batch[5]}) outputs = model(**inputs) for i, example_index in enumerate(example_indices): eval_feature = features[example_index.item()] unique_id = int(eval_feature.unique_id) - if args['model_type'] in ['xlnet', 'xlm']: + if args["model_type"] in ["xlnet", "xlm"]: # XLNet uses a more complex post-processing procedure - result = RawResultExtended(unique_id=unique_id, - start_top_log_probs=to_list(outputs[0][i]), - start_top_index=to_list(outputs[1][i]), - end_top_log_probs=to_list(outputs[2][i]), - end_top_index=to_list(outputs[3][i]), - cls_logits=to_list(outputs[4][i])) + result = RawResultExtended( + unique_id=unique_id, + start_top_log_probs=to_list(outputs[0][i]), + 
start_top_index=to_list(outputs[1][i]), + end_top_log_probs=to_list(outputs[2][i]), + end_top_index=to_list(outputs[3][i]), + cls_logits=to_list(outputs[4][i]), + ) else: - result = RawResult(unique_id=unique_id, - start_logits=to_list(outputs[0][i]), - end_logits=to_list(outputs[1][i])) + result = RawResult( + unique_id=unique_id, + start_logits=to_list(outputs[0][i]), + end_logits=to_list(outputs[1][i]), + ) all_results.append(result) - prefix = 'test' + prefix = "test" if not os.path.isdir(output_dir): os.mkdir(output_dir) - output_prediction_file = os.path.join(output_dir, "predictions_{}.json".format(prefix)) - output_nbest_file = os.path.join(output_dir, "nbest_predictions_{}.json".format(prefix)) - output_null_log_odds_file = os.path.join(output_dir, "null_odds_{}.json".format(prefix)) - - if args['model_type'] in ['xlnet', 'xlm']: + output_prediction_file = os.path.join( + output_dir, "predictions_{}.json".format(prefix) + ) + output_nbest_file = os.path.join( + output_dir, "nbest_predictions_{}.json".format(prefix) + ) + output_null_log_odds_file = os.path.join( + output_dir, "null_odds_{}.json".format(prefix) + ) + + if args["model_type"] in ["xlnet", "xlm"]: # XLNet uses a more complex post-processing procedure - all_predictions, all_nbest_json, scores_diff_json = write_predictions_extended(examples, features, all_results, args['n_best_size'], - args['max_answer_length'], output_prediction_file, - output_nbest_file, output_null_log_odds_file, eval_data, - model.config.start_n_top, model.config.end_n_top, - True, tokenizer, not args['silent']) + ( + all_predictions, + all_nbest_json, + scores_diff_json, + ) = write_predictions_extended( + examples, + features, + all_results, + args["n_best_size"], + args["max_answer_length"], + output_prediction_file, + output_nbest_file, + output_null_log_odds_file, + eval_data, + model.config.start_n_top, + model.config.end_n_top, + True, + tokenizer, + not args["silent"], + ) else: - all_predictions, all_nbest_json, scores_diff_json = write_predictions(examples, features, all_results, args['n_best_size'], - args['max_answer_length'], False, output_prediction_file, - output_nbest_file, output_null_log_odds_file, not args['silent'], - True, args['null_score_diff_threshold']) + all_predictions, all_nbest_json, scores_diff_json = write_predictions( + examples, + features, + all_results, + args["n_best_size"], + args["max_answer_length"], + False, + output_prediction_file, + output_nbest_file, + output_null_log_odds_file, + not args["silent"], + True, + args["null_score_diff_threshold"], + ) return all_predictions, all_nbest_json, scores_diff_json @@ -557,15 +756,19 @@ def predict(self, to_predict, n_best_size=None): args = self.args if not n_best_size: - n_best_size = args['n_best_size'] + n_best_size = args["n_best_size"] self._move_model_to_device() eval_examples = build_examples(to_predict) - eval_dataset, examples, features = self.load_and_cache_examples(eval_examples, evaluate=True, output_examples=True, no_cache=True) + eval_dataset, examples, features = self.load_and_cache_examples( + eval_examples, evaluate=True, output_examples=True, no_cache=True + ) eval_sampler = SequentialSampler(eval_dataset) - eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args["eval_batch_size"]) + eval_dataloader = DataLoader( + eval_dataset, sampler=eval_sampler, batch_size=args["eval_batch_size"] + ) eval_loss = 0.0 nb_eval_steps = 0 @@ -574,47 +777,73 @@ def predict(self, to_predict, n_best_size=None): model.eval() all_results = [] 
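# Editor's note (hedged sketch, not part of the diff): the evaluation loop refactored above follows a
# common pattern for span-prediction models -- run each batch under torch.no_grad(), map every output
# row back to its feature via the example index, and store a lightweight "raw result" keyed by
# unique_id for later post-processing. A minimal standalone illustration of that pattern; the batch
# layout, model signature, and feature attributes below are assumptions for the sketch, not
# simpletransformers APIs:

import collections
import torch

RawResult = collections.namedtuple("RawResult", ["unique_id", "start_logits", "end_logits"])


def to_list(tensor):
    # Detach from the graph and move to CPU so results are plain lists (JSON-friendly).
    return tensor.detach().cpu().tolist()


def collect_raw_results(model, dataloader, features, device):
    """Run a no-grad forward pass over a dataloader and collect one RawResult per feature."""
    model.eval()
    all_results = []
    for batch in dataloader:
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            # Assumes batches are (input_ids, attention_mask, token_type_ids, example_index)
            # and the model returns (start_logits, end_logits).
            outputs = model(input_ids=batch[0], attention_mask=batch[1], token_type_ids=batch[2])
        for i, example_index in enumerate(batch[3]):
            feature = features[example_index.item()]
            all_results.append(
                RawResult(
                    unique_id=int(feature.unique_id),
                    start_logits=to_list(outputs[0][i]),
                    end_logits=to_list(outputs[1][i]),
                )
            )
    return all_results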
- for batch in tqdm(eval_dataloader, disable=args['silent']): + for batch in tqdm(eval_dataloader, disable=args["silent"]): batch = tuple(t.to(device) for t in batch) with torch.no_grad(): - inputs = {'input_ids': batch[0], - 'attention_mask': batch[1], - } + inputs = { + "input_ids": batch[0], + "attention_mask": batch[1], + } - if args['model_type'] != 'distilbert': - inputs['token_type_ids'] = None if args['model_type'] == 'xlm' else batch[2] + if args["model_type"] != "distilbert": + inputs["token_type_ids"] = ( + None if args["model_type"] == "xlm" else batch[2] + ) example_indices = batch[3] - if args['model_type'] in ['xlnet', 'xlm']: - inputs.update({'cls_index': batch[4], - 'p_mask': batch[5]}) + if args["model_type"] in ["xlnet", "xlm"]: + inputs.update({"cls_index": batch[4], "p_mask": batch[5]}) outputs = model(**inputs) for i, example_index in enumerate(example_indices): eval_feature = features[example_index.item()] unique_id = int(eval_feature.unique_id) - if args['model_type'] in ['xlnet', 'xlm']: + if args["model_type"] in ["xlnet", "xlm"]: # XLNet uses a more complex post-processing procedure - result = RawResultExtended(unique_id=unique_id, - start_top_log_probs=to_list(outputs[0][i]), - start_top_index=to_list(outputs[1][i]), - end_top_log_probs=to_list(outputs[2][i]), - end_top_index=to_list(outputs[3][i]), - cls_logits=to_list(outputs[4][i])) + result = RawResultExtended( + unique_id=unique_id, + start_top_log_probs=to_list(outputs[0][i]), + start_top_index=to_list(outputs[1][i]), + end_top_log_probs=to_list(outputs[2][i]), + end_top_index=to_list(outputs[3][i]), + cls_logits=to_list(outputs[4][i]), + ) else: - result = RawResult(unique_id=unique_id, - start_logits=to_list(outputs[0][i]), - end_logits=to_list(outputs[1][i])) + result = RawResult( + unique_id=unique_id, + start_logits=to_list(outputs[0][i]), + end_logits=to_list(outputs[1][i]), + ) all_results.append(result) - if args['model_type'] in ['xlnet', 'xlm']: - answers = get_best_predictions_extended(examples, features, all_results, n_best_size, - args['max_answer_length'], model.config.start_n_top, model.config.end_n_top, True, tokenizer, args['null_score_diff_threshold']) + if args["model_type"] in ["xlnet", "xlm"]: + answers = get_best_predictions_extended( + examples, + features, + all_results, + n_best_size, + args["max_answer_length"], + model.config.start_n_top, + model.config.end_n_top, + True, + tokenizer, + args["null_score_diff_threshold"], + ) else: - answers = get_best_predictions(examples, features, all_results, n_best_size, args['max_answer_length'], False, False, True, False) + answers = get_best_predictions( + examples, + features, + all_results, + n_best_size, + args["max_answer_length"], + False, + False, + True, + False, + ) return answers @@ -623,12 +852,12 @@ def calculate_results(self, truth, predictions): questions_dict = {} print(truth) for item in truth: - for answer in item['qas']: - if answer['answers']: - truth_dict[answer['id']] = answer['answers'][0]['text'] + for answer in item["qas"]: + if answer["answers"]: + truth_dict[answer["id"]] = answer["answers"][0]["text"] else: - truth_dict[answer['id']] = '' - questions_dict[answer['id']] = answer['question'] + truth_dict[answer["id"]] = "" + questions_dict[answer["id"]] = answer["question"] correct = 0 incorrect = 0 @@ -641,23 +870,34 @@ def calculate_results(self, truth, predictions): if predictions[q_id].strip() == answer.strip(): correct += 1 correct_text[q_id] = answer - elif predictions[q_id].strip() in answer.strip() or 
answer.strip() in predictions[q_id].strip(): + elif ( + predictions[q_id].strip() in answer.strip() + or answer.strip() in predictions[q_id].strip() + ): similar += 1 - similar_text[q_id] = {'truth': answer, 'predicted': predictions[q_id], 'question': questions_dict[q_id]} + similar_text[q_id] = { + "truth": answer, + "predicted": predictions[q_id], + "question": questions_dict[q_id], + } else: incorrect += 1 - incorrect_text[q_id] = {'truth': answer, 'predicted': predictions[q_id], 'question': questions_dict[q_id]} + incorrect_text[q_id] = { + "truth": answer, + "predicted": predictions[q_id], + "question": questions_dict[q_id], + } result = { - 'correct': correct, - 'similar': similar, - 'incorrect': incorrect, + "correct": correct, + "similar": similar, + "incorrect": incorrect, } texts = { - 'correct_text': correct_text, - 'similar_text': similar_text, - 'incorrect_text': incorrect_text, + "correct_text": correct_text, + "similar_text": similar_text, + "incorrect_text": incorrect_text, } return result, texts diff --git a/simpletransformers/question_answering/question_answering_utils.py b/simpletransformers/question_answering/question_answering_utils.py index ce740de7..390b2373 100755 --- a/simpletransformers/question_answering/question_answering_utils.py +++ b/simpletransformers/question_answering/question_answering_utils.py @@ -11,8 +11,7 @@ from tqdm import tqdm, trange import os import torch -from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, - TensorDataset) +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize from pprint import pprint @@ -29,14 +28,16 @@ class InputExample(object): For examples without an answer, the start and end position are -1. 
""" - def __init__(self, - qas_id, - question_text, - doc_tokens, - orig_answer_text=None, - start_position=None, - end_position=None, - is_impossible=None): + def __init__( + self, + qas_id, + question_text, + doc_tokens, + orig_answer_text=None, + start_position=None, + end_position=None, + is_impossible=None, + ): self.qas_id = qas_id self.question_text = question_text self.doc_tokens = doc_tokens @@ -51,8 +52,7 @@ def __str__(self): def __repr__(self): s = "" s += "qas_id: %s" % (self.qas_id) - s += ", question_text: %s" % ( - self.question_text) + s += ", question_text: %s" % (self.question_text) s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) if self.start_position: s += ", start_position: %d" % (self.start_position) @@ -70,22 +70,24 @@ def to_list(tensor): class InputFeatures(object): """A single set of features of data.""" - def __init__(self, - unique_id, - example_index, - doc_span_index, - tokens, - token_to_orig_map, - token_is_max_context, - input_ids, - input_mask, - segment_ids, - cls_index, - p_mask, - paragraph_len, - start_position=None, - end_position=None, - is_impossible=None): + def __init__( + self, + unique_id, + example_index, + doc_span_index, + tokens, + token_to_orig_map, + token_is_max_context, + input_ids, + input_mask, + segment_ids, + cls_index, + p_mask, + paragraph_len, + start_position=None, + end_position=None, + is_impossible=None, + ): self.unique_id = unique_id self.example_index = example_index self.doc_span_index = doc_span_index @@ -135,20 +137,23 @@ def is_whitespace(c): start_position = None end_position = None orig_answer_text = None - is_impossible = qa.get('is_impossible') + is_impossible = qa.get("is_impossible") if is_training: if version_2_with_negative: is_impossible = qa["is_impossible"] if (len(qa["answers"]) != 1) and (not is_impossible): - raise ValueError("For training, each question should have exactly 1 answer.") + raise ValueError( + "For training, each question should have exactly 1 answer." + ) if not is_impossible: answer = qa["answers"][0] orig_answer_text = answer["text"] answer_offset = answer["answer_start"] answer_length = len(orig_answer_text) start_position = char_to_word_offset[answer_offset] - end_position = char_to_word_offset[answer_offset + - answer_length - 1] + end_position = char_to_word_offset[ + answer_offset + answer_length - 1 + ] # Only add answers where the text can be exactly recovered from the # document. If this CAN'T happen it's likely due to weird Unicode # stuff so we will just skip the example. @@ -156,11 +161,17 @@ def is_whitespace(c): # Note that this means for training mode, every example is NOT # guaranteed to be preserved. actual_text = " ".join( - doc_tokens[start_position:(end_position + 1)]) + doc_tokens[start_position : (end_position + 1)] + ) cleaned_answer_text = " ".join( - whitespace_tokenize(orig_answer_text)) + whitespace_tokenize(orig_answer_text) + ) if actual_text.find(cleaned_answer_text) == -1: - logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text) + logger.warning( + "Could not find answer: '%s' vs. 
'%s'", + actual_text, + cleaned_answer_text, + ) continue else: start_position = -1 @@ -174,20 +185,31 @@ def is_whitespace(c): orig_answer_text=orig_answer_text, start_position=start_position, end_position=end_position, - is_impossible=is_impossible) + is_impossible=is_impossible, + ) examples.append(example) return examples -def convert_examples_to_features(examples, tokenizer, max_seq_length, - doc_stride, max_query_length, is_training, - cls_token_at_end=False, - cls_token='[CLS]', sep_token='[SEP]', pad_token=0, - sequence_a_segment_id=0, sequence_b_segment_id=1, - cls_token_segment_id=0, pad_token_segment_id=0, - mask_padding_with_zero=True, - sequence_a_is_doc=False, - silent=False): +def convert_examples_to_features( + examples, + tokenizer, + max_seq_length, + doc_stride, + max_query_length, + is_training, + cls_token_at_end=False, + cls_token="[CLS]", + sep_token="[SEP]", + pad_token=0, + sequence_a_segment_id=0, + sequence_b_segment_id=1, + cls_token_segment_id=0, + pad_token_segment_id=0, + mask_padding_with_zero=True, + sequence_a_is_doc=False, + silent=False, +): """Loads a data file into a list of `InputBatch`s.""" unique_id = 1000000000 @@ -228,8 +250,12 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, else: tok_end_position = len(all_doc_tokens) - 1 (tok_start_position, tok_end_position) = _improve_answer_span( - all_doc_tokens, tok_start_position, tok_end_position, tokenizer, - example.orig_answer_text) + all_doc_tokens, + tok_start_position, + tok_end_position, + tokenizer, + example.orig_answer_text, + ) # The -3 accounts for [CLS], [SEP] and [SEP] max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 @@ -238,7 +264,8 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, # To deal with this we do a sliding window approach, where we take chunks # of the up to our max length with a stride of `doc_stride`. 
_DocSpan = collections.namedtuple( # pylint: disable=invalid-name - "DocSpan", ["start", "length"]) + "DocSpan", ["start", "length"] + ) doc_spans = [] start_offset = 0 while start_offset < len(all_doc_tokens): @@ -283,11 +310,11 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, # Paragraph for i in range(doc_span.length): split_token_index = doc_span.start + i - token_to_orig_map[len( - tokens)] = tok_to_orig_index[split_token_index] + token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] - is_max_context = _check_is_max_context(doc_spans, doc_span_index, - split_token_index) + is_max_context = _check_is_max_context( + doc_spans, doc_span_index, split_token_index + ) token_is_max_context[len(tokens)] = is_max_context tokens.append(all_doc_tokens[split_token_index]) if not sequence_a_is_doc: @@ -345,8 +372,9 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, doc_start = doc_span.start doc_end = doc_span.start + doc_span.length - 1 out_of_span = False - if not (tok_start_position >= doc_start and - tok_end_position <= doc_end): + if not ( + tok_start_position >= doc_start and tok_end_position <= doc_end + ): out_of_span = True if out_of_span: start_position = 0 @@ -370,26 +398,28 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, logger.info("example_index: %s" % (example_index)) logger.info("doc_span_index: %s" % (doc_span_index)) logger.info("tokens: %s" % " ".join(tokens)) - logger.info("token_to_orig_map: %s" % " ".join([ - "%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()])) - logger.info("token_is_max_context: %s" % " ".join([ - "%d:%s" % (x, y) for (x, y) in token_is_max_context.items() - ])) - logger.info("input_ids: %s" % - " ".join([str(x) for x in input_ids])) logger.info( - "input_mask: %s" % " ".join([str(x) for x in input_mask])) + "token_to_orig_map: %s" + % " ".join( + ["%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()] + ) + ) logger.info( - "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + "token_is_max_context: %s" + % " ".join( + ["%d:%s" % (x, y) for (x, y) in token_is_max_context.items()] + ) + ) + logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) if is_training and span_is_impossible: logger.info("impossible example") if is_training and not span_is_impossible: - answer_text = " ".join( - tokens[start_position:(end_position + 1)]) + answer_text = " ".join(tokens[start_position : (end_position + 1)]) logger.info("start_position: %d" % (start_position)) logger.info("end_position: %d" % (end_position)) - logger.info( - "answer: %s" % (answer_text)) + logger.info("answer: %s" % (answer_text)) features.append( InputFeatures( @@ -407,14 +437,17 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, paragraph_len=paragraph_len, start_position=start_position, end_position=end_position, - is_impossible=span_is_impossible)) + is_impossible=span_is_impossible, + ) + ) unique_id += 1 return features -def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, - orig_answer_text): +def _improve_answer_span( + doc_tokens, input_start, input_end, tokenizer, orig_answer_text +): """Returns tokenized answer spans that better match the annotated answer.""" # The SQuAD annotations are character based. 
We first project them to @@ -443,7 +476,7 @@ def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, for new_start in range(input_start, input_end + 1): for new_end in range(input_end, new_start - 1, -1): - text_span = " ".join(doc_tokens[new_start:(new_end + 1)]) + text_span = " ".join(doc_tokens[new_start : (new_end + 1)]) if text_span == tok_answer_text: return (new_start, new_end) @@ -479,8 +512,7 @@ def _check_is_max_context(doc_spans, cur_span_index, position): continue num_left_context = position - doc_span.start num_right_context = end - position - score = min(num_left_context, num_right_context) + \ - 0.01 * doc_span.length + score = min(num_left_context, num_right_context) + 0.01 * doc_span.length if best_score is None or score > best_score: best_score = score best_span_index = span_index @@ -488,14 +520,25 @@ def _check_is_max_context(doc_spans, cur_span_index, position): return cur_span_index == best_span_index -RawResult = collections.namedtuple("RawResult", - ["unique_id", "start_logits", "end_logits"]) - - -def write_predictions(all_examples, all_features, all_results, n_best_size, - max_answer_length, do_lower_case, output_prediction_file, - output_nbest_file, output_null_log_odds_file, verbose_logging, - version_2_with_negative, null_score_diff_threshold): +RawResult = collections.namedtuple( + "RawResult", ["unique_id", "start_logits", "end_logits"] +) + + +def write_predictions( + all_examples, + all_features, + all_results, + n_best_size, + max_answer_length, + do_lower_case, + output_prediction_file, + output_nbest_file, + output_null_log_odds_file, + verbose_logging, + version_2_with_negative, + null_score_diff_threshold, +): """Write final predictions to the json file and log-odds of null if needed.""" logger.info("Writing predictions to: %s" % (output_prediction_file)) logger.info("Writing nbest to: %s" % (output_nbest_file)) @@ -510,7 +553,8 @@ def write_predictions(all_examples, all_features, all_results, n_best_size, _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name "PrelimPrediction", - ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]) + ["feature_index", "start_index", "end_index", "start_logit", "end_logit"], + ) all_predictions = collections.OrderedDict() all_nbest_json = collections.OrderedDict() @@ -531,8 +575,7 @@ def write_predictions(all_examples, all_features, all_results, n_best_size, end_indexes = _get_best_indexes(result.end_logits, n_best_size) # if we could have irrelevant answers, get the min score of irrelevant if version_2_with_negative: - feature_null_score = result.start_logits[0] + \ - result.end_logits[0] + feature_null_score = result.start_logits[0] + result.end_logits[0] if feature_null_score < score_null: score_null = feature_null_score min_null_feature_index = feature_index @@ -564,7 +607,9 @@ def write_predictions(all_examples, all_features, all_results, n_best_size, start_index=start_index, end_index=end_index, start_logit=result.start_logits[start_index], - end_logit=result.end_logits[end_index])) + end_logit=result.end_logits[end_index], + ) + ) if version_2_with_negative: prelim_predictions.append( _PrelimPrediction( @@ -572,14 +617,18 @@ def write_predictions(all_examples, all_features, all_results, n_best_size, start_index=0, end_index=0, start_logit=null_start_logit, - end_logit=null_end_logit)) + end_logit=null_end_logit, + ) + ) prelim_predictions = sorted( prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), - reverse=True) + reverse=True, + ) 
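# Editor's note (hedged sketch, not part of the diff): in the prediction-writing code around here,
# candidate answer spans are ranked by the sum of their start and end logits and then reduced to an
# n-best list. The sketch below shows that ranking step in isolation; for brevity it de-duplicates
# on the (feature, start, end) index triple, whereas the library de-duplicates on the detokenized
# answer text.

import collections

PrelimPrediction = collections.namedtuple(
    "PrelimPrediction",
    ["feature_index", "start_index", "end_index", "start_logit", "end_logit"],
)


def rank_candidates(candidates, n_best_size):
    """Sort candidate spans by summed logit score and keep the first n_best_size unique spans."""
    ranked = sorted(candidates, key=lambda x: x.start_logit + x.end_logit, reverse=True)
    seen = set()
    nbest = []
    for cand in ranked:
        span = (cand.feature_index, cand.start_index, cand.end_index)
        if span in seen:
            continue  # skip duplicate spans
        seen.add(span)
        nbest.append(cand)
        if len(nbest) >= n_best_size:
            break
    return nbest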
_NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name - "NbestPrediction", ["text", "start_logit", "end_logit"]) + "NbestPrediction", ["text", "start_logit", "end_logit"] + ) seen_predictions = {} nbest = [] @@ -588,12 +637,10 @@ def write_predictions(all_examples, all_features, all_results, n_best_size, break feature = features[pred.feature_index] if pred.start_index > 0: # this is a non-null prediction - tok_tokens = feature.tokens[pred.start_index:( - pred.end_index + 1)] + tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)] orig_doc_start = feature.token_to_orig_map[pred.start_index] orig_doc_end = feature.token_to_orig_map[pred.end_index] - orig_tokens = example.doc_tokens[orig_doc_start:( - orig_doc_end + 1)] + orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)] tok_text = " ".join(tok_tokens) # De-tokenize WordPieces that have been split off. @@ -606,7 +653,8 @@ def write_predictions(all_examples, all_features, all_results, n_best_size, orig_text = " ".join(orig_tokens) final_text = get_final_text( - tok_text, orig_text, do_lower_case, verbose_logging) + tok_text, orig_text, do_lower_case, verbose_logging + ) if final_text in seen_predictions: continue @@ -619,27 +667,29 @@ def write_predictions(all_examples, all_features, all_results, n_best_size, _NbestPrediction( text=final_text, start_logit=pred.start_logit, - end_logit=pred.end_logit)) + end_logit=pred.end_logit, + ) + ) # if we didn't include the empty option in the n-best, include it if version_2_with_negative: if "" not in seen_predictions: nbest.append( _NbestPrediction( - text="", - start_logit=null_start_logit, - end_logit=null_end_logit)) + text="", start_logit=null_start_logit, end_logit=null_end_logit + ) + ) # In very rare edge cases we could only have single null prediction. # So we just create a nonce prediction in this case to avoid failure. if len(nbest) == 1: - nbest.insert(0, - _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) + nbest.insert( + 0, _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0) + ) # In very rare edge cases we could have no valid predictions. So we # just create a nonce prediction in this case to avoid failure. 
if not nbest: - nbest.append( - _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) + nbest.append(_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) assert len(nbest) >= 1 @@ -668,8 +718,11 @@ def write_predictions(all_examples, all_features, all_results, n_best_size, all_predictions[example.qas_id] = nbest_json[0]["text"] else: # predict "" iff the null score - the score of best non-null > threshold - score_diff = score_null - best_non_null_entry.start_logit - ( - best_non_null_entry.end_logit) + score_diff = ( + score_null + - best_non_null_entry.start_logit + - (best_non_null_entry.end_logit) + ) scores_diff_json[example.qas_id] = score_diff if score_diff > null_score_diff_threshold: all_predictions[example.qas_id] = "" @@ -691,28 +744,47 @@ def write_predictions(all_examples, all_features, all_results, n_best_size, # For XLNet (and XLM which uses the same head) -RawResultExtended = collections.namedtuple("RawResultExtended", - ["unique_id", "start_top_log_probs", "start_top_index", - "end_top_log_probs", "end_top_index", "cls_logits"]) - - -def write_predictions_extended(all_examples, all_features, all_results, n_best_size, - max_answer_length, output_prediction_file, - output_nbest_file, - output_null_log_odds_file, orig_data_file, - start_n_top, end_n_top, version_2_with_negative, - tokenizer, verbose_logging): +RawResultExtended = collections.namedtuple( + "RawResultExtended", + [ + "unique_id", + "start_top_log_probs", + "start_top_index", + "end_top_log_probs", + "end_top_index", + "cls_logits", + ], +) + + +def write_predictions_extended( + all_examples, + all_features, + all_results, + n_best_size, + max_answer_length, + output_prediction_file, + output_nbest_file, + output_null_log_odds_file, + orig_data_file, + start_n_top, + end_n_top, + version_2_with_negative, + tokenizer, + verbose_logging, +): """ XLNet write prediction logic (more complex than Bert's). Write final predictions to the json file and log-odds of null if needed. 
Requires utils_squad_evaluate.py """ _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name "PrelimPrediction", - ["feature_index", "start_index", "end_index", - "start_log_prob", "end_log_prob"]) + ["feature_index", "start_index", "end_index", "start_log_prob", "end_log_prob"], + ) _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name - "NbestPrediction", ["text", "start_log_prob", "end_log_prob"]) + "NbestPrediction", ["text", "start_log_prob", "end_log_prob"] + ) logger.info("Writing predictions to: %s", output_prediction_file) # logger.info("Writing nbest to: %s" % (output_nbest_file)) @@ -776,12 +848,15 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s start_index=start_index, end_index=end_index, start_log_prob=start_log_prob, - end_log_prob=end_log_prob)) + end_log_prob=end_log_prob, + ) + ) prelim_predictions = sorted( prelim_predictions, key=lambda x: (x.start_log_prob + x.end_log_prob), - reverse=True) + reverse=True, + ) seen_predictions = {} nbest = [] @@ -801,10 +876,10 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s # final_text = paragraph_text[start_orig_pos: end_orig_pos + 1].strip() # Previously used Bert untokenizer - tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)] + tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)] orig_doc_start = feature.token_to_orig_map[pred.start_index] orig_doc_end = feature.token_to_orig_map[pred.end_index] - orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)] + orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)] tok_text = tokenizer.convert_tokens_to_string(tok_tokens) # Clean whitespace @@ -812,8 +887,7 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s tok_text = " ".join(tok_text.split()) orig_text = " ".join(orig_tokens) - final_text = get_final_text(tok_text, orig_text, False, - verbose_logging) + final_text = get_final_text(tok_text, orig_text, False, verbose_logging) if final_text in seen_predictions: continue @@ -824,14 +898,16 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s _NbestPrediction( text=final_text, start_log_prob=pred.start_log_prob, - end_log_prob=pred.end_log_prob)) + end_log_prob=pred.end_log_prob, + ) + ) # In very rare edge cases we could have no valid predictions. So we # just create a nonce prediction in this case to avoid failure. 
if not nbest: nbest.append( - _NbestPrediction(text="", start_log_prob=-1e6, - end_log_prob=-1e6)) + _NbestPrediction(text="", start_log_prob=-1e6, end_log_prob=-1e6) + ) total_scores = [] best_non_null_entry = None @@ -873,7 +949,7 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s writer.write(json.dumps(scores_diff_json, indent=4) + "\n") if isinstance(orig_data_file, str): - with open(orig_data_file, "r", encoding='utf-8') as reader: + with open(orig_data_file, "r", encoding="utf-8") as reader: orig_data = json.load(reader) else: orig_data = orig_data_file @@ -884,15 +960,24 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s exact_raw, f1_raw = get_raw_scores(orig_data, all_predictions) out_eval = {} - find_all_best_thresh_v2(out_eval, all_predictions, - exact_raw, f1_raw, scores_diff_json, qid_to_has_ans) + find_all_best_thresh_v2( + out_eval, all_predictions, exact_raw, f1_raw, scores_diff_json, qid_to_has_ans + ) return all_predictions, all_nbest_json, scores_diff_json -def get_best_predictions(all_examples, all_features, all_results, n_best_size, - max_answer_length, do_lower_case, verbose_logging, - version_2_with_negative, null_score_diff_threshold): +def get_best_predictions( + all_examples, + all_features, + all_results, + n_best_size, + max_answer_length, + do_lower_case, + verbose_logging, + version_2_with_negative, + null_score_diff_threshold, +): example_index_to_features = collections.defaultdict(list) for feature in all_features: @@ -904,7 +989,8 @@ def get_best_predictions(all_examples, all_features, all_results, n_best_size, _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name "PrelimPrediction", - ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]) + ["feature_index", "start_index", "end_index", "start_logit", "end_logit"], + ) all_predictions = collections.OrderedDict() all_nbest_json = collections.OrderedDict() @@ -925,8 +1011,7 @@ def get_best_predictions(all_examples, all_features, all_results, n_best_size, end_indexes = _get_best_indexes(result.end_logits, n_best_size) # if we could have irrelevant answers, get the min score of irrelevant if version_2_with_negative: - feature_null_score = result.start_logits[0] + \ - result.end_logits[0] + feature_null_score = result.start_logits[0] + result.end_logits[0] if feature_null_score < score_null: score_null = feature_null_score min_null_feature_index = feature_index @@ -958,7 +1043,9 @@ def get_best_predictions(all_examples, all_features, all_results, n_best_size, start_index=start_index, end_index=end_index, start_logit=result.start_logits[start_index], - end_logit=result.end_logits[end_index])) + end_logit=result.end_logits[end_index], + ) + ) if version_2_with_negative: prelim_predictions.append( _PrelimPrediction( @@ -966,14 +1053,18 @@ def get_best_predictions(all_examples, all_features, all_results, n_best_size, start_index=0, end_index=0, start_logit=null_start_logit, - end_logit=null_end_logit)) + end_logit=null_end_logit, + ) + ) prelim_predictions = sorted( prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), - reverse=True) + reverse=True, + ) _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name - "NbestPrediction", ["text", "start_logit", "end_logit"]) + "NbestPrediction", ["text", "start_logit", "end_logit"] + ) seen_predictions = {} nbest = [] @@ -982,12 +1073,10 @@ def get_best_predictions(all_examples, all_features, all_results, n_best_size, break feature = 
features[pred.feature_index] if pred.start_index > 0: # this is a non-null prediction - tok_tokens = feature.tokens[pred.start_index:( - pred.end_index + 1)] + tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)] orig_doc_start = feature.token_to_orig_map[pred.start_index] orig_doc_end = feature.token_to_orig_map[pred.end_index] - orig_tokens = example.doc_tokens[orig_doc_start:( - orig_doc_end + 1)] + orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)] tok_text = " ".join(tok_tokens) # De-tokenize WordPieces that have been split off. @@ -1000,7 +1089,8 @@ def get_best_predictions(all_examples, all_features, all_results, n_best_size, orig_text = " ".join(orig_tokens) final_text = get_final_text( - tok_text, orig_text, do_lower_case, verbose_logging) + tok_text, orig_text, do_lower_case, verbose_logging + ) if final_text in seen_predictions: continue @@ -1013,27 +1103,29 @@ def get_best_predictions(all_examples, all_features, all_results, n_best_size, _NbestPrediction( text=final_text, start_logit=pred.start_logit, - end_logit=pred.end_logit)) + end_logit=pred.end_logit, + ) + ) # if we didn't include the empty option in the n-best, include it if version_2_with_negative: if "" not in seen_predictions: nbest.append( _NbestPrediction( - text="", - start_logit=null_start_logit, - end_logit=null_end_logit)) + text="", start_logit=null_start_logit, end_logit=null_end_logit + ) + ) # In very rare edge cases we could only have single null prediction. # So we just create a nonce prediction in this case to avoid failure. if len(nbest) == 1: - nbest.insert(0, - _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) + nbest.insert( + 0, _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0) + ) # In very rare edge cases we could have no valid predictions. So we # just create a nonce prediction in this case to avoid failure. if not nbest: - nbest.append( - _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) + nbest.append(_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) assert len(nbest) >= 1 @@ -1062,8 +1154,11 @@ def get_best_predictions(all_examples, all_features, all_results, n_best_size, all_predictions[example.qas_id] = nbest_json[0]["text"] else: # predict "" iff the null score - the score of best non-null > threshold - score_diff = score_null - best_non_null_entry.start_logit - ( - best_non_null_entry.end_logit) + score_diff = ( + score_null + - best_non_null_entry.start_logit + - (best_non_null_entry.end_logit) + ) scores_diff_json[example.qas_id] = score_diff if score_diff > null_score_diff_threshold: all_predictions[example.qas_id] = "" @@ -1071,25 +1166,37 @@ def get_best_predictions(all_examples, all_features, all_results, n_best_size, all_predictions[example.qas_id] = best_non_null_entry.text all_nbest_json[example.qas_id] = nbest_json - all_best = [{'id': id, 'answer': answers[0]['text']} for id, answers in all_nbest_json.items()] + all_best = [ + {"id": id, "answer": answers[0]["text"]} + for id, answers in all_nbest_json.items() + ] return all_best -def get_best_predictions_extended(all_examples, all_features, all_results, n_best_size, - max_answer_length, - start_n_top, end_n_top, version_2_with_negative, - tokenizer, verbose_logging): +def get_best_predictions_extended( + all_examples, + all_features, + all_results, + n_best_size, + max_answer_length, + start_n_top, + end_n_top, + version_2_with_negative, + tokenizer, + verbose_logging, +): """ XLNet write prediction logic (more complex than Bert's). 
Write final predictions to the json file and log-odds of null if needed. Requires utils_squad_evaluate.py """ _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name "PrelimPrediction", - ["feature_index", "start_index", "end_index", - "start_log_prob", "end_log_prob"]) + ["feature_index", "start_index", "end_index", "start_log_prob", "end_log_prob"], + ) _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name - "NbestPrediction", ["text", "start_log_prob", "end_log_prob"]) + "NbestPrediction", ["text", "start_log_prob", "end_log_prob"] + ) example_index_to_features = collections.defaultdict(list) for feature in all_features: @@ -1150,12 +1257,15 @@ def get_best_predictions_extended(all_examples, all_features, all_results, n_bes start_index=start_index, end_index=end_index, start_log_prob=start_log_prob, - end_log_prob=end_log_prob)) + end_log_prob=end_log_prob, + ) + ) prelim_predictions = sorted( prelim_predictions, key=lambda x: (x.start_log_prob + x.end_log_prob), - reverse=True) + reverse=True, + ) seen_predictions = {} nbest = [] @@ -1175,10 +1285,10 @@ def get_best_predictions_extended(all_examples, all_features, all_results, n_bes # final_text = paragraph_text[start_orig_pos: end_orig_pos + 1].strip() # Previously used Bert untokenizer - tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)] + tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)] orig_doc_start = feature.token_to_orig_map[pred.start_index] orig_doc_end = feature.token_to_orig_map[pred.end_index] - orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)] + orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)] tok_text = tokenizer.convert_tokens_to_string(tok_tokens) # Clean whitespace @@ -1186,8 +1296,9 @@ def get_best_predictions_extended(all_examples, all_features, all_results, n_bes tok_text = " ".join(tok_text.split()) orig_text = " ".join(orig_tokens) - final_text = get_final_text(tok_text, orig_text, tokenizer.do_lower_case, - verbose_logging) + final_text = get_final_text( + tok_text, orig_text, tokenizer.do_lower_case, verbose_logging + ) if final_text in seen_predictions: continue @@ -1198,14 +1309,16 @@ def get_best_predictions_extended(all_examples, all_features, all_results, n_bes _NbestPrediction( text=final_text, start_log_prob=pred.start_log_prob, - end_log_prob=pred.end_log_prob)) + end_log_prob=pred.end_log_prob, + ) + ) # In very rare edge cases we could have no valid predictions. So we # just create a nonce prediction in this case to avoid failure. 
if not nbest: nbest.append( - _NbestPrediction(text="", start_log_prob=-1e6, - end_log_prob=-1e6)) + _NbestPrediction(text="", start_log_prob=-1e6, end_log_prob=-1e6) + ) total_scores = [] best_non_null_entry = None @@ -1236,21 +1349,28 @@ def get_best_predictions_extended(all_examples, all_features, all_results, n_bes all_nbest_json[example.qas_id] = nbest_json - all_best = [{'id': id, 'answer': answers[0]['text']} for id, answers in all_nbest_json.items()] + all_best = [ + {"id": id, "answer": answers[0]["text"]} + for id, answers in all_nbest_json.items() + ] return all_best -def find_all_best_thresh_v2(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans): +def find_all_best_thresh_v2( + main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans +): best_exact, exact_thresh, has_ans_exact = find_best_thresh_v2( - preds, exact_raw, na_probs, qid_to_has_ans) + preds, exact_raw, na_probs, qid_to_has_ans + ) best_f1, f1_thresh, has_ans_f1 = find_best_thresh_v2( - preds, f1_raw, na_probs, qid_to_has_ans) - main_eval['best_exact'] = best_exact - main_eval['best_exact_thresh'] = exact_thresh - main_eval['best_f1'] = best_f1 - main_eval['best_f1_thresh'] = f1_thresh - main_eval['has_ans_exact'] = has_ans_exact - main_eval['has_ans_f1'] = has_ans_f1 + preds, f1_raw, na_probs, qid_to_has_ans + ) + main_eval["best_exact"] = best_exact + main_eval["best_exact_thresh"] = exact_thresh + main_eval["best_f1"] = best_f1 + main_eval["best_f1_thresh"] = f1_thresh + main_eval["has_ans_exact"] = has_ans_exact + main_eval["has_ans_f1"] = has_ans_f1 def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans): @@ -1284,14 +1404,18 @@ def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans): continue has_ans_score += scores[qid] - return 100.0 * best_score / len(scores), best_thresh, 1.0 * has_ans_score / has_ans_cnt + return ( + 100.0 * best_score / len(scores), + best_thresh, + 1.0 * has_ans_score / has_ans_cnt, + ) def make_qid_to_has_ans(dataset): qid_to_has_ans = {} for p in dataset: - for qa in p['qas']: - qid_to_has_ans[qa['id']] = bool(qa['answers']) + for qa in p["qas"]: + qid_to_has_ans[qa["id"]] = bool(qa["answers"]) return qid_to_has_ans @@ -1299,15 +1423,16 @@ def get_raw_scores(dataset, preds): exact_scores = {} f1_scores = {} for p in dataset: - for qa in p['qas']: - qid = qa['id'] - gold_answers = [a['text'] for a in qa['answers'] - if normalize_answer(a['text'])] + for qa in p["qas"]: + qid = qa["id"] + gold_answers = [ + a["text"] for a in qa["answers"] if normalize_answer(a["text"]) + ] if not gold_answers: # For unanswerable questions, only correct answer is empty string - gold_answers = [''] + gold_answers = [""] if qid not in preds: - logger.warning('Missing prediction for %s' % qid) + logger.warning("Missing prediction for %s" % qid) continue a_pred = preds[qid] # Take max over all gold answers @@ -1344,19 +1469,21 @@ def get_tokens(s): def normalize_answer(s): """Lower text and remove punctuation, articles and extra whitespace.""" + def remove_articles(text): - regex = re.compile(r'\b(a|an|the)\b', re.UNICODE) - return re.sub(regex, ' ', text) + regex = re.compile(r"\b(a|an|the)\b", re.UNICODE) + return re.sub(regex, " ", text) def white_space_fix(text): - return ' '.join(text.split()) + return " ".join(text.split()) def remove_punc(text): exclude = set(string.punctuation) - return ''.join(ch for ch in text if ch not in exclude) + return "".join(ch for ch in text if ch not in exclude) def lower(text): return text.lower() + return 
white_space_fix(remove_articles(remove_punc(lower(s)))) @@ -1410,8 +1537,7 @@ def _strip_spaces(text): start_position = tok_text.find(pred_text) if start_position == -1: if verbose_logging: - logger.info( - "Unable to find text: '%s' in '%s'" % (pred_text, orig_text)) + logger.info("Unable to find text: '%s' in '%s'" % (pred_text, orig_text)) return orig_text end_position = start_position + len(pred_text) - 1 @@ -1420,8 +1546,11 @@ def _strip_spaces(text): if len(orig_ns_text) != len(tok_ns_text): if verbose_logging: - logger.info("Length not equal after stripping spaces: '%s' vs '%s'", - orig_ns_text, tok_ns_text) + logger.info( + "Length not equal after stripping spaces: '%s' vs '%s'", + orig_ns_text, + tok_ns_text, + ) return orig_text # We then project the characters in `pred_text` back to `orig_text` using @@ -1452,14 +1581,13 @@ def _strip_spaces(text): logger.info("Couldn't map end position") return orig_text - output_text = orig_text[orig_start_position:(orig_end_position + 1)] + output_text = orig_text[orig_start_position : (orig_end_position + 1)] return output_text def _get_best_indexes(logits, n_best_size): """Get the n-best logits from a list.""" - index_and_score = sorted( - enumerate(logits), key=lambda x: x[1], reverse=True) + index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True) best_indexes = [] for i in range(len(index_and_score)): @@ -1503,17 +1631,11 @@ def build_examples(to_predict): examples = [] for row in to_predict: - context = row['context'] - for qa in row['qas']: - qa['answers'] = [{ - 'text': ' ', - 'answer_start': 0 - }] - qa['is_impossible']: False - example = { - 'context': context, - 'qas': row['qas'] - } + context = row["context"] + for qa in row["qas"]: + qa["answers"] = [{"text": " ", "answer_start": 0}] + qa["is_impossible"]: False + example = {"context": context, "qas": row["qas"]} examples.append(example) return examples
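# Editor's note (hedged sketch, not part of the diff): in build_examples above, the statement
# qa["is_impossible"]: False is a bare annotation and has no runtime effect; if the intent is to mark
# every placeholder question as answerable, an assignment is needed. A corrected sketch of the same
# helper, assuming that intent:

def build_examples(to_predict):
    """Wrap raw {context, qas} dicts in the structure expected for prediction."""
    examples = []
    for row in to_predict:
        context = row["context"]
        for qa in row["qas"]:
            # Placeholder answer so feature conversion has something to consume.
            qa["answers"] = [{"text": " ", "answer_start": 0}]
            qa["is_impossible"] = False  # assignment, not an annotation
        examples.append({"context": context, "qas": row["qas"]})
    return examples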