From fecd7b727f50388430231da9c0ef4d5e1495e763 Mon Sep 17 00:00:00 2001
From: flaviussn
Date: Mon, 20 Jan 2020 11:16:31 +0000
Subject: [PATCH] Revert to original code

---
 setup.cfg                                     |   6 +-
 simpletransformers/__init__.py                |   2 +-
 simpletransformers/classification/__init__.py |   4 +-
 .../classification/classification_model.py    | 747 ++++++------
 .../classification/classification_utils.py    | 146 +---
 .../multi_label_classification_model.py       | 168 ++--
 simpletransformers/config/global_args.py      |  71 +-
 simpletransformers/ner/__init__.py            |   2 +-
 simpletransformers/ner/ner_model.py           | 468 +++--------
 simpletransformers/ner/ner_utils.py           | 177 ++---
 .../question_answering/__init__.py            |   4 +-
 .../question_answering_model.py               | 684 ++++++----------
 .../question_answering_utils.py               | 569 ++++++-------
 13 files changed, 994 insertions(+), 2054 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index 17f59a2f..e47c9af6 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,7 +1,7 @@
 [tool:pytest]
 python_functions=test_
-codestyle_max_line_length = 88
+codestyle_max_line_length = 119
 log_cli = true
 log_cli_level = WARNING
@@ -11,8 +11,8 @@
 description-file = README.md
 license_file = LICENSE.txt
 [pycodestyle]
-max-line-length = 88
+max-line-length = 119
 [flake8]
-max-line-length = 88
+max-line-length = 119
 ignore = E203 , W503, F401
diff --git a/simpletransformers/__init__.py b/simpletransformers/__init__.py
index a2e35fe1..e9343088 100755
--- a/simpletransformers/__init__.py
+++ b/simpletransformers/__init__.py
@@ -1 +1 @@
-name = "simpletransformers"
+name = "simpletransformers"
\ No newline at end of file
diff --git a/simpletransformers/classification/__init__.py b/simpletransformers/classification/__init__.py
index 69bcdda6..6520c360 100755
--- a/simpletransformers/classification/__init__.py
+++ b/simpletransformers/classification/__init__.py
@@ -1,4 +1,2 @@
 from simpletransformers.classification.classification_model import ClassificationModel
-from simpletransformers.classification.multi_label_classification_model import (
-    MultiLabelClassificationModel,
-)
+from simpletransformers.classification.multi_label_classification_model import MultiLabelClassificationModel
\ No newline at end of file
diff --git a/simpletransformers/classification/classification_model.py b/simpletransformers/classification/classification_model.py
index 02b61725..b104840b 100755
--- a/simpletransformers/classification/classification_model.py
+++ b/simpletransformers/classification/classification_model.py
@@ -6,73 +6,55 @@
 import os
 import math
+import json
+import random
 import warnings
+from multiprocessing import cpu_count
 import torch
 import numpy as np
 import pandas as pd
-from scipy.stats import mode
-from sklearn.metrics import (
-    matthews_corrcoef,
-    confusion_matrix,
-    label_ranking_average_precision_score,
-)
+from scipy.stats import pearsonr, mode
+from sklearn.metrics import mean_squared_error, matthews_corrcoef, confusion_matrix, label_ranking_average_precision_score
 from tensorboardX import SummaryWriter
 from tqdm.auto import trange, tqdm
-from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
+from torch.utils.data.distributed import DistributedSampler
+from torch.utils.data import (
+    DataLoader,
+    RandomSampler,
+    SequentialSampler,
+    TensorDataset
+)
 from transformers import AdamW, get_linear_schedule_with_warmup
 from transformers import (
-    BertConfig,
-    BertTokenizer,
-    XLNetConfig,
-    XLNetTokenizer,
-    XLMConfig,
-    XLMTokenizer,
-    RobertaConfig,
-    RobertaTokenizer,
-    DistilBertConfig,
-
DistilBertTokenizer, - AlbertConfig, - AlbertTokenizer, - CamembertConfig, - CamembertTokenizer, - XLMRobertaConfig, - XLMRobertaTokenizer, + WEIGHTS_NAME, + BertConfig, BertTokenizer, + XLNetConfig, XLNetTokenizer, + XLMConfig, XLMTokenizer, + RobertaConfig, RobertaTokenizer, + DistilBertConfig, DistilBertTokenizer, + AlbertConfig, AlbertTokenizer, + CamembertConfig, CamembertTokenizer, + XLMRobertaConfig, XLMRobertaTokenizer, ) from simpletransformers.classification.classification_utils import ( InputExample, - convert_examples_to_features, + convert_examples_to_features ) -from simpletransformers.classification.transformer_models.bert_model import ( - BertForSequenceClassification, -) -from simpletransformers.classification.transformer_models.roberta_model import ( - RobertaForSequenceClassification, -) -from simpletransformers.classification.transformer_models.xlm_model import ( - XLMForSequenceClassification, -) -from simpletransformers.classification.transformer_models.xlnet_model import ( - XLNetForSequenceClassification, -) -from simpletransformers.classification.transformer_models.distilbert_model import ( - DistilBertForSequenceClassification, -) -from simpletransformers.classification.transformer_models.albert_model import ( - AlbertForSequenceClassification, -) -from simpletransformers.classification.transformer_models.camembert_model import ( - CamembertForSequenceClassification, -) -from simpletransformers.classification.transformer_models.xlm_roberta_model import ( - XLMRobertaForSequenceClassification, -) +from simpletransformers.classification.transformer_models.bert_model import BertForSequenceClassification +from simpletransformers.classification.transformer_models.roberta_model import RobertaForSequenceClassification +from simpletransformers.classification.transformer_models.xlm_model import XLMForSequenceClassification +from simpletransformers.classification.transformer_models.xlnet_model import XLNetForSequenceClassification +from simpletransformers.classification.transformer_models.distilbert_model import DistilBertForSequenceClassification +from simpletransformers.classification.transformer_models.albert_model import AlbertForSequenceClassification +from simpletransformers.classification.transformer_models.camembert_model import CamembertForSequenceClassification +from simpletransformers.classification.transformer_models.xlm_roberta_model import XLMRobertaForSequenceClassification from simpletransformers.config.global_args import global_args @@ -80,16 +62,7 @@ class ClassificationModel: - def __init__( - self, - model_type, - model_name, - num_labels=None, - weight=None, - args=None, - use_cuda=True, - cuda_device=-1, - ): + def __init__(self, model_type, model_name, num_labels=None, weight=None, args=None, use_cuda=True, cuda_device=-1): """ Initializes a ClassificationModel model. @@ -101,40 +74,22 @@ def __init__( args (optional): Default args will be used if this parameter is not provided. If provided, it should be a dict containing the args that should be changed in the default args. use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only. cuda_device (optional): Specific GPU that should be used. Will use the first available GPU by default. 
- """ # noqa: ignore flake8 + """ MODEL_CLASSES = { - "bert": (BertConfig, BertForSequenceClassification, BertTokenizer), - "xlnet": (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer), - "xlm": (XLMConfig, XLMForSequenceClassification, XLMTokenizer), - "roberta": ( - RobertaConfig, - RobertaForSequenceClassification, - RobertaTokenizer, - ), - "distilbert": ( - DistilBertConfig, - DistilBertForSequenceClassification, - DistilBertTokenizer, - ), - "albert": (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer), - "camembert": ( - CamembertConfig, - CamembertForSequenceClassification, - CamembertTokenizer, - ), - "xlmroberta": ( - XLMRobertaConfig, - XLMRobertaForSequenceClassification, - XLMRobertaTokenizer, - ), + 'bert': (BertConfig, BertForSequenceClassification, BertTokenizer), + 'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer), + 'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer), + 'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer), + 'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer), + 'albert': (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer), + 'camembert': (CamembertConfig, CamembertForSequenceClassification, CamembertTokenizer), + 'xlmroberta': (XLMRobertaConfig, XLMRobertaForSequenceClassification, XLMRobertaTokenizer), } config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type] if num_labels: - self.config = config_class.from_pretrained( - model_name, num_labels=num_labels - ) + self.config = config_class.from_pretrained(model_name, num_labels=num_labels) self.num_labels = num_labels else: self.config = config_class.from_pretrained(model_name) @@ -148,63 +103,44 @@ def __init__( else: self.device = torch.device(f"cuda:{cuda_device}") else: - raise ValueError( - "'use_cuda' set to True when cuda is unavailable. Make sure CUDA is" - "available or set use_cuda=False." - ) + raise ValueError("'use_cuda' set to True when cuda is unavailable. Make sure CUDA is available or set use_cuda=False.") else: self.device = "cpu" if self.weight: - self.model = model_class.from_pretrained( - model_name, - config=self.config, - weight=torch.Tensor(self.weight).to(self.device), - ) + self.model = model_class.from_pretrained(model_name, config=self.config, weight=torch.Tensor(self.weight).to(self.device)) else: self.model = model_class.from_pretrained(model_name, config=self.config) self.results = {} self.args = { - "sliding_window": False, - "tie_value": 1, - "stride": 0.8, - "regression": False, + 'sliding_window': False, + 'tie_value': 1, + 'stride': 0.8, + + 'regression': False, } self.args.update(global_args) if not use_cuda: - self.args["fp16"] = False + self.args['fp16'] = False if args: self.args.update(args) - self.tokenizer = tokenizer_class.from_pretrained( - model_name, do_lower_case=self.args["do_lower_case"] - ) + self.tokenizer = tokenizer_class.from_pretrained(model_name, do_lower_case=self.args['do_lower_case']) - self.args["model_name"] = model_name - self.args["model_type"] = model_type + self.args['model_name'] = model_name + self.args['model_type'] = model_type - if model_type in ["camembert", "xlmroberta"]: - warnings.warn( - f"use_multiprocessing automatically disabled as {model_type} fails" - "when using multiprocessing for feature conversion." 
- ) - self.args["use_multiprocessing"] = False - - def train_model( - self, - train_df, - multi_label=False, - output_dir=None, - show_running_loss=True, - args=None, - eval_df=None, - **kwargs, - ): + if model_type in ['camembert', 'xlmroberta']: + warnings.warn(f"use_multiprocessing automatically disabled as {model_type} fails when using multiprocessing for feature conversion.") + self.args['use_multiprocessing'] = False + + + def train_model(self, train_df, multi_label=False, output_dir=None, show_running_loss=True, args=None, eval_df=None, **kwargs): """ Trains the model using 'train_df' @@ -220,167 +156,86 @@ def train_model( Returns: None - """ # noqa: ignore flake8 + """ if args: self.args.update(args) - if self.args["silent"]: + if self.args['silent']: show_running_loss = False - if self.args["evaluate_during_training"] and eval_df is None: - raise ValueError( - "evaluate_during_training is enabled but eval_df is not specified." - " Pass eval_df to model.train_model() if using" - "evaluate_during_training." - ) + if self.args['evaluate_during_training'] and eval_df is None: + raise ValueError("evaluate_during_training is enabled but eval_df is not specified. Pass eval_df to model.train_model() if using evaluate_during_training.") if not output_dir: - output_dir = self.args["output_dir"] + output_dir = self.args['output_dir'] - if ( - os.path.exists(output_dir) - and os.listdir(output_dir) - and not self.args["overwrite_output_dir"] - ): - raise ValueError( - "Output directory ({}) already exists and is not empty." - "Use --overwrite_output_dir to overcome.".format(output_dir) - ) + if os.path.exists(output_dir) and os.listdir(output_dir) and not self.args["overwrite_output_dir"]: + raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(output_dir)) self._move_model_to_device() - if "text" in train_df.columns and "labels" in train_df.columns: - train_examples = [ - InputExample(i, text, None, label) - for i, (text, label) in enumerate( - zip(train_df["text"], train_df["labels"]) - ) - ] - elif "text_a" in train_df.columns and "text_b" in train_df.columns: - train_examples = [ - InputExample(i, text_a, text_b, label) - for i, (text_a, text_b, label) in enumerate( - zip(train_df["text_a"], train_df["text_b"], train_df["labels"]) - ) - ] + if 'text' in train_df.columns and 'labels' in train_df.columns: + train_examples = [InputExample(i, text, None, label) for i, (text, label) in enumerate(zip(train_df['text'], train_df['labels']))] + elif 'text_a' in train_df.columns and 'text_b' in train_df.columns: + train_examples = [InputExample(i, text_a, text_b, label) for i, (text_a, text_b, label) in enumerate(zip(train_df['text_a'], train_df['text_b'], train_df['labels']))] else: - warnings.warn( - "Dataframe headers not specified. Falling back to using column" - " 0 as text and column 1 as labels." - ) - train_examples = [ - InputExample(i, text, None, label) - for i, (text, label) in enumerate( - zip(train_df.iloc[:, 0], train_df.iloc[:, 1]) - ) - ] + warnings.warn("Dataframe headers not specified. 
Falling back to using column 0 as text and column 1 as labels.") + train_examples = [InputExample(i, text, None, label) for i, (text, label) in enumerate(zip(train_df.iloc[:, 0], train_df.iloc[:, 1]))] train_dataset = self.load_and_cache_examples(train_examples) if not os.path.exists(output_dir): os.makedirs(output_dir) - global_step, tr_loss = self.train( - train_dataset, - output_dir, - multi_label=multi_label, - show_running_loss=show_running_loss, - eval_df=eval_df, - **kwargs, - ) - - model_to_save = ( - self.model.module if hasattr(self.model, "module") else self.model - ) + global_step, tr_loss = self.train(train_dataset, output_dir, multi_label=multi_label, show_running_loss=show_running_loss, eval_df=eval_df, **kwargs) + + model_to_save = self.model.module if hasattr(self.model, "module") else self.model model_to_save.save_pretrained(output_dir) self.tokenizer.save_pretrained(output_dir) torch.save(self.args, os.path.join(output_dir, "training_args.bin")) - print( - "Training of {} model complete. Saved to {}.".format( - self.args["model_type"], output_dir - ) - ) - - def train( - self, - train_dataset, - output_dir, - multi_label=False, - show_running_loss=True, - eval_df=None, - **kwargs, - ): + print("Training of {} model complete. Saved to {}.".format(self.args["model_type"], output_dir)) + + def train(self, train_dataset, output_dir, multi_label=False, show_running_loss=True, eval_df=None, **kwargs): """ Trains the model on train_dataset. - Utility function to be used by the train_model() method. Not intended" - "to be used directly. + Utility function to be used by the train_model() method. Not intended to be used directly. """ + tokenizer = self.tokenizer device = self.device model = self.model args = self.args tb_writer = SummaryWriter(logdir=args["tensorboard_dir"]) train_sampler = RandomSampler(train_dataset) - train_dataloader = DataLoader( - train_dataset, sampler=train_sampler, batch_size=args["train_batch_size"] - ) + train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args["train_batch_size"]) - t_total = ( - len(train_dataloader) - // args["gradient_accumulation_steps"] - * args["num_train_epochs"] - ) + t_total = len(train_dataloader) // args["gradient_accumulation_steps"] * args["num_train_epochs"] no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ - { - "params": [ - p - for n, p in model.named_parameters() - if not any(nd in n for nd in no_decay) - ], - "weight_decay": args["weight_decay"], - }, - { - "params": [ - p - for n, p in model.named_parameters() - if any(nd in n for nd in no_decay) - ], - "weight_decay": 0.0, - }, + {"params": [p for n, p in model.named_parameters() if not any( + nd in n for nd in no_decay)], "weight_decay": args["weight_decay"]}, + {"params": [p for n, p in model.named_parameters() if any( + nd in n for nd in no_decay)], "weight_decay": 0.0} ] warmup_steps = math.ceil(t_total * args["warmup_ratio"]) - args["warmup_steps"] = ( - warmup_steps if args["warmup_steps"] == 0 else args["warmup_steps"] - ) - - optimizer = AdamW( - optimizer_grouped_parameters, - lr=args["learning_rate"], - eps=args["adam_epsilon"], - ) - scheduler = get_linear_schedule_with_warmup( - optimizer, num_warmup_steps=args["warmup_steps"], num_training_steps=t_total - ) + args["warmup_steps"] = warmup_steps if args["warmup_steps"] == 0 else args["warmup_steps"] + + optimizer = AdamW(optimizer_grouped_parameters, lr=args["learning_rate"], eps=args["adam_epsilon"]) + scheduler = 
get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args["warmup_steps"], num_training_steps=t_total) if args["fp16"]: try: from apex import amp except ImportError: - raise ImportError( - "Please install apex from https://www.github.com/nvidia/apex " - "to use fp16 training." - ) + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") - model, optimizer = amp.initialize( - model, optimizer, opt_level=args["fp16_opt_level"] - ) + model, optimizer = amp.initialize(model, optimizer, opt_level=args["fp16_opt_level"]) if args["n_gpu"] > 1: model = torch.nn.DataParallel(model) @@ -388,61 +243,55 @@ def train( global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() - train_iterator = trange( - int(args["num_train_epochs"]), desc="Epoch", disable=args["silent"] - ) + train_iterator = trange(int(args["num_train_epochs"]), desc="Epoch", disable=args['silent']) epoch_number = 0 - if args["evaluate_during_training"]: + if args['evaluate_during_training']: extra_metrics = {key: [] for key in kwargs} if multi_label: training_progress_scores = { - "global_step": [], - "LRAP": [], - "train_loss": [], - "eval_loss": [], - **extra_metrics, + 'global_step': [], + 'LRAP': [], + 'train_loss': [], + 'eval_loss': [], + **extra_metrics } else: if self.model.num_labels == 2: training_progress_scores = { - "global_step": [], - "tp": [], - "tn": [], - "fp": [], - "fn": [], - "mcc": [], - "train_loss": [], - "eval_loss": [], - **extra_metrics, + 'global_step': [], + 'tp': [], + 'tn': [], + 'fp': [], + 'fn': [], + 'mcc': [], + 'train_loss': [], + 'eval_loss': [], + **extra_metrics } elif self.model.num_labels == 1: - training_progress_scores = { - "global_step": [], - "train_loss": [], - "eval_loss": [], - **extra_metrics, + training_progress_scores = { + 'global_step': [], + 'train_loss': [], + 'eval_loss': [], + **extra_metrics } else: training_progress_scores = { - "global_step": [], - "mcc": [], - "train_loss": [], - "eval_loss": [], - **extra_metrics, + 'global_step': [], + 'mcc': [], + 'train_loss': [], + 'eval_loss': [], + **extra_metrics } - if args["wandb_project"]: - wandb.init( - project=args["wandb_project"], config={**args}, **args["wandb_kwargs"] - ) + if args['wandb_project']: + wandb.init(project=args['wandb_project'], config={**args}, **args['wandb_kwargs']) wandb.watch(self.model) model.train() for _ in train_iterator: # epoch_iterator = tqdm(train_dataloader, desc="Iteration") - for step, batch in enumerate( - tqdm(train_dataloader, desc="Current iteration", disable=args["silent"]) - ): + for step, batch in enumerate(tqdm(train_dataloader, desc="Current iteration", disable=args['silent'])): batch = tuple(t.to(device) for t in batch) inputs = self._get_inputs_dict(batch) @@ -450,10 +299,8 @@ def train( # model outputs are always tuple in pytorch-transformers (see doc) loss = outputs[0] - if args["n_gpu"] > 1: - loss = ( - loss.mean() - ) # mean() to average on multi-gpu parallel training + if args['n_gpu'] > 1: + loss = loss.mean() # mean() to average on multi-gpu parallel training current_loss = loss.item() @@ -466,14 +313,10 @@ def train( if args["fp16"]: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() - torch.nn.utils.clip_grad_norm_( - amp.master_params(optimizer), args["max_grad_norm"] - ) + torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args["max_grad_norm"]) else: loss.backward() - torch.nn.utils.clip_grad_norm_( - model.parameters(), args["max_grad_norm"] - ) + 
torch.nn.utils.clip_grad_norm_(model.parameters(), args["max_grad_norm"]) tr_loss += loss.item() if (step + 1) % args["gradient_accumulation_steps"] == 0: @@ -482,108 +325,69 @@ def train( model.zero_grad() global_step += 1 - if ( - args["logging_steps"] > 0 - and global_step % args["logging_steps"] == 0 - ): + if args["logging_steps"] > 0 and global_step % args["logging_steps"] == 0: # Log metrics tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) - tb_writer.add_scalar( - "loss", - (tr_loss - logging_loss) / args["logging_steps"], - global_step, - ) + tb_writer.add_scalar("loss", (tr_loss - logging_loss)/args["logging_steps"], global_step) logging_loss = tr_loss - if args["wandb_project"]: - wandb.log( - { - "Training loss": current_loss, - "lr": scheduler.get_lr()[0], - "global_step": global_step, - } - ) + if args['wandb_project']: + wandb.log({'Training loss': current_loss, 'lr': scheduler.get_lr()[0], 'global_step': global_step}) if args["save_steps"] > 0 and global_step % args["save_steps"] == 0: # Save model checkpoint - output_dir_current = os.path.join( - output_dir, "checkpoint-{}".format(global_step) - ) + output_dir_current = os.path.join(output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir_current): os.makedirs(output_dir_current) # Take care of distributed/parallel training - model_to_save = ( - model.module if hasattr(model, "module") else model - ) + model_to_save = model.module if hasattr(model, "module") else model model_to_save.save_pretrained(output_dir_current) self.tokenizer.save_pretrained(output_dir_current) - if args["evaluate_during_training"] and ( - args["evaluate_during_training_steps"] > 0 - and global_step % args["evaluate_during_training_steps"] == 0 - ): - # Only evaluate when single GPU otherwise - # metrics may not average well + if args['evaluate_during_training'] and (args["evaluate_during_training_steps"] > 0 and global_step % args["evaluate_during_training_steps"] == 0): + # Only evaluate when single GPU otherwise metrics may not average well results, _, _ = self.eval_model(eval_df, verbose=True, **kwargs) for key, value in results.items(): - tb_writer.add_scalar( - "eval_{}".format(key), value, global_step - ) + tb_writer.add_scalar('eval_{}'.format(key), value, global_step) - output_dir_current = os.path.join( - output_dir, "checkpoint-{}".format(global_step) - ) + output_dir_current = os.path.join(output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir_current): os.makedirs(output_dir_current) - if args["save_eval_checkpoints"]: - model_to_save = ( - model.module if hasattr(model, "module") else model - ) + if args['save_eval_checkpoints']: + model_to_save = model.module if hasattr(model, "module") else model model_to_save.save_pretrained(output_dir_current) self.tokenizer.save_pretrained(output_dir_current) - output_eval_file = os.path.join( - output_dir_current, "eval_results.txt" - ) + output_eval_file = os.path.join(output_dir_current, "eval_results.txt") with open(output_eval_file, "w") as writer: for key in sorted(results.keys()): writer.write("{} = {}\n".format(key, str(results[key]))) - training_progress_scores["global_step"].append(global_step) - training_progress_scores["train_loss"].append(current_loss) + training_progress_scores['global_step'].append(global_step) + training_progress_scores['train_loss'].append(current_loss) for key in results: training_progress_scores[key].append(results[key]) report = pd.DataFrame(training_progress_scores) - report.to_csv( - 
args["output_dir"] + "training_progress_scores.csv", - index=False, - ) + report.to_csv(args['output_dir'] + 'training_progress_scores.csv', index=False) - if args["wandb_project"]: + if args['wandb_project']: wandb.log(self._get_last_metrics(training_progress_scores)) epoch_number += 1 - output_dir_current = os.path.join( - output_dir, "checkpoint-{}-epoch-{}".format(global_step, epoch_number) - ) + output_dir_current = os.path.join(output_dir, "checkpoint-{}-epoch-{}".format(global_step, epoch_number)) - if ( - args["save_model_every_epoch"] or args["evaluate_during_training"] - ) and not os.path.exists(output_dir_current): + if (args['save_model_every_epoch'] or args['evaluate_during_training']) and not os.path.exists(output_dir_current): os.makedirs(output_dir_current) - if ( - args["save_model_every_epoch"] - and epoch_number != args["num_train_epochs"] - ): + if args['save_model_every_epoch'] and epoch_number != args['num_train_epochs']: model_to_save = model.module if hasattr(model, "module") else model model_to_save.save_pretrained(output_dir_current) self.tokenizer.save_pretrained(output_dir_current) - if args["evaluate_during_training"]: + if args['evaluate_during_training']: results, _, _ = self.eval_model(eval_df, verbose=True, **kwargs) output_eval_file = os.path.join(output_dir_current, "eval_results.txt") @@ -591,20 +395,16 @@ def train( for key in sorted(results.keys()): writer.write("{} = {}\n".format(key, str(results[key]))) - training_progress_scores["global_step"].append(global_step) - training_progress_scores["train_loss"].append(current_loss) + training_progress_scores['global_step'].append(global_step) + training_progress_scores['train_loss'].append(current_loss) for key in results: training_progress_scores[key].append(results[key]) report = pd.DataFrame(training_progress_scores) - report.to_csv( - args["output_dir"] + "training_progress_scores.csv", index=False - ) + report.to_csv(args['output_dir'] + 'training_progress_scores.csv', index=False) return global_step, tr_loss / global_step - def eval_model( - self, eval_df, multi_label=False, output_dir=None, verbose=False, **kwargs - ): + def eval_model(self, eval_df, multi_label=False, output_dir=None, verbose=False, **kwargs): """ Evaluates the model on eval_df. Saves results to output_dir. @@ -620,16 +420,14 @@ def eval_model( result: Dictionary containing evaluation results. (Matthews correlation coefficient, tp, tn, fp, fn) model_outputs: List of model outputs for each row in eval_df wrong_preds: List of InputExample objects corresponding to each incorrect prediction by the model - """ # noqa: ignore flake8 + """ if not output_dir: output_dir = self.args["output_dir"] self._move_model_to_device() - result, model_outputs, wrong_preds = self.evaluate( - eval_df, output_dir, multi_label=multi_label, **kwargs - ) + result, model_outputs, wrong_preds = self.evaluate(eval_df, output_dir, multi_label=multi_label, **kwargs) self.results.update(result) if verbose: @@ -641,10 +439,10 @@ def evaluate(self, eval_df, output_dir, multi_label=False, prefix="", **kwargs): """ Evaluates the model on eval_df. - Utility function to be used by the eval_model() method. Not intended to - be used directly. + Utility function to be used by the eval_model() method. Not intended to be used directly. 
""" + tokenizer = self.tokenizer device = self.device model = self.model args = self.args @@ -652,45 +450,23 @@ def evaluate(self, eval_df, output_dir, multi_label=False, prefix="", **kwargs): results = {} - if "text" in eval_df.columns and "labels" in eval_df.columns: - eval_examples = [ - InputExample(i, text, None, label) - for i, (text, label) in enumerate( - zip(eval_df["text"], eval_df["labels"]) - ) - ] - elif "text_a" in eval_df.columns and "text_b" in eval_df.columns: - eval_examples = [ - InputExample(i, text_a, text_b, label) - for i, (text_a, text_b, label) in enumerate( - zip(eval_df["text_a"], eval_df["text_b"], eval_df["labels"]) - ) - ] + if 'text' in eval_df.columns and 'labels' in eval_df.columns: + eval_examples = [InputExample(i, text, None, label) for i, (text, label) in enumerate(zip(eval_df['text'], eval_df['labels']))] + elif 'text_a' in eval_df.columns and 'text_b' in eval_df.columns: + eval_examples = [InputExample(i, text_a, text_b, label) for i, (text_a, text_b, label) in enumerate(zip(eval_df['text_a'], eval_df['text_b'], eval_df['labels']))] else: - warnings.warn( - "Dataframe headers not specified." - " Falling back to using column 0 as text and column 1 as labels." - ) - eval_examples = [ - InputExample(i, text, None, label) - for i, (text, label) in enumerate( - zip(eval_df.iloc[:, 0], eval_df.iloc[:, 1]) - ) - ] - - if args["sliding_window"]: - eval_dataset, window_counts = self.load_and_cache_examples( - eval_examples, evaluate=True - ) + warnings.warn("Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels.") + eval_examples = [InputExample(i, text, None, label) for i, (text, label) in enumerate(zip(eval_df.iloc[:, 0], eval_df.iloc[:, 1]))] + + if args['sliding_window']: + eval_dataset, window_counts = self.load_and_cache_examples(eval_examples, evaluate=True) else: eval_dataset = self.load_and_cache_examples(eval_examples, evaluate=True) if not os.path.exists(eval_output_dir): os.makedirs(eval_output_dir) eval_sampler = SequentialSampler(eval_dataset) - eval_dataloader = DataLoader( - eval_dataset, sampler=eval_sampler, batch_size=args["eval_batch_size"] - ) + eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args["eval_batch_size"]) eval_loss = 0.0 nb_eval_steps = 0 @@ -698,7 +474,7 @@ def evaluate(self, eval_df, output_dir, multi_label=False, prefix="", **kwargs): out_label_ids = None model.eval() - for batch in tqdm(eval_dataloader, disable=args["silent"]): + for batch in tqdm(eval_dataloader, disable=args['silent']): batch = tuple(t.to(device) for t in batch) with torch.no_grad(): @@ -719,27 +495,19 @@ def evaluate(self, eval_df, output_dir, multi_label=False, prefix="", **kwargs): else: preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) out_label_ids = np.append( - out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0 - ) + out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps - if args["sliding_window"]: + if args['sliding_window']: count = 0 window_ranges = [] for n_windows in window_counts: window_ranges.append([count, count + n_windows]) count += n_windows - preds = [ - preds[window_range[0] : window_range[1]] - for window_range in window_ranges - ] - out_label_ids = [ - out_label_ids[i] - for i in range(len(out_label_ids)) - if i in [window[0] for window in window_ranges] - ] + preds = [preds[window_range[0]: window_range[1]] for window_range in window_ranges] + out_label_ids = [out_label_ids[i] for i in 
range(len(out_label_ids)) if i in [window[0] for window in window_ranges]] model_outputs = preds @@ -748,11 +516,11 @@ def evaluate(self, eval_df, output_dir, multi_label=False, prefix="", **kwargs): for pred_row in preds: mode_pred, counts = mode(pred_row) if len(counts) > 1 and counts[0] == counts[1]: - final_preds.append(args["tie_value"]) + final_preds.append(args['tie_value']) else: final_preds.append(mode_pred[0]) preds = np.array(final_preds) - elif not multi_label and args["regression"] is True: + elif not multi_label and args['regression'] == True: preds = np.squeeze(preds) model_outputs = preds else: @@ -761,10 +529,8 @@ def evaluate(self, eval_df, output_dir, multi_label=False, prefix="", **kwargs): if not multi_label: preds = np.argmax(preds, axis=1) - result, wrong = self.compute_metrics( - preds, out_label_ids, eval_examples, **kwargs - ) - result["eval_loss"] = eval_loss + result, wrong = self.compute_metrics(preds, out_label_ids, eval_examples, **kwargs) + result['eval_loss'] = eval_loss results.update(result) output_eval_file = os.path.join(eval_output_dir, "eval_results.txt") @@ -774,24 +540,20 @@ def evaluate(self, eval_df, output_dir, multi_label=False, prefix="", **kwargs): return results, model_outputs, wrong - def load_and_cache_examples( - self, examples, evaluate=False, no_cache=False, multi_label=False - ): + def load_and_cache_examples(self, examples, evaluate=False, no_cache=False, multi_label=False): """ - Converts a list of InputExample objects to a TensorDataset containing - InputFeatures. Caches the InputFeatures. + Converts a list of InputExample objects to a TensorDataset containing InputFeatures. Caches the InputFeatures. - Utility function for train() and eval() methods. Not intended to be - used directly. + Utility function for train() and eval() methods. Not intended to be used directly. """ process_count = self.args["process_count"] tokenizer = self.tokenizer args = self.args - - if not multi_label and args["regression"]: - output_mode = "regression" + + if not multi_label and args['regression']: + output_mode = 'regression' else: output_mode = "classification" @@ -799,21 +561,9 @@ def load_and_cache_examples( os.mkdir(self.args["cache_dir"]) mode = "dev" if evaluate else "train" - cached_features_file = os.path.join( - args["cache_dir"], - "cached_{}_{}_{}_{}_{}".format( - mode, - args["model_type"], - args["max_seq_length"], - self.num_labels, - len(examples), - ), - ) - - if os.path.exists(cached_features_file) and ( - (not args["reprocess_input_data"] and not no_cache) - or (mode == "dev" and args["use_cached_eval_features"]) - ): + cached_features_file = os.path.join(args["cache_dir"], "cached_{}_{}_{}_{}_{}".format(mode, args["model_type"], args["max_seq_length"], self.num_labels, len(examples))) + + if os.path.exists(cached_features_file) and ((not args["reprocess_input_data"] and not no_cache) or (mode == "dev" and args['use_cached_eval_features'])): features = torch.load(cached_features_file) print(f"Features loaded from cache at {cached_features_file}") else: @@ -828,8 +578,7 @@ def load_and_cache_examples( cls_token=tokenizer.cls_token, cls_token_segment_id=2 if args["model_type"] in ["xlnet"] else 0, sep_token=tokenizer.sep_token, - # RoBERTa uses an extra separator b/w pairs of sentences, cf. - # github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 # noqa: ignore flake8 + # RoBERTa uses an extra separator b/w pairs of sentences, cf. 
github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 sep_token_extra=bool(args["model_type"] in ["roberta"]), # PAD on the left for XLNet pad_on_left=bool(args["model_type"] in ["xlnet"]), @@ -837,49 +586,37 @@ def load_and_cache_examples( pad_token_segment_id=4 if args["model_type"] in ["xlnet"] else 0, process_count=process_count, multi_label=multi_label, - silent=args["silent"], - use_multiprocessing=args["use_multiprocessing"], - sliding_window=args["sliding_window"], + silent=args['silent'], + use_multiprocessing=args['use_multiprocessing'], + sliding_window=args['sliding_window'], flatten=not evaluate, - stride=args["stride"], + stride=args['stride'] ) if not no_cache: torch.save(features, cached_features_file) - if args["sliding_window"] and evaluate: + if args['sliding_window'] and evaluate: window_counts = [len(sample) for sample in features] features = [feature for feature_set in features for feature in feature_set] all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) - all_input_mask = torch.tensor( - [f.input_mask for f in features], dtype=torch.long - ) - all_segment_ids = torch.tensor( - [f.segment_ids for f in features], dtype=torch.long - ) + all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) + all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long) if output_mode == "classification": - all_label_ids = torch.tensor( - [f.label_id for f in features], dtype=torch.long - ) + all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long) elif output_mode == "regression": - all_label_ids = torch.tensor( - [f.label_id for f in features], dtype=torch.float - ) + all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float) - dataset = TensorDataset( - all_input_ids, all_input_mask, all_segment_ids, all_label_ids - ) + dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) - if args["sliding_window"] and evaluate: + if args['sliding_window'] and evaluate: return dataset, window_counts else: return dataset - def compute_metrics( - self, preds, labels, eval_examples, multi_label=False, **kwargs - ): + def compute_metrics(self, preds, labels, eval_examples, multi_label=False, **kwargs): """ Computes the evaluation metrics for the model predictions. @@ -893,7 +630,7 @@ def compute_metrics( Returns: result: Dictionary containing evaluation results. (Matthews correlation coefficient, tp, tn, fp, fn) wrong: List of InputExample objects corresponding to each incorrect prediction by the model - """ # noqa: ignore flake8 + """ assert len(preds) == len(labels) @@ -908,20 +645,20 @@ def compute_metrics( if multi_label: label_ranking_score = label_ranking_average_precision_score(labels, preds) return {**{"LRAP": label_ranking_score}, **extra_metrics}, wrong - elif self.args["regression"]: + elif self.args['regression']: return {**extra_metrics}, wrong - + mcc = matthews_corrcoef(labels, preds) if self.model.num_labels == 2: tn, fp, fn, tp = confusion_matrix(labels, preds).ravel() - return ( - { - **{"mcc": mcc, "tp": tp, "tn": tn, "fp": fp, "fn": fn}, - **extra_metrics, - }, - wrong, - ) + return {**{ + "mcc": mcc, + "tp": tp, + "tn": tn, + "fp": fp, + "fn": fn + }, **extra_metrics}, wrong else: return {**{"mcc": mcc}, **extra_metrics}, wrong @@ -930,14 +667,14 @@ def predict(self, to_predict, multi_label=False): Performs predictions on a list of text. 
Args: - to_predict: A python list of text (str) to be sent to the model - for prediction. + to_predict: A python list of text (str) to be sent to the model for prediction. Returns: preds: A python list of the predictions (0 or 1) for each text. model_outputs: A python list of the raw model outputs for each text. """ + tokenizer = self.tokenizer device = self.device model = self.model args = self.args @@ -945,40 +682,26 @@ def predict(self, to_predict, multi_label=False): self._move_model_to_device() if multi_label: - eval_examples = [ - InputExample(i, text, None, [0 for i in range(self.num_labels)]) - for i, text in enumerate(to_predict) - ] + eval_examples = [InputExample(i, text, None, [0 for i in range(self.num_labels)]) for i, text in enumerate(to_predict)] else: if isinstance(to_predict[0], list): - eval_examples = [ - InputExample(i, text[0], text[1], 0) - for i, text in enumerate(to_predict) - ] + eval_examples = [InputExample(i, text[0], text[1], 0) for i, text in enumerate(to_predict)] else: - eval_examples = [ - InputExample(i, text, None, 0) for i, text in enumerate(to_predict) - ] - if args["sliding_window"]: - eval_dataset, window_counts = self.load_and_cache_examples( - eval_examples, evaluate=True, no_cache=True - ) + eval_examples = [InputExample(i, text, None, 0) for i, text in enumerate(to_predict)] + if args['sliding_window']: + eval_dataset, window_counts = self.load_and_cache_examples(eval_examples, evaluate=True, no_cache=True) else: - eval_dataset = self.load_and_cache_examples( - eval_examples, evaluate=True, multi_label=multi_label, no_cache=True - ) + eval_dataset = self.load_and_cache_examples(eval_examples, evaluate=True, multi_label=multi_label, no_cache=True) eval_sampler = SequentialSampler(eval_dataset) - eval_dataloader = DataLoader( - eval_dataset, sampler=eval_sampler, batch_size=args["eval_batch_size"] - ) + eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args["eval_batch_size"]) eval_loss = 0.0 nb_eval_steps = 0 preds = None out_label_ids = None - for batch in tqdm(eval_dataloader, disable=args["silent"]): + for batch in tqdm(eval_dataloader, disable=args['silent']): model.eval() batch = tuple(t.to(device) for t in batch) @@ -999,23 +722,18 @@ def predict(self, to_predict, multi_label=False): out_label_ids = inputs["labels"].detach().cpu().numpy() else: preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) - out_label_ids = np.append( - out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0 - ) + out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps - if args["sliding_window"]: + if args['sliding_window']: count = 0 window_ranges = [] for n_windows in window_counts: window_ranges.append([count, count + n_windows]) count += n_windows - preds = [ - preds[window_range[0] : window_range[1]] - for window_range in window_ranges - ] + preds = [preds[window_range[0]: window_range[1]] for window_range in window_ranges] model_outputs = preds @@ -1024,30 +742,21 @@ def predict(self, to_predict, multi_label=False): for pred_row in preds: mode_pred, counts = mode(pred_row) if len(counts) > 1 and counts[0] == counts[1]: - final_preds.append(args["tie_value"]) + final_preds.append(args['tie_value']) else: final_preds.append(mode_pred[0]) preds = np.array(final_preds) - elif not multi_label and args["regression"] is True: + elif not multi_label and args['regression'] == True: preds = np.squeeze(preds) model_outputs = preds else: model_outputs = preds 
if multi_label: - if isinstance(args["threshold"], list): - threshold_values = args["threshold"] - preds = [ - [ - self._threshold(pred, threshold_values[i]) - for i, pred in enumerate(example) - ] - for example in preds - ] + if isinstance(args['threshold'], list): + threshold_values = args['threshold'] + preds = [[self._threshold(pred, threshold_values[i]) for i, pred in enumerate(example)] for example in preds] else: - preds = [ - [self._threshold(pred, args["threshold"]) for pred in example] - for example in preds - ] + preds = [[self._threshold(pred, args['threshold']) for pred in example] for example in preds] else: preds = np.argmax(preds, axis=1) @@ -1062,13 +771,15 @@ def _move_model_to_device(self): self.model.to(self.device) def _get_inputs_dict(self, batch): - inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]} + inputs = { + "input_ids": batch[0], + "attention_mask": batch[1], + "labels": batch[3] + } # XLM, DistilBERT and RoBERTa don't use segment_ids if self.args["model_type"] != "distilbert": - inputs["token_type_ids"] = ( - batch[2] if self.args["model_type"] in ["bert", "xlnet"] else None - ) + inputs["token_type_ids"] = batch[2] if self.args["model_type"] in ["bert", "xlnet"] else None return inputs diff --git a/simpletransformers/classification/classification_utils.py b/simpletransformers/classification/classification_utils.py index 44c7ea3a..85a4d15f 100755 --- a/simpletransformers/classification/classification_utils.py +++ b/simpletransformers/classification/classification_utils.py @@ -16,10 +16,17 @@ """ BERT classification fine-tuning: utilities to work with GLUE tasks """ from __future__ import absolute_import, division, print_function + +import os +import sys import csv + +from io import open from multiprocessing import Pool, cpu_count from tqdm.auto import tqdm +from scipy.stats import pearsonr, spearmanr +from sklearn.metrics import matthews_corrcoef, f1_score csv.field_size_limit(2147483647) @@ -65,23 +72,9 @@ def convert_example_to_feature( cls_token_segment_id=1, pad_token_segment_id=0, mask_padding_with_zero=True, - sep_token_extra=False, + sep_token_extra=False ): - ( - example, - max_seq_length, - tokenizer, - output_mode, - cls_token_at_end, - cls_token, - sep_token, - cls_token_segment_id, - pad_on_left, - pad_token_segment_id, - sep_token_extra, - multi_label, - stride, - ) = example_row + example, max_seq_length, tokenizer, output_mode, cls_token_at_end, cls_token, sep_token, cls_token_segment_id, pad_on_left, pad_token_segment_id, sep_token_extra, multi_label, stride = example_row tokens_a = tokenizer.tokenize(example.text_a) @@ -97,7 +90,7 @@ def convert_example_to_feature( # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa. 
special_tokens_count = 3 if sep_token_extra else 2 if len(tokens_a) > max_seq_length - special_tokens_count: - tokens_a = tokens_a[: (max_seq_length - special_tokens_count)] + tokens_a = tokens_a[:(max_seq_length - special_tokens_count)] # The convention in BERT is: # (a) For sequence pairs: @@ -141,15 +134,11 @@ def convert_example_to_feature( padding_length = max_seq_length - len(input_ids) if pad_on_left: input_ids = ([pad_token] * padding_length) + input_ids - input_mask = ( - [0 if mask_padding_with_zero else 1] * padding_length - ) + input_mask + input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids else: input_ids = input_ids + ([pad_token] * padding_length) - input_mask = input_mask + ( - [0 if mask_padding_with_zero else 1] * padding_length - ) + input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length) segment_ids = segment_ids + ([pad_token_segment_id] * padding_length) assert len(input_ids) == max_seq_length @@ -163,14 +152,14 @@ def convert_example_to_feature( # else: # raise KeyError(output_mode) - if output_mode == "regression": - label_id = float(example.label) # noqa: ignore flake8 + if output_mode == 'regression': + label_id = float(example.label) return InputFeatures( input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, - label_id=example.label, + label_id=example.label ) @@ -184,21 +173,7 @@ def convert_example_to_feature_sliding_window( mask_padding_with_zero=True, sep_token_extra=False, ): - ( - example, - max_seq_length, - tokenizer, - output_mode, - cls_token_at_end, - cls_token, - sep_token, - cls_token_segment_id, - pad_on_left, - pad_token_segment_id, - sep_token_extra, - multi_label, - stride, - ) = example_row + example, max_seq_length, tokenizer, output_mode, cls_token_at_end, cls_token, sep_token, cls_token_segment_id, pad_on_left, pad_token_segment_id, sep_token_extra, multi_label, stride = example_row if stride < 1: stride = int(max_seq_length * stride) @@ -208,17 +183,14 @@ def convert_example_to_feature_sliding_window( tokens_a = tokenizer.tokenize(example.text_a) + special_tokens_count = 3 if sep_token_extra else 2 if len(tokens_a) > bucket_size: - token_sets = [ - tokens_a[i : i + bucket_size] for i in range(0, len(tokens_a), stride) - ] + token_sets = [tokens_a[i:i + bucket_size] for i in range(0, len(tokens_a), stride)] else: token_sets.append(tokens_a) if example.text_b: - raise ValueError( - "Sequence pair tasks not implemented for sliding window tokenization." 
- ) + raise ValueError("Sequence pair tasks not implemented for sliding window tokenization.") # The convention in BERT is: # (a) For sequence pairs: @@ -261,15 +233,11 @@ def convert_example_to_feature_sliding_window( padding_length = max_seq_length - len(input_ids) if pad_on_left: input_ids = ([pad_token] * padding_length) + input_ids - input_mask = ( - [0 if mask_padding_with_zero else 1] * padding_length - ) + input_mask + input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids else: input_ids = input_ids + ([pad_token] * padding_length) - input_mask = input_mask + ( - [0 if mask_padding_with_zero else 1] * padding_length - ) + input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length) segment_ids = segment_ids + ([pad_token_segment_id] * padding_length) assert len(input_ids) == max_seq_length @@ -288,7 +256,7 @@ def convert_example_to_feature_sliding_window( input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, - label_id=example.label, + label_id=example.label ) ) @@ -317,81 +285,37 @@ def convert_examples_to_features( use_multiprocessing=True, sliding_window=False, flatten=False, - stride=None, + stride=None ): """ Loads a data file into a list of `InputBatch`s `cls_token_at_end` define the location of the CLS token: - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP] - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS] - `cls_token_segment_id` define the segment id associated to the CLS token - (0 for BERT, 2 for XLNet) + `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet) """ - examples = [ - ( - example, - max_seq_length, - tokenizer, - output_mode, - cls_token_at_end, - cls_token, - sep_token, - cls_token_segment_id, - pad_on_left, - pad_token_segment_id, - sep_token_extra, - multi_label, - stride, - ) - for example in examples - ] + examples = [(example, max_seq_length, tokenizer, output_mode, cls_token_at_end, cls_token, sep_token, cls_token_segment_id, pad_on_left, pad_token_segment_id, sep_token_extra, multi_label, stride) for example in examples] if use_multiprocessing: if sliding_window: - print("sliding_window enabled") + print('sliding_window enabled') with Pool(process_count) as p: - features = list( - tqdm( - p.imap( - convert_example_to_feature_sliding_window, - examples, - chunksize=500, - ), - total=len(examples), - disable=silent, - ) - ) + features = list(tqdm(p.imap(convert_example_to_feature_sliding_window, examples, chunksize=500), total=len(examples), disable=silent)) if flatten: - features = [ - feature for feature_set in features for feature in feature_set - ] - print(f"{len(features)} features created from {len(examples)} samples.") + features = [feature for feature_set in features for feature in feature_set] + print(f'{len(features)} features created from {len(examples)} samples.') else: with Pool(process_count) as p: - features = list( - tqdm( - p.imap(convert_example_to_feature, examples, chunksize=500), - total=len(examples), - disable=silent, - ) - ) + features = list(tqdm(p.imap(convert_example_to_feature, examples, chunksize=500), total=len(examples), disable=silent)) else: if sliding_window: - print("sliding_window enabled") - features = [ - convert_example_to_feature_sliding_window(example) - for example in tqdm(examples, disable=silent) - ] + print('sliding_window enabled') + features = [convert_example_to_feature_sliding_window(example) for example 
in tqdm(examples, disable=silent)] if flatten: - features = [ - feature for feature_set in features for feature in feature_set - ] - print(f"{len(features)} features created from {len(examples)} samples.") + features = [feature for feature_set in features for feature in feature_set] + print(f'{len(features)} features created from {len(examples)} samples.') else: - features = [ - convert_example_to_feature(example) - for example in tqdm(examples, disable=silent) - ] + features = [convert_example_to_feature(example) for example in tqdm(examples, disable=silent)] return features diff --git a/simpletransformers/classification/multi_label_classification_model.py b/simpletransformers/classification/multi_label_classification_model.py index 2c07c28a..268f6435 100755 --- a/simpletransformers/classification/multi_label_classification_model.py +++ b/simpletransformers/classification/multi_label_classification_model.py @@ -1,42 +1,30 @@ import torch +from multiprocessing import cpu_count + from simpletransformers.classification import ClassificationModel -from simpletransformers.custom_models.models import ( - BertForMultiLabelSequenceClassification, - RobertaForMultiLabelSequenceClassification, - XLNetForMultiLabelSequenceClassification, - XLMForMultiLabelSequenceClassification, - DistilBertForMultiLabelSequenceClassification, - AlbertForMultiLabelSequenceClassification, -) +from simpletransformers.custom_models.models import (BertForMultiLabelSequenceClassification, + RobertaForMultiLabelSequenceClassification, + XLNetForMultiLabelSequenceClassification, + XLMForMultiLabelSequenceClassification, + DistilBertForMultiLabelSequenceClassification, + AlbertForMultiLabelSequenceClassification + ) from simpletransformers.config.global_args import global_args from transformers import ( - BertConfig, - BertTokenizer, - XLNetConfig, - XLNetTokenizer, - XLMConfig, - XLMTokenizer, - RobertaConfig, - RobertaTokenizer, - DistilBertConfig, - DistilBertTokenizer, - AlbertConfig, - AlbertTokenizer, + WEIGHTS_NAME, + BertConfig, BertTokenizer, + XLNetConfig, XLNetTokenizer, + XLMConfig, XLMTokenizer, + RobertaConfig, RobertaTokenizer, + DistilBertConfig, DistilBertTokenizer, + AlbertConfig, AlbertTokenizer ) class MultiLabelClassificationModel(ClassificationModel): - def __init__( - self, - model_type, - model_name, - num_labels=None, - pos_weight=None, - args=None, - use_cuda=True, - ): + def __init__(self, model_type, model_name, num_labels=None, pos_weight=None, args=None, use_cuda=True): """ Initializes a MultiLabelClassification model. @@ -47,41 +35,19 @@ def __init__( pos_weight (optional): A list of length num_labels containing the weights to assign to each label for loss calculation. args (optional): Default args will be used if this parameter is not provided. If provided, it should be a dict containing the args that should be changed in the default args. use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only. 
- """ # noqa: ignore flake8 + """ MODEL_CLASSES = { - "bert": ( - BertConfig, - BertForMultiLabelSequenceClassification, - BertTokenizer, - ), - "roberta": ( - RobertaConfig, - RobertaForMultiLabelSequenceClassification, - RobertaTokenizer, - ), - "xlnet": ( - XLNetConfig, - XLNetForMultiLabelSequenceClassification, - XLNetTokenizer, - ), - "xlm": (XLMConfig, XLMForMultiLabelSequenceClassification, XLMTokenizer), - "distilbert": ( - DistilBertConfig, - DistilBertForMultiLabelSequenceClassification, - DistilBertTokenizer, - ), - "albert": ( - AlbertConfig, - AlbertForMultiLabelSequenceClassification, - AlbertTokenizer, - ), + 'bert': (BertConfig, BertForMultiLabelSequenceClassification, BertTokenizer), + 'roberta': (RobertaConfig, RobertaForMultiLabelSequenceClassification, RobertaTokenizer), + 'xlnet': (XLNetConfig, XLNetForMultiLabelSequenceClassification, XLNetTokenizer), + 'xlm': (XLMConfig, XLMForMultiLabelSequenceClassification, XLMTokenizer), + 'distilbert': (DistilBertConfig, DistilBertForMultiLabelSequenceClassification, DistilBertTokenizer), + 'albert': (AlbertConfig, AlbertForMultiLabelSequenceClassification, AlbertTokenizer) } config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type] if num_labels: - self.config = config_class.from_pretrained( - model_name, num_labels=num_labels - ) + self.config = config_class.from_pretrained(model_name, num_labels=num_labels) self.num_labels = num_labels else: self.config = config_class.from_pretrained(model_name) @@ -92,92 +58,52 @@ def __init__( if torch.cuda.is_available(): self.device = torch.device("cuda") else: - raise ValueError( - "'use_cuda' set to True when cuda is unavailable." - "Make sure CUDA is available or set use_cuda=False." - ) + raise ValueError("'use_cuda' set to True when cuda is unavailable. 
Make sure CUDA is available or set use_cuda=False.") else: self.device = "cpu" if self.pos_weight: - self.model = model_class.from_pretrained( - model_name, - config=self.config, - pos_weight=torch.Tensor(self.pos_weight).to(self.device), - ) + self.model = model_class.from_pretrained(model_name, config=self.config, pos_weight=torch.Tensor(self.pos_weight).to(self.device)) else: self.model = model_class.from_pretrained(model_name, config=self.config) self.results = {} self.args = { - "threshold": 0.5, - "sliding_window": False, - "tie_value": 1, - "stride": False, + 'threshold': 0.5, + + 'sliding_window': False, + 'tie_value': 1, + 'stride': False, } self.args.update(global_args) if not use_cuda: - self.args["fp16"] = False + self.args['fp16'] = False if args: self.args.update(args) - self.tokenizer = tokenizer_class.from_pretrained( - model_name, do_lower_case=self.args["do_lower_case"] - ) + self.tokenizer = tokenizer_class.from_pretrained(model_name, do_lower_case=self.args['do_lower_case']) self.args["model_name"] = model_name self.args["model_type"] = model_type - def train_model( - self, - train_df, - multi_label=True, - eval_df=None, - output_dir=None, - show_running_loss=True, - args=None, - **kwargs - ): - return super().train_model( - train_df, - multi_label=multi_label, - eval_df=eval_df, - output_dir=output_dir, - show_running_loss=show_running_loss, - args=args, - ) - - def eval_model( - self, eval_df, multi_label=True, output_dir=None, verbose=False, **kwargs - ): - return super().eval_model( - eval_df, - output_dir=output_dir, - multi_label=multi_label, - verbose=verbose, - **kwargs - ) - - def evaluate(self, eval_df, output_dir, multi_label=True, prefix="", **kwargs): - return super().evaluate( - eval_df, output_dir, multi_label=multi_label, prefix=prefix, **kwargs - ) - - def load_and_cache_examples( - self, examples, evaluate=False, no_cache=False, multi_label=True - ): - return super().load_and_cache_examples( - examples, evaluate=evaluate, no_cache=no_cache, multi_label=multi_label - ) + def train_model(self, train_df, multi_label=True, eval_df=None, output_dir=None, show_running_loss=True, args=None, **kwargs): + return super().train_model(train_df, multi_label=multi_label, eval_df=eval_df, output_dir=output_dir, show_running_loss=show_running_loss, args=args) + + def eval_model(self, eval_df, multi_label=True, output_dir=None, verbose=False, **kwargs): + return super().eval_model(eval_df, output_dir=output_dir, multi_label=multi_label, verbose=verbose, **kwargs) + + def evaluate(self, eval_df, output_dir, multi_label=True, prefix='', **kwargs): + return super().evaluate(eval_df, output_dir, multi_label=multi_label, prefix=prefix, **kwargs) + + def load_and_cache_examples(self, examples, evaluate=False, no_cache=False, multi_label=True): + return super().load_and_cache_examples(examples, evaluate=evaluate, no_cache=no_cache, multi_label=multi_label) def compute_metrics(self, preds, labels, eval_examples, multi_label=True, **kwargs): - return super().compute_metrics( - preds, labels, eval_examples, multi_label=multi_label, **kwargs - ) + return super().compute_metrics(preds, labels, eval_examples, multi_label=multi_label, **kwargs) def predict(self, to_predict, multi_label=True): - return super().predict(to_predict, multi_label=multi_label) + return super().predict(to_predict, multi_label=multi_label) \ No newline at end of file diff --git a/simpletransformers/config/global_args.py b/simpletransformers/config/global_args.py index fe9999c6..77e85098 100644 --- 
a/simpletransformers/config/global_args.py +++ b/simpletransformers/config/global_args.py @@ -2,36 +2,41 @@ global_args = { - "output_dir": "outputs/", - "cache_dir": "cache_dir/", - "fp16": True, - "fp16_opt_level": "O1", - "max_seq_length": 128, - "train_batch_size": 8, - "gradient_accumulation_steps": 1, - "eval_batch_size": 8, - "num_train_epochs": 1, - "weight_decay": 0, - "learning_rate": 4e-5, - "adam_epsilon": 1e-8, - "warmup_ratio": 0.06, - "warmup_steps": 0, - "max_grad_norm": 1.0, - "do_lower_case": False, - "logging_steps": 50, - "save_steps": 2000, - "save_model_every_epoch": True, - "evaluate_during_training": False, - "evaluate_during_training_steps": 2000, - "use_cached_eval_features": True, - "save_eval_checkpoints": True, - "tensorboard_dir": None, - "overwrite_output_dir": False, - "reprocess_input_data": False, - "process_count": cpu_count() - 2 if cpu_count() > 2 else 1, - "n_gpu": 1, - "use_multiprocessing": True, - "silent": False, - "wandb_project": None, - "wandb_kwargs": {}, -} + 'output_dir': 'outputs/', + 'cache_dir': 'cache_dir/', + + 'fp16': True, + 'fp16_opt_level': 'O1', + 'max_seq_length': 128, + 'train_batch_size': 8, + 'gradient_accumulation_steps': 1, + 'eval_batch_size': 8, + 'num_train_epochs': 1, + 'weight_decay': 0, + 'learning_rate': 4e-5, + 'adam_epsilon': 1e-8, + 'warmup_ratio': 0.06, + 'warmup_steps': 0, + 'max_grad_norm': 1.0, + 'do_lower_case': False, + + 'logging_steps': 50, + 'save_steps': 2000, + 'save_model_every_epoch': True, + 'evaluate_during_training': False, + 'evaluate_during_training_steps': 2000, + 'use_cached_eval_features': True, + 'save_eval_checkpoints': True, + 'tensorboard_dir': None, + + 'overwrite_output_dir': False, + 'reprocess_input_data': False, + + 'process_count': cpu_count() - 2 if cpu_count() > 2 else 1, + 'n_gpu': 1, + 'use_multiprocessing': True, + 'silent': False, + + 'wandb_project': None, + 'wandb_kwargs': {}, +} \ No newline at end of file diff --git a/simpletransformers/ner/__init__.py b/simpletransformers/ner/__init__.py index bae54d6e..c597ea4c 100755 --- a/simpletransformers/ner/__init__.py +++ b/simpletransformers/ner/__init__.py @@ -1 +1 @@ -from simpletransformers.ner.ner_model import NERModel +from simpletransformers.ner.ner_model import NERModel \ No newline at end of file diff --git a/simpletransformers/ner/ner_model.py b/simpletransformers/ner/ner_model.py index 586b1ec8..e510373c 100755 --- a/simpletransformers/ner/ner_model.py +++ b/simpletransformers/ner/ner_model.py @@ -17,54 +17,28 @@ from tqdm.auto import trange, tqdm from torch.nn import CrossEntropyLoss -from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset +from torch.utils.data import ( + DataLoader, + RandomSampler, + SequentialSampler, + TensorDataset +) from transformers import AdamW, get_linear_schedule_with_warmup -from transformers import ( - WEIGHTS_NAME, - BertConfig, - BertForTokenClassification, - BertTokenizer, -) -from transformers import ( - DistilBertConfig, - DistilBertForTokenClassification, - DistilBertTokenizer, -) +from transformers import WEIGHTS_NAME, BertConfig, BertForTokenClassification, BertTokenizer +from transformers import DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer from transformers import RobertaConfig, RobertaForTokenClassification, RobertaTokenizer -from transformers import ( - XLMRobertaConfig, - XLMRobertaForTokenClassification, - XLMRobertaTokenizer, -) +from transformers import XLMRobertaConfig, XLMRobertaForTokenClassification, 
XLMRobertaTokenizer -from simpletransformers.ner.ner_utils import ( - InputExample, - convert_examples_to_features, - get_labels, - read_examples_from_file, - get_examples_from_df, -) -from transformers import ( - CamembertConfig, - CamembertForTokenClassification, - CamembertTokenizer, -) +from simpletransformers.ner.ner_utils import InputExample, convert_examples_to_features, get_labels, read_examples_from_file, get_examples_from_df +from transformers import CamembertConfig, CamembertForTokenClassification, CamembertTokenizer from simpletransformers.config.global_args import global_args import wandb class NERModel: - def __init__( - self, - model_type, - model_name, - labels=None, - args=None, - use_cuda=True, - cuda_device=-1, - ): + def __init__(self, model_type, model_name, labels=None, args=None, use_cuda=True, cuda_device=-1): """ Initializes a NERModel @@ -75,42 +49,20 @@ def __init__( args (optional): Default args will be used if this parameter is not provided. If provided, it should be a dict containing the args that should be changed in the default args. use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only. cuda_device (optional): Specific GPU that should be used. Will use the first available GPU by default. - """ # noqa: ignore flake8 + """ if labels: self.labels = labels else: - self.labels = [ - "O", - "B-MISC", - "I-MISC", - "B-PER", - "I-PER", - "B-ORG", - "I-ORG", - "B-LOC", - "I-LOC", - ] + self.labels = ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"] self.num_labels = len(self.labels) MODEL_CLASSES = { - "bert": (BertConfig, BertForTokenClassification, BertTokenizer), - "roberta": (RobertaConfig, RobertaForTokenClassification, RobertaTokenizer), - "distilbert": ( - DistilBertConfig, - DistilBertForTokenClassification, - DistilBertTokenizer, - ), - "camembert": ( - CamembertConfig, - CamembertForTokenClassification, - CamembertTokenizer, - ), - "xlmroberta": ( - XLMRobertaConfig, - XLMRobertaForTokenClassification, - XLMRobertaTokenizer, - ), + 'bert': (BertConfig, BertForTokenClassification, BertTokenizer), + 'roberta': (RobertaConfig, RobertaForTokenClassification, RobertaTokenizer), + 'distilbert': (DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer), + 'camembert': (CamembertConfig, CamembertForTokenClassification, CamembertTokenizer), + 'xlmroberta': (XLMRobertaConfig, XLMRobertaForTokenClassification, XLMRobertaTokenizer), } config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type] @@ -124,10 +76,7 @@ def __init__( else: self.device = torch.device(f"cuda:{cuda_device}") else: - raise ValueError( - "'use_cuda' set to True when cuda is unavailable." - " Make sure CUDA is available or set use_cuda=False." - ) + raise ValueError("'use_cuda' set to True when cuda is unavailable. 
Make sure CUDA is available or set use_cuda=False.") else: self.device = "cpu" @@ -138,35 +87,23 @@ def __init__( self.args.update(global_args) if not use_cuda: - self.args["fp16"] = False + self.args['fp16'] = False if args: self.args.update(args) - self.tokenizer = tokenizer_class.from_pretrained( - model_name, do_lower_case=self.args["do_lower_case"] - ) + self.tokenizer = tokenizer_class.from_pretrained(model_name, do_lower_case=self.args['do_lower_case']) - self.args["model_name"] = model_name - self.args["model_type"] = model_type + self.args['model_name'] = model_name + self.args['model_type'] = model_type self.pad_token_label_id = CrossEntropyLoss().ignore_index - if model_type == "camembert": - warnings.warn( - "use_multiprocessing automatically disabled as CamemBERT fails" - " when using multiprocessing for feature conversion." - ) - self.args["use_multiprocessing"] = False - - def train_model( - self, - train_data, - output_dir=None, - show_running_loss=True, - args=None, - eval_df=None, - ): + if model_type == 'camembert': + warnings.warn("use_multiprocessing automatically disabled as CamemBERT fails when using multiprocessing for feature conversion.") + self.args['use_multiprocessing'] = False + + def train_model(self, train_data, output_dir=None, show_running_loss=True, args=None, eval_df=None): """ Trains the model using 'train_data' @@ -182,33 +119,22 @@ def train_model( Returns: None - """ # noqa: ignore flake8 + """ if args: self.args.update(args) - if self.args["silent"]: + if self.args['silent']: show_running_loss = False - if self.args["evaluate_during_training"] and eval_df is None: - raise ValueError( - "evaluate_during_training is enabled but eval_df is not specified." - "Pass eval_df to model.train_model() " - "if using evaluate_during_training." - ) + if self.args['evaluate_during_training'] and eval_df is None: + raise ValueError("evaluate_during_training is enabled but eval_df is not specified. Pass eval_df to model.train_model() if using evaluate_during_training.") if not output_dir: - output_dir = self.args["output_dir"] + output_dir = self.args['output_dir'] - if ( - os.path.exists(output_dir) - and os.listdir(output_dir) - and not self.args["overwrite_output_dir"] - ): - raise ValueError( - "Output directory ({}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome.".format(output_dir) - ) + if os.path.exists(output_dir) and os.listdir(output_dir) and not self.args["overwrite_output_dir"]: + raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(output_dir)) self._move_model_to_device() @@ -217,96 +143,54 @@ def train_model( if not os.path.exists(output_dir): os.makedirs(output_dir) - global_step, tr_loss = self.train( - train_dataset, - output_dir, - show_running_loss=show_running_loss, - eval_df=eval_df, - ) + global_step, tr_loss = self.train(train_dataset, output_dir, show_running_loss=show_running_loss, eval_df=eval_df) - model_to_save = ( - self.model.module if hasattr(self.model, "module") else self.model - ) + model_to_save = self.model.module if hasattr(self.model, "module") else self.model model_to_save.save_pretrained(output_dir) self.tokenizer.save_pretrained(output_dir) torch.save(self.args, os.path.join(output_dir, "training_args.bin")) - print( - "Training of {} model complete. Saved to {}.".format( - self.args["model_type"], output_dir - ) - ) + print("Training of {} model complete. 
Saved to {}.".format(self.args["model_type"], output_dir)) def train(self, train_dataset, output_dir, show_running_loss=True, eval_df=None): """ Trains the model on train_dataset. - Utility function to be used by the train_model() method. - Not intended to be used directly. + Utility function to be used by the train_model() method. Not intended to be used directly. """ + tokenizer = self.tokenizer device = self.device model = self.model args = self.args tb_writer = SummaryWriter(logdir=args["tensorboard_dir"]) train_sampler = RandomSampler(train_dataset) - train_dataloader = DataLoader( - train_dataset, sampler=train_sampler, batch_size=args["train_batch_size"] - ) + train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args["train_batch_size"]) - t_total = ( - len(train_dataloader) - // args["gradient_accumulation_steps"] - * args["num_train_epochs"] - ) + t_total = len(train_dataloader) // args["gradient_accumulation_steps"] * args["num_train_epochs"] no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ - { - "params": [ - p - for n, p in model.named_parameters() - if not any(nd in n for nd in no_decay) - ], - "weight_decay": args["weight_decay"], - }, - { - "params": [ - p - for n, p in model.named_parameters() - if any(nd in n for nd in no_decay) - ], - "weight_decay": 0.0, - }, + {"params": [p for n, p in model.named_parameters() if not any( + nd in n for nd in no_decay)], "weight_decay": args["weight_decay"]}, + {"params": [p for n, p in model.named_parameters() if any( + nd in n for nd in no_decay)], "weight_decay": 0.0} ] warmup_steps = math.ceil(t_total * args["warmup_ratio"]) - args["warmup_steps"] = ( - warmup_steps if args["warmup_steps"] == 0 else args["warmup_steps"] - ) - - optimizer = AdamW( - optimizer_grouped_parameters, - lr=args["learning_rate"], - eps=args["adam_epsilon"], - ) - scheduler = get_linear_schedule_with_warmup( - optimizer, num_warmup_steps=args["warmup_steps"], num_training_steps=t_total - ) + args["warmup_steps"] = warmup_steps if args["warmup_steps"] == 0 else args["warmup_steps"] + + optimizer = AdamW(optimizer_grouped_parameters, lr=args["learning_rate"], eps=args["adam_epsilon"]) + scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args["warmup_steps"], num_training_steps=t_total) if args["fp16"]: try: from apex import amp except ImportError: - raise ImportError( - "Please install apex from https://www.github.com/nvidia/apex" - " to use fp16 training." 
- ) + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") - model, optimizer = amp.initialize( - model, optimizer, opt_level=args["fp16_opt_level"] - ) + model, optimizer = amp.initialize(model, optimizer, opt_level=args["fp16_opt_level"]) if args["n_gpu"] > 1: model = torch.nn.DataParallel(model) @@ -314,49 +198,41 @@ def train(self, train_dataset, output_dir, show_running_loss=True, eval_df=None) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() - train_iterator = trange( - int(args["num_train_epochs"]), desc="Epoch", disable=args["silent"] - ) + train_iterator = trange(int(args["num_train_epochs"]), desc="Epoch", disable=args['silent']) epoch_number = 0 - if args["evaluate_during_training"]: + if args['evaluate_during_training']: training_progress_scores = { - "global_step": [], - "precision": [], - "recall": [], - "f1_score": [], - "train_loss": [], - "eval_loss": [], + 'global_step': [], + 'precision': [], + 'recall': [], + 'f1_score': [], + 'train_loss': [], + 'eval_loss': [], } - if args["wandb_project"]: - wandb.init(project=args["wandb_project"], config={**args}) + if args['wandb_project']: + wandb.init(project=args['wandb_project'], config={**args}) wandb.watch(self.model) model.train() for _ in train_iterator: # epoch_iterator = tqdm(train_dataloader, desc="Iteration") - for step, batch in enumerate( - tqdm(train_dataloader, desc="Current iteration", disable=args["silent"]) - ): + for step, batch in enumerate(tqdm(train_dataloader, desc="Current iteration", disable=args['silent'])): batch = tuple(t.to(device) for t in batch) - inputs = { - "input_ids": batch[0], - "attention_mask": batch[1], - "labels": batch[3], - } + inputs = {"input_ids": batch[0], + "attention_mask": batch[1], + "labels": batch[3]} # XLM and RoBERTa don"t use segment_ids - if args["model_type"] in ["bert", "xlnet"]: + if args['model_type'] in ["bert", "xlnet"]: inputs["token_type_ids"] = batch[2] outputs = model(**inputs) # model outputs are always tuple in pytorch-transformers (see doc) loss = outputs[0] - if args["n_gpu"] > 1: - loss = ( - loss.mean() - ) # mean() to average on multi-gpu parallel training + if args['n_gpu'] > 1: + loss = loss.mean() # mean() to average on multi-gpu parallel training current_loss = loss.item() @@ -369,14 +245,10 @@ def train(self, train_dataset, output_dir, show_running_loss=True, eval_df=None) if args["fp16"]: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() - torch.nn.utils.clip_grad_norm_( - amp.master_params(optimizer), args["max_grad_norm"] - ) + torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args["max_grad_norm"]) else: loss.backward() - torch.nn.utils.clip_grad_norm_( - model.parameters(), args["max_grad_norm"] - ) + torch.nn.utils.clip_grad_norm_(model.parameters(), args["max_grad_norm"]) tr_loss += loss.item() if (step + 1) % args["gradient_accumulation_steps"] == 0: @@ -385,108 +257,69 @@ def train(self, train_dataset, output_dir, show_running_loss=True, eval_df=None) model.zero_grad() global_step += 1 - if ( - args["logging_steps"] > 0 - and global_step % args["logging_steps"] == 0 - ): + if args["logging_steps"] > 0 and global_step % args["logging_steps"] == 0: # Log metrics tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) - tb_writer.add_scalar( - "loss", - (tr_loss - logging_loss) / args["logging_steps"], - global_step, - ) + tb_writer.add_scalar("loss", (tr_loss - logging_loss)/args["logging_steps"], global_step) logging_loss = tr_loss - 
if args["wandb_project"]: - wandb.log( - { - "Training loss": current_loss, - "lr": scheduler.get_lr()[0], - "global_step": global_step, - } - ) + if args['wandb_project']: + wandb.log({'Training loss': current_loss, 'lr': scheduler.get_lr()[0], 'global_step': global_step}) if args["save_steps"] > 0 and global_step % args["save_steps"] == 0: # Save model checkpoint - output_dir_current = os.path.join( - output_dir, "checkpoint-{}".format(global_step) - ) + output_dir_current = os.path.join(output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir_current): os.makedirs(output_dir_current) # Take care of distributed/parallel training - model_to_save = ( - model.module if hasattr(model, "module") else model - ) + model_to_save = model.module if hasattr(model, "module") else model model_to_save.save_pretrained(output_dir_current) self.tokenizer.save_pretrained(output_dir_current) - if args["evaluate_during_training"] and ( - args["evaluate_during_training_steps"] > 0 - and global_step % args["evaluate_during_training_steps"] == 0 - ): - # Only evaluate when single GPU otherwise metrics may not - # average well + if args['evaluate_during_training'] and (args["evaluate_during_training_steps"] > 0 and global_step % args["evaluate_during_training_steps"] == 0): + # Only evaluate when single GPU otherwise metrics may not average well results, _, _ = self.eval_model(eval_df, verbose=True) for key, value in results.items(): - tb_writer.add_scalar( - "eval_{}".format(key), value, global_step - ) + tb_writer.add_scalar('eval_{}'.format(key), value, global_step) - output_dir_current = os.path.join( - output_dir, "checkpoint-{}".format(global_step) - ) + output_dir_current = os.path.join(output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir_current): os.makedirs(output_dir_current) - if args["save_eval_checkpoints"]: - model_to_save = ( - model.module if hasattr(model, "module") else model - ) + if args['save_eval_checkpoints']: + model_to_save = model.module if hasattr(model, "module") else model model_to_save.save_pretrained(output_dir_current) self.tokenizer.save_pretrained(output_dir_current) - output_eval_file = os.path.join( - output_dir_current, "eval_results.txt" - ) + output_eval_file = os.path.join(output_dir_current, "eval_results.txt") with open(output_eval_file, "w") as writer: for key in sorted(results.keys()): writer.write("{} = {}\n".format(key, str(results[key]))) - training_progress_scores["global_step"].append(global_step) - training_progress_scores["train_loss"].append(current_loss) + training_progress_scores['global_step'].append(global_step) + training_progress_scores['train_loss'].append(current_loss) for key in results: training_progress_scores[key].append(results[key]) report = pd.DataFrame(training_progress_scores) - report.to_csv( - args["output_dir"] + "training_progress_scores.csv", - index=False, - ) + report.to_csv(args['output_dir'] + 'training_progress_scores.csv', index=False) - if args["wandb_project"]: + if args['wandb_project']: wandb.log(self._get_last_metrics(training_progress_scores)) epoch_number += 1 - output_dir_current = os.path.join( - output_dir, "checkpoint-{}-epoch-{}".format(global_step, epoch_number) - ) + output_dir_current = os.path.join(output_dir, "checkpoint-{}-epoch-{}".format(global_step, epoch_number)) - if ( - args["save_model_every_epoch"] or args["evaluate_during_training"] - ) and not os.path.exists(output_dir_current): + if (args['save_model_every_epoch'] or args['evaluate_during_training']) 
and not os.path.exists(output_dir_current): os.makedirs(output_dir_current) - if ( - args["save_model_every_epoch"] - and epoch_number != args["num_train_epochs"] - ): + if args['save_model_every_epoch'] and epoch_number != args['num_train_epochs']: model_to_save = model.module if hasattr(model, "module") else model model_to_save.save_pretrained(output_dir_current) self.tokenizer.save_pretrained(output_dir_current) - if args["evaluate_during_training"]: + if args['evaluate_during_training']: results, _, _ = self.eval_model(eval_df, verbose=True) output_eval_file = os.path.join(output_dir_current, "eval_results.txt") @@ -513,7 +346,7 @@ def eval_model(self, eval_data, output_dir=None, verbose=True): result: Dictionary containing evaluation results. (eval_loss, precision, recall, f1_score) model_outputs: List of raw model outputs preds_list: List of predicted tags - """ # noqa: ignore flake8 + """ if not output_dir: output_dir = self.args["output_dir"] @@ -533,10 +366,10 @@ def evaluate(self, eval_dataset, output_dir): """ Evaluates the model on eval_dataset. - Utility function to be used by the eval_model() method. - Not intended to be used directly. + Utility function to be used by the eval_model() method. Not intended to be used directly. """ + tokenizer = self.tokenizer device = self.device model = self.model args = self.args @@ -546,9 +379,7 @@ def evaluate(self, eval_dataset, output_dir): results = {} eval_sampler = SequentialSampler(eval_dataset) - eval_dataloader = DataLoader( - eval_dataset, sampler=eval_sampler, batch_size=args["eval_batch_size"] - ) + eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args["eval_batch_size"]) eval_loss = 0.0 nb_eval_steps = 0 @@ -556,17 +387,15 @@ def evaluate(self, eval_dataset, output_dir): out_label_ids = None model.eval() - for batch in tqdm(eval_dataloader, disable=args["silent"]): + for batch in tqdm(eval_dataloader, disable=args['silent']): batch = tuple(t.to(device) for t in batch) with torch.no_grad(): - inputs = { - "input_ids": batch[0], - "attention_mask": batch[1], - "labels": batch[3], - } + inputs = {"input_ids": batch[0], + "attention_mask": batch[1], + "labels": batch[3]} # XLM and RoBERTa don"t use segment_ids - if args["model_type"] in ["bert", "xlnet"]: + if args['model_type'] in ["bert", "xlnet"]: inputs["token_type_ids"] = batch[2] outputs = model(**inputs) tmp_eval_loss, logits = outputs[:2] @@ -580,9 +409,7 @@ def evaluate(self, eval_dataset, output_dir): out_label_ids = inputs["labels"].detach().cpu().numpy() else: preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) - out_label_ids = np.append( - out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0 - ) + out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps model_outputs = preds @@ -603,7 +430,7 @@ def evaluate(self, eval_dataset, output_dir): "eval_loss": eval_loss, "precision": precision_score(out_label_list, preds_list), "recall": recall_score(out_label_list, preds_list), - "f1_score": f1_score(out_label_list, preds_list), + "f1_score": f1_score(out_label_list, preds_list) } results.update(result) @@ -620,16 +447,14 @@ def predict(self, to_predict): Performs predictions on a list of text. Args: - to_predict: A python list of text (str) to be sent to the model - for prediction. + to_predict: A python list of text (str) to be sent to the model for prediction. Returns: - preds: A Python list of lists with dicts containg each word - mapped to its NER tag. 
- model_outputs: A python list of the raw model outputs for - each text. + preds: A Python list of lists with dicts containg each word mapped to its NER tag. + model_outputs: A python list of the raw model outputs for each text. """ + tokenizer = self.tokenizer device = self.device model = self.model args = self.args @@ -637,17 +462,12 @@ def predict(self, to_predict): self._move_model_to_device() - predict_examples = [ - InputExample(i, sentence.split(), ["O" for word in sentence.split()]) - for i, sentence in enumerate(to_predict) - ] + predict_examples = [InputExample(i, sentence.split(), ["O" for word in sentence.split()]) for i, sentence in enumerate(to_predict)] eval_dataset = self.load_and_cache_examples(None, to_predict=predict_examples) eval_sampler = SequentialSampler(eval_dataset) - eval_dataloader = DataLoader( - eval_dataset, sampler=eval_sampler, batch_size=args["eval_batch_size"] - ) + eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args["eval_batch_size"]) eval_loss = 0.0 nb_eval_steps = 0 @@ -655,17 +475,15 @@ def predict(self, to_predict): out_label_ids = None model.eval() - for batch in tqdm(eval_dataloader, disable=args["silent"]): + for batch in tqdm(eval_dataloader, disable=args['silent']): batch = tuple(t.to(device) for t in batch) with torch.no_grad(): - inputs = { - "input_ids": batch[0], - "attention_mask": batch[1], - "labels": batch[3], - } + inputs = {"input_ids": batch[0], + "attention_mask": batch[1], + "labels": batch[3]} # XLM and RoBERTa don"t use segment_ids - if args["model_type"] in ["bert", "xlnet"]: + if args['model_type'] in ["bert", "xlnet"]: inputs["token_type_ids"] = batch[2] outputs = model(**inputs) tmp_eval_loss, logits = outputs[:2] @@ -679,9 +497,7 @@ def predict(self, to_predict): out_label_ids = inputs["labels"].detach().cpu().numpy() else: preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) - out_label_ids = np.append( - out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0 - ) + out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps model_outputs = preds @@ -698,19 +514,11 @@ def predict(self, to_predict): out_label_list[i].append(label_map[out_label_ids[i][j]]) preds_list[i].append(label_map[preds[i][j]]) - preds = [ - [ - {word: preds_list[i][j]} - for j, word in enumerate(sentence.split()[: len(preds_list[i])]) - ] - for i, sentence in enumerate(to_predict) - ] + preds = [[{word: preds_list[i][j]} for j, word in enumerate(sentence.split()[:len(preds_list[i])])] for i, sentence in enumerate(to_predict)] return preds, model_outputs - def load_and_cache_examples( - self, data, evaluate=False, no_cache=False, to_predict=None - ): + def load_and_cache_examples(self, data, evaluate=False, no_cache=False, to_predict=None): """ Reads data_file and generates a TensorDataset containing InputFeatures. Caches the InputFeatures. Utility function for train() and eval() methods. Not intended to be used directly. @@ -721,11 +529,12 @@ def load_and_cache_examples( evaluate (optional): Indicates whether the examples are for evaluation or for training. no_cache (optional): Force feature conversion and prevent caching. I.e. Ignore cached features even if present. 
- """ # noqa: ignore flake8 + """ process_count = self.args["process_count"] tokenizer = self.tokenizer + output_mode = "classification" args = self.args mode = "dev" if evaluate else "train" @@ -739,24 +548,12 @@ def load_and_cache_examples( examples = to_predict no_cache = True - cached_features_file = os.path.join( - args["cache_dir"], - "cached_{}_{}_{}_{}_{}".format( - mode, - args["model_type"], - args["max_seq_length"], - self.num_labels, - len(examples), - ), - ) + cached_features_file = os.path.join(args["cache_dir"], "cached_{}_{}_{}_{}_{}".format(mode, args["model_type"], args["max_seq_length"], self.num_labels, len(examples))) if not os.path.isdir(self.args["cache_dir"]): os.mkdir(self.args["cache_dir"]) - if os.path.exists(cached_features_file) and ( - (not args["reprocess_input_data"] and not no_cache) - or (mode == "dev" and args["use_cached_eval_features"]) - ): + if os.path.exists(cached_features_file) and ((not args["reprocess_input_data"] and not no_cache) or (mode == "dev" and args['use_cached_eval_features'])): features = torch.load(cached_features_file) print(f"Features loaded from cache at {cached_features_file}") else: @@ -764,15 +561,14 @@ def load_and_cache_examples( features = convert_examples_to_features( examples, self.labels, - self.args["max_seq_length"], + self.args['max_seq_length'], self.tokenizer, # XLNet has a CLS token at the end cls_token_at_end=bool(args["model_type"] in ["xlnet"]), cls_token=tokenizer.cls_token, cls_token_segment_id=2 if args["model_type"] in ["xlnet"] else 0, sep_token=tokenizer.sep_token, - # RoBERTa uses an extra separator b/w pairs of sentences, - # cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 # noqa: ignore flake8 + # RoBERTa uses an extra separator b/w pairs of sentences, cf. 
github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 sep_token_extra=bool(args["model_type"] in ["roberta"]), # PAD on the left for XLNet pad_on_left=bool(args["model_type"] in ["xlnet"]), @@ -780,25 +576,19 @@ def load_and_cache_examples( pad_token_segment_id=4 if args["model_type"] in ["xlnet"] else 0, pad_token_label_id=self.pad_token_label_id, process_count=process_count, - silent=args["silent"], - use_multiprocessing=args["use_multiprocessing"], + silent=args['silent'], + use_multiprocessing=args['use_multiprocessing'] ) if not no_cache: torch.save(features, cached_features_file) all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) - all_input_mask = torch.tensor( - [f.input_mask for f in features], dtype=torch.long - ) - all_segment_ids = torch.tensor( - [f.segment_ids for f in features], dtype=torch.long - ) + all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) + all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long) all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long) - dataset = TensorDataset( - all_input_ids, all_input_mask, all_segment_ids, all_label_ids - ) + dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) return dataset diff --git a/simpletransformers/ner/ner_utils.py b/simpletransformers/ner/ner_utils.py index 1e3dd2c9..99845411 100755 --- a/simpletransformers/ner/ner_utils.py +++ b/simpletransformers/ner/ner_utils.py @@ -17,9 +17,12 @@ from __future__ import absolute_import, division, print_function +import logging +import os from io import open from multiprocessing import Pool, cpu_count from tqdm.auto import tqdm +import pandas as pd class InputExample(object): @@ -30,9 +33,8 @@ def __init__(self, guid, words, labels): Args: guid: Unique id for the example. words: list. The words of the sequence. - labels: (Optional) list. The labels for each word of the sequence. - This should be specified for train and dev examples, but not for - test examples. + labels: (Optional) list. The labels for each word of the sequence. This should be + specified for train and dev examples, but not for test examples. 
""" self.guid = guid self.words = words @@ -59,13 +61,9 @@ def read_examples_from_file(data_file, mode): for line in f: if line.startswith("-DOCSTART-") or line == "" or line == "\n": if words: - examples.append( - InputExample( - guid="{}-{}".format(mode, guid_index), - words=words, - labels=labels, - ) - ) + examples.append(InputExample(guid="{}-{}".format(mode, guid_index), + words=words, + labels=labels)) guid_index += 1 words = [] labels = [] @@ -78,60 +76,32 @@ def read_examples_from_file(data_file, mode): # Examples could have no label for mode = "test" labels.append("O") if words: - examples.append( - InputExample( - guid="%s-%d".format(mode, guid_index), words=words, labels=labels - ) - ) + examples.append(InputExample(guid="%s-%d".format(mode, guid_index), + words=words, + labels=labels)) return examples def get_examples_from_df(data): - return [ - InputExample( - guid=sentence_id, - words=sentence_df["words"].tolist(), - labels=sentence_df["labels"].tolist(), - ) - for sentence_id, sentence_df in data.groupby(["sentence_id"]) - ] + return [InputExample(guid=sentence_id, words=sentence_df['words'].tolist(), labels=sentence_df['labels'].tolist()) for sentence_id, sentence_df in data.groupby(['sentence_id'])] def convert_example_to_feature(example_row): - ( - example, - label_map, - max_seq_length, - tokenizer, - cls_token_at_end, - cls_token, - cls_token_segment_id, - sep_token, - sep_token_extra, - pad_on_left, - pad_token, - pad_token_segment_id, - pad_token_label_id, - sequence_a_segment_id, - mask_padding_with_zero, - ) = example_row + example, label_map, max_seq_length, tokenizer, cls_token_at_end, cls_token, cls_token_segment_id, sep_token, sep_token_extra, pad_on_left, pad_token, pad_token_segment_id, pad_token_label_id, sequence_a_segment_id, mask_padding_with_zero = example_row tokens = [] label_ids = [] for word, label in zip(example.words, example.labels): word_tokens = tokenizer.tokenize(word) tokens.extend(word_tokens) - # Use the real label id for the first token of the word, and padding - # ids for the remaining tokens - label_ids.extend( - [label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1) - ) + # Use the real label id for the first token of the word, and padding ids for the remaining tokens + label_ids.extend([label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1)) # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa. 
special_tokens_count = 3 if sep_token_extra else 2 if len(tokens) > max_seq_length - special_tokens_count: - tokens = tokens[: (max_seq_length - special_tokens_count)] - label_ids = label_ids[: (max_seq_length - special_tokens_count)] + tokens = tokens[:(max_seq_length - special_tokens_count)] + label_ids = label_ids[:(max_seq_length - special_tokens_count)] # The convention in BERT is: # (a) For sequence pairs: @@ -178,16 +148,14 @@ def convert_example_to_feature(example_row): padding_length = max_seq_length - len(input_ids) if pad_on_left: input_ids = ([pad_token] * padding_length) + input_ids - input_mask = ( - [0 if mask_padding_with_zero else 1] * padding_length - ) + input_mask + input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids label_ids = ([pad_token_label_id] * padding_length) + label_ids else: - input_ids += [pad_token] * padding_length - input_mask += [0 if mask_padding_with_zero else 1] * padding_length - segment_ids += [pad_token_segment_id] * padding_length - label_ids += [pad_token_label_id] * padding_length + input_ids += ([pad_token] * padding_length) + input_mask += ([0 if mask_padding_with_zero else 1] * padding_length) + segment_ids += ([pad_token_segment_id] * padding_length) + label_ids += ([pad_token_label_id] * padding_length) assert len(input_ids) == max_seq_length assert len(input_mask) == max_seq_length @@ -198,70 +166,59 @@ def convert_example_to_feature(example_row): input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, - label_ids=label_ids, + label_ids=label_ids ) - def convert_examples_to_features( - examples, - label_list, - max_seq_length, - tokenizer, - cls_token_at_end=False, - cls_token="[CLS]", - cls_token_segment_id=1, - sep_token="[SEP]", - sep_token_extra=False, - pad_on_left=False, - pad_token=0, - pad_token_segment_id=0, - pad_token_label_id=-1, - sequence_a_segment_id=0, - mask_padding_with_zero=True, - process_count=cpu_count() - 2, - chunksize=500, - silent=False, - use_multiprocessing=True, -): + examples, + label_list, + max_seq_length, + tokenizer, + cls_token_at_end=False, + cls_token="[CLS]", + cls_token_segment_id=1, + sep_token="[SEP]", + sep_token_extra=False, + pad_on_left=False, + pad_token=0, + pad_token_segment_id=0, + pad_token_label_id=-1, + sequence_a_segment_id=0, + mask_padding_with_zero=True, + process_count=cpu_count() - 2, + chunksize=500, + silent=False, + use_multiprocessing=True + ): """ Loads a data file into a list of `InputBatch`s `cls_token_at_end` define the location of the CLS token: - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP] - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS] `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet) - """ # noqa: ignore flake8 + """ label_map = {label: i for i, label in enumerate(label_list)} - examples = [ - ( - example, - label_map, - max_seq_length, - tokenizer, - cls_token_at_end, - cls_token, - cls_token_segment_id, - sep_token, - sep_token_extra, - pad_on_left, - pad_token, - pad_token_segment_id, - pad_token_label_id, - sequence_a_segment_id, - mask_padding_with_zero, - ) - for example in examples - ] + examples = [( + example, + label_map, + max_seq_length, + tokenizer, + cls_token_at_end, + cls_token, + cls_token_segment_id, + sep_token, + sep_token_extra, + pad_on_left, + pad_token, + pad_token_segment_id, + pad_token_label_id, + sequence_a_segment_id, + mask_padding_with_zero) for 
example in examples] if use_multiprocessing: with Pool(process_count) as p: - features = list( - tqdm( - p.imap(convert_example_to_feature, examples, chunksize=chunksize), - total=len(examples), - disable=silent, - ) - ) + features = list(tqdm(p.imap(convert_example_to_feature, examples, chunksize=chunksize), total=len(examples), disable=silent)) else: features = [] for example in tqdm(examples): @@ -277,14 +234,4 @@ def get_labels(path): labels = ["O"] + labels return labels else: - return [ - "O", - "B-MISC", - "I-MISC", - "B-PER", - "I-PER", - "B-ORG", - "I-ORG", - "B-LOC", - "I-LOC", - ] + return ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"] \ No newline at end of file diff --git a/simpletransformers/question_answering/__init__.py b/simpletransformers/question_answering/__init__.py index c0c73d44..a8745088 100755 --- a/simpletransformers/question_answering/__init__.py +++ b/simpletransformers/question_answering/__init__.py @@ -1,3 +1 @@ -from simpletransformers.question_answering.question_answering_model import ( - QuestionAnsweringModel, -) +from simpletransformers.question_answering.question_answering_model import QuestionAnsweringModel \ No newline at end of file diff --git a/simpletransformers/question_answering/question_answering_model.py b/simpletransformers/question_answering/question_answering_model.py index 651e4cfa..1c6f4072 100755 --- a/simpletransformers/question_answering/question_answering_model.py +++ b/simpletransformers/question_answering/question_answering_model.py @@ -12,36 +12,27 @@ import pandas as pd from scipy.stats import pearsonr -from sklearn.metrics import ( - mean_squared_error, - matthews_corrcoef, - confusion_matrix, - label_ranking_average_precision_score, -) +from sklearn.metrics import mean_squared_error, matthews_corrcoef, confusion_matrix, label_ranking_average_precision_score from tensorboardX import SummaryWriter from tqdm.auto import trange, tqdm from torch.utils.data.distributed import DistributedSampler -from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset +from torch.utils.data import ( + DataLoader, + RandomSampler, + SequentialSampler, + TensorDataset +) from transformers import AdamW, get_linear_schedule_with_warmup from transformers import ( WEIGHTS_NAME, BertConfig, - BertForQuestionAnswering, - BertTokenizer, - XLMConfig, - XLMForQuestionAnswering, - XLMTokenizer, - XLNetConfig, - XLNetForQuestionAnswering, - XLNetTokenizer, - DistilBertConfig, - DistilBertForQuestionAnswering, - DistilBertTokenizer, - AlbertConfig, - AlbertForQuestionAnswering, - AlbertTokenizer, + BertForQuestionAnswering, BertTokenizer, + XLMConfig, XLMForQuestionAnswering, XLMTokenizer, + XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer, + DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer, + AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer ) from simpletransformers.question_answering.question_answering_utils import ( @@ -54,7 +45,7 @@ to_list, build_examples, get_best_predictions, - get_best_predictions_extended, + get_best_predictions_extended ) from simpletransformers.config.global_args import global_args @@ -62,9 +53,7 @@ class QuestionAnsweringModel: - def __init__( - self, model_type, model_name, args=None, use_cuda=True, cuda_device=-1 - ): + def __init__(self, model_type, model_name, args=None, use_cuda=True, cuda_device=-1): """ Initializes a QuestionAnsweringModel model. 
@@ -74,18 +63,14 @@ def __init__( args (optional): Default args will be used if this parameter is not provided. If provided, it should be a dict containing the args that should be changed in the default args[' use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only. cuda_device (optional): Specific GPU that should be used. Will use the first available GPU by default. - """ # noqa: ignore flake8 + """ MODEL_CLASSES = { - "bert": (BertConfig, BertForQuestionAnswering, BertTokenizer), - "xlnet": (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer), - "xlm": (XLMConfig, XLMForQuestionAnswering, XLMTokenizer), - "distilbert": ( - DistilBertConfig, - DistilBertForQuestionAnswering, - DistilBertTokenizer, - ), - "albert": (AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer), + 'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer), + 'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer), + 'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer), + 'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer), + 'albert': (AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer), } config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type] @@ -98,49 +83,41 @@ def __init__( else: self.device = torch.device(f"cuda:{cuda_device}") else: - raise ValueError( - "'use_cuda' set to True when cuda is unavailable." - " Make sure CUDA is available or set use_cuda=False." - ) + raise ValueError("'use_cuda' set to True when cuda is unavailable. Make sure CUDA is available or set use_cuda=False.") else: self.device = "cpu" self.results = {} self.args = { - "doc_stride": 384, - "max_query_length": 64, - "n_best_size": 20, - "max_answer_length": 100, - "null_score_diff_threshold": 0.0, - "wandb_project": False, - "wandb_kwargs": {}, + 'doc_stride': 384, + 'max_query_length': 64, + 'n_best_size': 20, + 'max_answer_length': 100, + 'null_score_diff_threshold': 0.0, + + 'wandb_project': False, + 'wandb_kwargs': {}, } self.args.update(global_args) if not use_cuda: - self.args["fp16"] = False + self.args['fp16'] = False if args: self.args.update(args) - self.tokenizer = tokenizer_class.from_pretrained( - model_name, do_lower_case=self.args["do_lower_case"] - ) + self.tokenizer = tokenizer_class.from_pretrained(model_name, do_lower_case=self.args['do_lower_case']) - self.args["model_name"] = model_name - self.args["model_type"] = model_type + self.args['model_name'] = model_name + self.args['model_type'] = model_type - def load_and_cache_examples( - self, examples, evaluate=False, no_cache=False, output_examples=False - ): + def load_and_cache_examples(self, examples, evaluate=False, no_cache=False, output_examples=False): """ - Converts a list of examples to a TensorDataset - containing InputFeatures. Caches the InputFeatures. + Converts a list of examples to a TensorDataset containing InputFeatures. Caches the InputFeatures. - Utility function for train() and eval() methods. - Not intended to be used directly. + Utility function for train() and eval() methods. Not intended to be used directly. 
""" tokenizer = self.tokenizer @@ -152,86 +129,50 @@ def load_and_cache_examples( examples = get_examples(examples, is_training=not evaluate) mode = "dev" if evaluate else "train" - cached_features_file = os.path.join( - args["cache_dir"], - "cached_{}_{}_{}_{}".format( - mode, args["model_type"], args["max_seq_length"], len(examples) - ), - ) - - if os.path.exists(cached_features_file) and ( - (not args["reprocess_input_data"] and not no_cache) - or (mode == "dev" and args["use_cached_eval_features"]) - ): + cached_features_file = os.path.join(args["cache_dir"], "cached_{}_{}_{}_{}".format(mode, args["model_type"], args["max_seq_length"], len(examples))) + + if os.path.exists(cached_features_file) and ((not args["reprocess_input_data"] and not no_cache) or (mode == "dev" and args['use_cached_eval_features'])): features = torch.load(cached_features_file) print(f"Features loaded from cache at {cached_features_file}") else: print(f"Converting to features started.") - features = convert_examples_to_features( - examples=examples, - tokenizer=tokenizer, - max_seq_length=args["max_seq_length"], - doc_stride=args["doc_stride"], - max_query_length=args["max_query_length"], - is_training=not evaluate, - cls_token_segment_id=2 if args["model_type"] in ["xlnet"] else 0, - pad_token_segment_id=3 if args["model_type"] in ["xlnet"] else 0, - cls_token_at_end=True if args["model_type"] in ["xlnet"] else False, - sequence_a_is_doc=True if args["model_type"] in ["xlnet"] else False, - silent=args["silent"], - ) + features = convert_examples_to_features(examples=examples, + tokenizer=tokenizer, + max_seq_length=args['max_seq_length'], + doc_stride=args['doc_stride'], + max_query_length=args['max_query_length'], + is_training=not evaluate, + cls_token_segment_id=2 if args['model_type'] in ['xlnet'] else 0, + pad_token_segment_id=3 if args['model_type'] in ['xlnet'] else 0, + cls_token_at_end=True if args['model_type'] in ['xlnet'] else False, + sequence_a_is_doc=True if args['model_type'] in ['xlnet'] else False, + silent=args['silent'] + ) if not no_cache: torch.save(features, cached_features_file) all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) - all_input_mask = torch.tensor( - [f.input_mask for f in features], dtype=torch.long - ) - all_segment_ids = torch.tensor( - [f.segment_ids for f in features], dtype=torch.long - ) + all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) + all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long) all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long) all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) if evaluate: - dataset = TensorDataset( - all_input_ids, - all_input_mask, - all_segment_ids, - all_example_index, - all_cls_index, - all_p_mask, - ) + dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, + all_example_index, all_cls_index, all_p_mask) else: - all_start_positions = torch.tensor( - [f.start_position for f in features], dtype=torch.long - ) - all_end_positions = torch.tensor( - [f.end_position for f in features], dtype=torch.long - ) - dataset = TensorDataset( - all_input_ids, - all_input_mask, - all_segment_ids, - all_start_positions, - all_end_positions, - all_cls_index, - all_p_mask, - ) + all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long) + all_end_positions = torch.tensor([f.end_position 
for f in features], dtype=torch.long) + dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, + all_start_positions, all_end_positions, + all_cls_index, all_p_mask) if output_examples: return dataset, examples, features return dataset - def train_model( - self, - train_data, - output_dir=False, - show_running_loss=True, - args=None, - eval_data=None, - ): + def train_model(self, train_data, output_dir=False, show_running_loss=True, args=None, eval_data=None): """ Trains the model using 'train_data' @@ -243,38 +184,27 @@ def train_model( eval_data (optional): Path to JSON file containing evaluation data against which evaluation will be performed when evaluate_during_training is enabled. Is required if evaluate_during_training is enabled. Returns: None - """ # noqa: ignore flake8 + """ if args: self.args.update(args) - if self.args["silent"]: + if self.args['silent']: show_running_loss = False - if self.args["evaluate_during_training"] and eval_data is None: - raise ValueError( - "evaluate_during_training is enabled but eval_data is not specified. " - "Pass eval_data to model.train_model()" - "if using evaluate_during_training." - ) + if self.args['evaluate_during_training'] and eval_data is None: + raise ValueError("evaluate_during_training is enabled but eval_data is not specified. Pass eval_data to model.train_model() if using evaluate_during_training.") if not output_dir: - output_dir = self.args["output_dir"] + output_dir = self.args['output_dir'] - if ( - os.path.exists(output_dir) - and os.listdir(output_dir) - and not self.args["overwrite_output_dir"] - ): - raise ValueError( - "Output directory ({}) already exists and is not empty." - " Use --overwrite_output_dir to overcome.".format(output_dir) - ) + if os.path.exists(output_dir) and os.listdir(output_dir) and not self.args["overwrite_output_dir"]: + raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(output_dir)) self._move_model_to_device() if isinstance(train_data, str): - with open(train_data, "r") as f: + with open(train_data, 'r') as f: train_examples = json.load(f) else: train_examples = train_data @@ -284,96 +214,55 @@ def train_model( if not os.path.exists(output_dir): os.makedirs(output_dir) - global_step, tr_loss = self.train( - train_dataset, - output_dir, - show_running_loss=show_running_loss, - eval_data=eval_data, - ) + global_step, tr_loss = self.train(train_dataset, output_dir, show_running_loss=show_running_loss, eval_data=eval_data) - model_to_save = ( - self.model.module if hasattr(self.model, "module") else self.model - ) + model_to_save = self.model.module if hasattr(self.model, "module") else self.model model_to_save.save_pretrained(output_dir) self.tokenizer.save_pretrained(output_dir) torch.save(self.args, os.path.join(output_dir, "training_args.bin")) - print( - "Training of {} model complete. Saved to {}.".format( - self.args["model_type"], output_dir - ) - ) + print("Training of {} model complete. Saved to {}.".format(self.args["model_type"], output_dir)) def train(self, train_dataset, output_dir, show_running_loss=True, eval_data=None): """ Trains the model on train_dataset. - Utility function to be used by the train_model() method. - Not intended to be used directly. + Utility function to be used by the train_model() method. Not intended to be used directly. 
""" + tokenizer = self.tokenizer device = self.device model = self.model args = self.args tb_writer = SummaryWriter(logdir=args["tensorboard_dir"]) train_sampler = RandomSampler(train_dataset) - train_dataloader = DataLoader( - train_dataset, sampler=train_sampler, batch_size=args["train_batch_size"] - ) + train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args["train_batch_size"]) - t_total = ( - len(train_dataloader) - // args["gradient_accumulation_steps"] - * args["num_train_epochs"] - ) + t_total = len(train_dataloader) // args["gradient_accumulation_steps"] * args["num_train_epochs"] no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ - { - "params": [ - p - for n, p in model.named_parameters() - if not any(nd in n for nd in no_decay) - ], - "weight_decay": args["weight_decay"], - }, - { - "params": [ - p - for n, p in model.named_parameters() - if any(nd in n for nd in no_decay) - ], - "weight_decay": 0.0, - }, + {"params": [p for n, p in model.named_parameters() if not any( + nd in n for nd in no_decay)], "weight_decay": args["weight_decay"]}, + {"params": [p for n, p in model.named_parameters() if any( + nd in n for nd in no_decay)], "weight_decay": 0.0} ] warmup_steps = math.ceil(t_total * args["warmup_ratio"]) - args["warmup_steps"] = ( - warmup_steps if args["warmup_steps"] == 0 else args["warmup_steps"] - ) - - optimizer = AdamW( - optimizer_grouped_parameters, - lr=args["learning_rate"], - eps=args["adam_epsilon"], - ) - scheduler = get_linear_schedule_with_warmup( - optimizer, num_warmup_steps=args["warmup_steps"], num_training_steps=t_total - ) + args["warmup_steps"] = warmup_steps if args["warmup_steps"] == 0 else args["warmup_steps"] + + optimizer = AdamW(optimizer_grouped_parameters, lr=args["learning_rate"], eps=args["adam_epsilon"]) + scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args["warmup_steps"], num_training_steps=t_total) if args["fp16"]: try: from apex import amp except ImportError: raise ImportError( - "Please install apex from https://www.github.com/nvidia/apex" - "to use fp16 training." 
- ) + "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") - model, optimizer = amp.initialize( - model, optimizer, opt_level=args["fp16_opt_level"] - ) + model, optimizer = amp.initialize(model, optimizer, opt_level=args["fp16_opt_level"]) if args["n_gpu"] > 1: model = torch.nn.DataParallel(model) @@ -381,53 +270,45 @@ def train(self, train_dataset, output_dir, show_running_loss=True, eval_data=Non global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() - train_iterator = trange( - int(args["num_train_epochs"]), desc="Epoch", disable=args["silent"] - ) + train_iterator = trange(int(args["num_train_epochs"]), desc="Epoch", disable=args['silent']) epoch_number = 0 - if args["evaluate_during_training"]: + if args['evaluate_during_training']: training_progress_scores = { - "global_step": [], - "correct": [], - "similar": [], - "incorrect": [], - "train_loss": [], + 'global_step': [], + 'correct': [], + 'similar': [], + 'incorrect': [], + 'train_loss': [], } - if args["wandb_project"]: - wandb.init(project=args["wandb_project"], config={**args}) + if args['wandb_project']: + wandb.init(project=args['wandb_project'], config={**args}) wandb.watch(self.model) model.train() for _ in train_iterator: # epoch_iterator = tqdm(train_dataloader, desc="Iteration") - for step, batch in enumerate( - tqdm(train_dataloader, desc="Current iteration", disable=args["silent"]) - ): + for step, batch in enumerate(tqdm(train_dataloader, desc="Current iteration", disable=args['silent'])): batch = tuple(t.to(device) for t in batch) - inputs = { - "input_ids": batch[0], - "attention_mask": batch[1], - "start_positions": batch[3], - "end_positions": batch[4], - } + inputs = {'input_ids': batch[0], + 'attention_mask': batch[1], + 'start_positions': batch[3], + 'end_positions': batch[4] + } - if args["model_type"] != "distilbert": - inputs["token_type_ids"] = ( - None if args["model_type"] == "xlm" else batch[2] - ) - if args["model_type"] in ["xlnet", "xlm"]: - inputs.update({"cls_index": batch[5], "p_mask": batch[6]}) + if args['model_type'] != 'distilbert': + inputs['token_type_ids'] = None if args['model_type'] == 'xlm' else batch[2] + if args['model_type'] in ['xlnet', 'xlm']: + inputs.update({'cls_index': batch[5], + 'p_mask': batch[6]}) outputs = model(**inputs) # model outputs are always tuple in pytorch-transformers (see doc) loss = outputs[0] - if args["n_gpu"] > 1: - loss = ( - loss.mean() - ) # mean() to average on multi-gpu parallel training + if args['n_gpu'] > 1: + loss = loss.mean() # mean() to average on multi-gpu parallel training current_loss = loss.item() @@ -440,14 +321,10 @@ def train(self, train_dataset, output_dir, show_running_loss=True, eval_data=Non if args["fp16"]: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() - torch.nn.utils.clip_grad_norm_( - amp.master_params(optimizer), args["max_grad_norm"] - ) + torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args["max_grad_norm"]) else: loss.backward() - torch.nn.utils.clip_grad_norm_( - model.parameters(), args["max_grad_norm"] - ) + torch.nn.utils.clip_grad_norm_(model.parameters(), args["max_grad_norm"]) tr_loss += loss.item() if (step + 1) % args["gradient_accumulation_steps"] == 0: @@ -456,107 +333,68 @@ def train(self, train_dataset, output_dir, show_running_loss=True, eval_data=Non model.zero_grad() global_step += 1 - if ( - args["logging_steps"] > 0 - and global_step % args["logging_steps"] == 0 - ): + if args["logging_steps"] > 0 and global_step % 
args["logging_steps"] == 0: # Log metrics tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) - tb_writer.add_scalar( - "loss", - (tr_loss - logging_loss) / args["logging_steps"], - global_step, - ) + tb_writer.add_scalar("loss", (tr_loss - logging_loss)/args["logging_steps"], global_step) logging_loss = tr_loss - if args["wandb_project"]: - wandb.log( - { - "Training loss": current_loss, - "lr": scheduler.get_lr()[0], - "global_step": global_step, - } - ) + if args['wandb_project']: + wandb.log({'Training loss': current_loss, 'lr': scheduler.get_lr()[0], 'global_step': global_step}) if args["save_steps"] > 0 and global_step % args["save_steps"] == 0: # Save model checkpoint - output_dir_current = os.path.join( - output_dir, "checkpoint-{}".format(global_step) - ) + output_dir_current = os.path.join(output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir_current): os.makedirs(output_dir_current) - model_to_save = ( - model.module if hasattr(model, "module") else model - ) + model_to_save = model.module if hasattr(model, "module") else model model_to_save.save_pretrained(output_dir_current) self.tokenizer.save_pretrained(output_dir_current) - if args["evaluate_during_training"] and ( - args["evaluate_during_training_steps"] > 0 - and global_step % args["evaluate_during_training_steps"] == 0 - ): - # Only evaluate when single GPU otherwise metrics may not - # average well + if args['evaluate_during_training'] and (args["evaluate_during_training_steps"] > 0 and global_step % args["evaluate_during_training_steps"] == 0): + # Only evaluate when single GPU otherwise metrics may not average well results, _ = self.eval_model(eval_data, verbose=True) for key, value in results.items(): - tb_writer.add_scalar( - "eval_{}".format(key), value, global_step - ) + tb_writer.add_scalar('eval_{}'.format(key), value, global_step) - output_dir_current = os.path.join( - output_dir, "checkpoint-{}".format(global_step) - ) + output_dir_current = os.path.join(output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir_current): os.makedirs(output_dir_current) - if args["save_eval_checkpoints"]: - model_to_save = ( - model.module if hasattr(model, "module") else model - ) + if args['save_eval_checkpoints']: + model_to_save = model.module if hasattr(model, "module") else model model_to_save.save_pretrained(output_dir_current) self.tokenizer.save_pretrained(output_dir_current) - output_eval_file = os.path.join( - output_dir_current, "eval_results.txt" - ) + output_eval_file = os.path.join(output_dir_current, "eval_results.txt") with open(output_eval_file, "w") as writer: for key in sorted(results.keys()): writer.write("{} = {}\n".format(key, str(results[key]))) - training_progress_scores["global_step"].append(global_step) - training_progress_scores["train_loss"].append(current_loss) + training_progress_scores['global_step'].append(global_step) + training_progress_scores['train_loss'].append(current_loss) for key in results: training_progress_scores[key].append(results[key]) report = pd.DataFrame(training_progress_scores) - report.to_csv( - args["output_dir"] + "training_progress_scores.csv", - index=False, - ) + report.to_csv(args['output_dir'] + 'training_progress_scores.csv', index=False) - if args["wandb_project"]: + if args['wandb_project']: wandb.log(self._get_last_metrics(training_progress_scores)) epoch_number += 1 - output_dir_current = os.path.join( - output_dir, "checkpoint-{}-epoch-{}".format(global_step, epoch_number) - ) + output_dir_current = 
os.path.join(output_dir, "checkpoint-{}-epoch-{}".format(global_step, epoch_number)) - if ( - args["save_model_every_epoch"] or args["evaluate_during_training"] - ) and not os.path.exists(output_dir_current): + if (args['save_model_every_epoch'] or args['evaluate_during_training']) and not os.path.exists(output_dir_current): os.makedirs(output_dir_current) - if ( - args["save_model_every_epoch"] - and epoch_number != args["num_train_epochs"] - ): + if args['save_model_every_epoch'] and epoch_number != args['num_train_epochs']: model_to_save = model.module if hasattr(model, "module") else model model_to_save.save_pretrained(output_dir_current) self.tokenizer.save_pretrained(output_dir_current) - if args["evaluate_during_training"]: + if args['evaluate_during_training']: results, _ = self.eval_model(eval_data, verbose=True) output_eval_file = os.path.join(output_dir_current, "eval_results.txt") @@ -578,19 +416,17 @@ def eval_model(self, eval_data, output_dir=None, verbose=False): Returns: result: Dictionary containing evaluation results. (correct, similar, incorrect) text: A dictionary containing the 3 dictionaries correct_text, similar_text (the predicted answer is a substring of the correct answer or vise versa), incorrect_text. - """ # noqa: ignore flake8 + """ if not output_dir: output_dir = self.args["output_dir"] self._move_model_to_device() - all_predictions, all_nbest_json, scores_diff_json = self.evaluate( - eval_data, output_dir - ) + all_predictions, all_nbest_json, scores_diff_json = self.evaluate(eval_data, output_dir) if isinstance(eval_data, str): - with open(eval_data, "r") as f: + with open(eval_data, 'r') as f: truth = json.load(f) else: truth = eval_data @@ -608,125 +444,90 @@ def evaluate(self, eval_data, output_dir): """ Evaluates the model on eval_data. - Utility function to be used by the eval_model() method. - Not intended to be used directly. + Utility function to be used by the eval_model() method. Not intended to be used directly. 
""" tokenizer = self.tokenizer device = self.device model = self.model args = self.args + eval_output_dir = output_dir + + results = {} if isinstance(eval_data, str): - with open(eval_data, "r") as f: + with open(eval_data, 'r') as f: eval_examples = json.load(f) else: eval_examples = eval_data - eval_dataset, examples, features = self.load_and_cache_examples( - eval_examples, evaluate=True, output_examples=True - ) + eval_dataset, examples, features = self.load_and_cache_examples(eval_examples, evaluate=True, output_examples=True) eval_sampler = SequentialSampler(eval_dataset) - eval_dataloader = DataLoader( - eval_dataset, sampler=eval_sampler, batch_size=args["eval_batch_size"] - ) + eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args["eval_batch_size"]) + eval_loss = 0.0 + nb_eval_steps = 0 + preds = None + out_label_ids = None model.eval() all_results = [] - for batch in tqdm(eval_dataloader, disable=args["silent"]): + for batch in tqdm(eval_dataloader, disable=args['silent']): batch = tuple(t.to(device) for t in batch) with torch.no_grad(): - inputs = { - "input_ids": batch[0], - "attention_mask": batch[1], - } + inputs = {'input_ids': batch[0], + 'attention_mask': batch[1], + } - if args["model_type"] != "distilbert": - inputs["token_type_ids"] = ( - None if args["model_type"] == "xlm" else batch[2] - ) + if args['model_type'] != 'distilbert': + inputs['token_type_ids'] = None if args['model_type'] == 'xlm' else batch[2] example_indices = batch[3] - if args["model_type"] in ["xlnet", "xlm"]: - inputs.update({"cls_index": batch[4], "p_mask": batch[5]}) + if args['model_type'] in ['xlnet', 'xlm']: + inputs.update({'cls_index': batch[4], + 'p_mask': batch[5]}) outputs = model(**inputs) for i, example_index in enumerate(example_indices): eval_feature = features[example_index.item()] unique_id = int(eval_feature.unique_id) - if args["model_type"] in ["xlnet", "xlm"]: + if args['model_type'] in ['xlnet', 'xlm']: # XLNet uses a more complex post-processing procedure - result = RawResultExtended( - unique_id=unique_id, - start_top_log_probs=to_list(outputs[0][i]), - start_top_index=to_list(outputs[1][i]), - end_top_log_probs=to_list(outputs[2][i]), - end_top_index=to_list(outputs[3][i]), - cls_logits=to_list(outputs[4][i]), - ) + result = RawResultExtended(unique_id=unique_id, + start_top_log_probs=to_list(outputs[0][i]), + start_top_index=to_list(outputs[1][i]), + end_top_log_probs=to_list(outputs[2][i]), + end_top_index=to_list(outputs[3][i]), + cls_logits=to_list(outputs[4][i])) else: - result = RawResult( - unique_id=unique_id, - start_logits=to_list(outputs[0][i]), - end_logits=to_list(outputs[1][i]), - ) + result = RawResult(unique_id=unique_id, + start_logits=to_list(outputs[0][i]), + end_logits=to_list(outputs[1][i])) all_results.append(result) - prefix = "test" + prefix = 'test' if not os.path.isdir(output_dir): os.mkdir(output_dir) - output_prediction_file = os.path.join( - output_dir, "predictions_{}.json".format(prefix) - ) - output_nbest_file = os.path.join( - output_dir, "nbest_predictions_{}.json".format(prefix) - ) - output_null_log_odds_file = os.path.join( - output_dir, "null_odds_{}.json".format(prefix) - ) - - if args["model_type"] in ["xlnet", "xlm"]: + output_prediction_file = os.path.join(output_dir, "predictions_{}.json".format(prefix)) + output_nbest_file = os.path.join(output_dir, "nbest_predictions_{}.json".format(prefix)) + output_null_log_odds_file = os.path.join(output_dir, "null_odds_{}.json".format(prefix)) + + if 
args['model_type'] in ['xlnet', 'xlm']: # XLNet uses a more complex post-processing procedure - ( - all_predictions, - all_nbest_json, - scores_diff_json, - ) = write_predictions_extended( - examples, - features, - all_results, - args["n_best_size"], - args["max_answer_length"], - output_prediction_file, - output_nbest_file, - output_null_log_odds_file, - eval_data, - model.config.start_n_top, - model.config.end_n_top, - True, - tokenizer, - not args["silent"], - ) + all_predictions, all_nbest_json, scores_diff_json = write_predictions_extended(examples, features, all_results, args['n_best_size'], + args['max_answer_length'], output_prediction_file, + output_nbest_file, output_null_log_odds_file, eval_data, + model.config.start_n_top, model.config.end_n_top, + True, tokenizer, not args['silent']) else: - all_predictions, all_nbest_json, scores_diff_json = write_predictions( - examples, - features, - all_results, - args["n_best_size"], - args["max_answer_length"], - False, - output_prediction_file, - output_nbest_file, - output_null_log_odds_file, - not args["silent"], - True, - args["null_score_diff_threshold"], - ) + all_predictions, all_nbest_json, scores_diff_json = write_predictions(examples, features, all_results, args['n_best_size'], + args['max_answer_length'], False, output_prediction_file, + output_nbest_file, output_null_log_odds_file, not args['silent'], + True, args['null_score_diff_threshold']) return all_predictions, all_nbest_json, scores_diff_json @@ -749,97 +550,71 @@ def predict(self, to_predict, n_best_size=None): Returns: preds: A python list containg the predicted answer, and id for each question in to_predict. - """ # noqa: ignore flake8 + """ tokenizer = self.tokenizer device = self.device model = self.model args = self.args if not n_best_size: - n_best_size = args["n_best_size"] + n_best_size = args['n_best_size'] self._move_model_to_device() eval_examples = build_examples(to_predict) - eval_dataset, examples, features = self.load_and_cache_examples( - eval_examples, evaluate=True, output_examples=True, no_cache=True - ) + eval_dataset, examples, features = self.load_and_cache_examples(eval_examples, evaluate=True, output_examples=True, no_cache=True) eval_sampler = SequentialSampler(eval_dataset) - eval_dataloader = DataLoader( - eval_dataset, sampler=eval_sampler, batch_size=args["eval_batch_size"] - ) + eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args["eval_batch_size"]) + eval_loss = 0.0 + nb_eval_steps = 0 + preds = None + out_label_ids = None model.eval() all_results = [] - for batch in tqdm(eval_dataloader, disable=args["silent"]): + for batch in tqdm(eval_dataloader, disable=args['silent']): batch = tuple(t.to(device) for t in batch) with torch.no_grad(): - inputs = { - "input_ids": batch[0], - "attention_mask": batch[1], - } + inputs = {'input_ids': batch[0], + 'attention_mask': batch[1], + } - if args["model_type"] != "distilbert": - inputs["token_type_ids"] = ( - None if args["model_type"] == "xlm" else batch[2] - ) + if args['model_type'] != 'distilbert': + inputs['token_type_ids'] = None if args['model_type'] == 'xlm' else batch[2] example_indices = batch[3] - if args["model_type"] in ["xlnet", "xlm"]: - inputs.update({"cls_index": batch[4], "p_mask": batch[5]}) + if args['model_type'] in ['xlnet', 'xlm']: + inputs.update({'cls_index': batch[4], + 'p_mask': batch[5]}) outputs = model(**inputs) for i, example_index in enumerate(example_indices): eval_feature = features[example_index.item()] unique_id = 
int(eval_feature.unique_id) - if args["model_type"] in ["xlnet", "xlm"]: + if args['model_type'] in ['xlnet', 'xlm']: # XLNet uses a more complex post-processing procedure - result = RawResultExtended( - unique_id=unique_id, - start_top_log_probs=to_list(outputs[0][i]), - start_top_index=to_list(outputs[1][i]), - end_top_log_probs=to_list(outputs[2][i]), - end_top_index=to_list(outputs[3][i]), - cls_logits=to_list(outputs[4][i]), - ) + result = RawResultExtended(unique_id=unique_id, + start_top_log_probs=to_list(outputs[0][i]), + start_top_index=to_list(outputs[1][i]), + end_top_log_probs=to_list(outputs[2][i]), + end_top_index=to_list(outputs[3][i]), + cls_logits=to_list(outputs[4][i])) else: - result = RawResult( - unique_id=unique_id, - start_logits=to_list(outputs[0][i]), - end_logits=to_list(outputs[1][i]), - ) + result = RawResult(unique_id=unique_id, + start_logits=to_list(outputs[0][i]), + end_logits=to_list(outputs[1][i])) all_results.append(result) - if args["model_type"] in ["xlnet", "xlm"]: - answers = get_best_predictions_extended( - examples, - features, - all_results, - n_best_size, - args["max_answer_length"], - model.config.start_n_top, - model.config.end_n_top, - True, - tokenizer, - args["null_score_diff_threshold"], - ) + if args['model_type'] in ['xlnet', 'xlm']: + answers = get_best_predictions_extended(examples, features, all_results, n_best_size, + args['max_answer_length'], model.config.start_n_top, model.config.end_n_top, True, tokenizer, args['null_score_diff_threshold']) else: - answers = get_best_predictions( - examples, - features, - all_results, - n_best_size, - args["max_answer_length"], - False, - False, - True, - False, - ) + answers = get_best_predictions(examples, features, all_results, n_best_size, args['max_answer_length'], False, False, True, False) return answers @@ -848,12 +623,12 @@ def calculate_results(self, truth, predictions): questions_dict = {} print(truth) for item in truth: - for answer in item["qas"]: - if answer["answers"]: - truth_dict[answer["id"]] = answer["answers"][0]["text"] + for answer in item['qas']: + if answer['answers']: + truth_dict[answer['id']] = answer['answers'][0]['text'] else: - truth_dict[answer["id"]] = "" - questions_dict[answer["id"]] = answer["question"] + truth_dict[answer['id']] = '' + questions_dict[answer['id']] = answer['question'] correct = 0 incorrect = 0 @@ -866,34 +641,23 @@ def calculate_results(self, truth, predictions): if predictions[q_id].strip() == answer.strip(): correct += 1 correct_text[q_id] = answer - elif ( - predictions[q_id].strip() in answer.strip() - or answer.strip() in predictions[q_id].strip() - ): + elif predictions[q_id].strip() in answer.strip() or answer.strip() in predictions[q_id].strip(): similar += 1 - similar_text[q_id] = { - "truth": answer, - "predicted": predictions[q_id], - "question": questions_dict[q_id], - } + similar_text[q_id] = {'truth': answer, 'predicted': predictions[q_id], 'question': questions_dict[q_id]} else: incorrect += 1 - incorrect_text[q_id] = { - "truth": answer, - "predicted": predictions[q_id], - "question": questions_dict[q_id], - } + incorrect_text[q_id] = {'truth': answer, 'predicted': predictions[q_id], 'question': questions_dict[q_id]} result = { - "correct": correct, - "similar": similar, - "incorrect": incorrect, + 'correct': correct, + 'similar': similar, + 'incorrect': incorrect, } texts = { - "correct_text": correct_text, - "similar_text": similar_text, - "incorrect_text": incorrect_text, + 'correct_text': correct_text, + 'similar_text': 
similar_text, + 'incorrect_text': incorrect_text, } return result, texts diff --git a/simpletransformers/question_answering/question_answering_utils.py b/simpletransformers/question_answering/question_answering_utils.py index 306bdda2..ce740de7 100755 --- a/simpletransformers/question_answering/question_answering_utils.py +++ b/simpletransformers/question_answering/question_answering_utils.py @@ -11,7 +11,8 @@ from tqdm import tqdm, trange import os import torch -from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset +from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, + TensorDataset) from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize from pprint import pprint @@ -28,16 +29,14 @@ class InputExample(object): For examples without an answer, the start and end position are -1. """ - def __init__( - self, - qas_id, - question_text, - doc_tokens, - orig_answer_text=None, - start_position=None, - end_position=None, - is_impossible=None, - ): + def __init__(self, + qas_id, + question_text, + doc_tokens, + orig_answer_text=None, + start_position=None, + end_position=None, + is_impossible=None): self.qas_id = qas_id self.question_text = question_text self.doc_tokens = doc_tokens @@ -52,7 +51,8 @@ def __str__(self): def __repr__(self): s = "" s += "qas_id: %s" % (self.qas_id) - s += ", question_text: %s" % (self.question_text) + s += ", question_text: %s" % ( + self.question_text) s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) if self.start_position: s += ", start_position: %d" % (self.start_position) @@ -70,24 +70,22 @@ def to_list(tensor): class InputFeatures(object): """A single set of features of data.""" - def __init__( - self, - unique_id, - example_index, - doc_span_index, - tokens, - token_to_orig_map, - token_is_max_context, - input_ids, - input_mask, - segment_ids, - cls_index, - p_mask, - paragraph_len, - start_position=None, - end_position=None, - is_impossible=None, - ): + def __init__(self, + unique_id, + example_index, + doc_span_index, + tokens, + token_to_orig_map, + token_is_max_context, + input_ids, + input_mask, + segment_ids, + cls_index, + p_mask, + paragraph_len, + start_position=None, + end_position=None, + is_impossible=None): self.unique_id = unique_id self.example_index = example_index self.doc_span_index = doc_span_index @@ -137,23 +135,20 @@ def is_whitespace(c): start_position = None end_position = None orig_answer_text = None - is_impossible = qa.get("is_impossible") + is_impossible = qa.get('is_impossible') if is_training: if version_2_with_negative: is_impossible = qa["is_impossible"] if (len(qa["answers"]) != 1) and (not is_impossible): - raise ValueError( - "For training, each question should have exactly 1 answer." - ) + raise ValueError("For training, each question should have exactly 1 answer.") if not is_impossible: answer = qa["answers"][0] orig_answer_text = answer["text"] answer_offset = answer["answer_start"] answer_length = len(orig_answer_text) start_position = char_to_word_offset[answer_offset] - end_position = char_to_word_offset[ - answer_offset + answer_length - 1 - ] + end_position = char_to_word_offset[answer_offset + + answer_length - 1] # Only add answers where the text can be exactly recovered from the # document. If this CAN'T happen it's likely due to weird Unicode # stuff so we will just skip the example. @@ -161,17 +156,11 @@ def is_whitespace(c): # Note that this means for training mode, every example is NOT # guaranteed to be preserved. 
actual_text = " ".join( - doc_tokens[start_position : (end_position + 1)] - ) + doc_tokens[start_position:(end_position + 1)]) cleaned_answer_text = " ".join( - whitespace_tokenize(orig_answer_text) - ) + whitespace_tokenize(orig_answer_text)) if actual_text.find(cleaned_answer_text) == -1: - logger.warning( - "Could not find answer: '%s' vs. '%s'", - actual_text, - cleaned_answer_text, - ) + logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text) continue else: start_position = -1 @@ -185,31 +174,20 @@ def is_whitespace(c): orig_answer_text=orig_answer_text, start_position=start_position, end_position=end_position, - is_impossible=is_impossible, - ) + is_impossible=is_impossible) examples.append(example) return examples -def convert_examples_to_features( - examples, - tokenizer, - max_seq_length, - doc_stride, - max_query_length, - is_training, - cls_token_at_end=False, - cls_token="[CLS]", - sep_token="[SEP]", - pad_token=0, - sequence_a_segment_id=0, - sequence_b_segment_id=1, - cls_token_segment_id=0, - pad_token_segment_id=0, - mask_padding_with_zero=True, - sequence_a_is_doc=False, - silent=False, -): +def convert_examples_to_features(examples, tokenizer, max_seq_length, + doc_stride, max_query_length, is_training, + cls_token_at_end=False, + cls_token='[CLS]', sep_token='[SEP]', pad_token=0, + sequence_a_segment_id=0, sequence_b_segment_id=1, + cls_token_segment_id=0, pad_token_segment_id=0, + mask_padding_with_zero=True, + sequence_a_is_doc=False, + silent=False): """Loads a data file into a list of `InputBatch`s.""" unique_id = 1000000000 @@ -221,10 +199,7 @@ def convert_examples_to_features( for (example_index, example) in enumerate(tqdm(examples, disable=silent)): # if example_index % 100 == 0: - # logger.info('Converting %s/%s pos %s neg %s', - # example_index, - # len(examples), - # cnt_pos, cnt_neg) + # logger.info('Converting %s/%s pos %s neg %s', example_index, len(examples), cnt_pos, cnt_neg) query_tokens = tokenizer.tokenize(example.question_text) @@ -253,12 +228,8 @@ def convert_examples_to_features( else: tok_end_position = len(all_doc_tokens) - 1 (tok_start_position, tok_end_position) = _improve_answer_span( - all_doc_tokens, - tok_start_position, - tok_end_position, - tokenizer, - example.orig_answer_text, - ) + all_doc_tokens, tok_start_position, tok_end_position, tokenizer, + example.orig_answer_text) # The -3 accounts for [CLS], [SEP] and [SEP] max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 @@ -267,8 +238,7 @@ def convert_examples_to_features( # To deal with this we do a sliding window approach, where we take chunks # of the up to our max length with a stride of `doc_stride`. _DocSpan = collections.namedtuple( # pylint: disable=invalid-name - "DocSpan", ["start", "length"] - ) + "DocSpan", ["start", "length"]) doc_spans = [] start_offset = 0 while start_offset < len(all_doc_tokens): @@ -286,10 +256,8 @@ def convert_examples_to_features( token_is_max_context = {} segment_ids = [] - # p_mask: mask with 1 for token than cannot be in the answer - # (0 for token which can be in an answer) - # Original TF implem also keep the classification - # token (set to 0) (not sure why...) + # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer) + # Original TF implem also keep the classification token (set to 0) (not sure why...) 
p_mask = [] # CLS token at the beginning @@ -315,11 +283,11 @@ def convert_examples_to_features( # Paragraph for i in range(doc_span.length): split_token_index = doc_span.start + i - token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] + token_to_orig_map[len( + tokens)] = tok_to_orig_index[split_token_index] - is_max_context = _check_is_max_context( - doc_spans, doc_span_index, split_token_index - ) + is_max_context = _check_is_max_context(doc_spans, doc_span_index, + split_token_index) token_is_max_context[len(tokens)] = is_max_context tokens.append(all_doc_tokens[split_token_index]) if not sequence_a_is_doc: @@ -377,9 +345,8 @@ def convert_examples_to_features( doc_start = doc_span.start doc_end = doc_span.start + doc_span.length - 1 out_of_span = False - if not ( - tok_start_position >= doc_start and tok_end_position <= doc_end - ): + if not (tok_start_position >= doc_start and + tok_end_position <= doc_end): out_of_span = True if out_of_span: start_position = 0 @@ -403,28 +370,26 @@ def convert_examples_to_features( logger.info("example_index: %s" % (example_index)) logger.info("doc_span_index: %s" % (doc_span_index)) logger.info("tokens: %s" % " ".join(tokens)) + logger.info("token_to_orig_map: %s" % " ".join([ + "%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()])) + logger.info("token_is_max_context: %s" % " ".join([ + "%d:%s" % (x, y) for (x, y) in token_is_max_context.items() + ])) + logger.info("input_ids: %s" % + " ".join([str(x) for x in input_ids])) logger.info( - "token_to_orig_map: %s" - % " ".join( - ["%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()] - ) - ) + "input_mask: %s" % " ".join([str(x) for x in input_mask])) logger.info( - "token_is_max_context: %s" - % " ".join( - ["%d:%s" % (x, y) for (x, y) in token_is_max_context.items()] - ) - ) - logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) - logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) - logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) if is_training and span_is_impossible: logger.info("impossible example") if is_training and not span_is_impossible: - answer_text = " ".join(tokens[start_position : (end_position + 1)]) + answer_text = " ".join( + tokens[start_position:(end_position + 1)]) logger.info("start_position: %d" % (start_position)) logger.info("end_position: %d" % (end_position)) - logger.info("answer: %s" % (answer_text)) + logger.info( + "answer: %s" % (answer_text)) features.append( InputFeatures( @@ -442,17 +407,14 @@ def convert_examples_to_features( paragraph_len=paragraph_len, start_position=start_position, end_position=end_position, - is_impossible=span_is_impossible, - ) - ) + is_impossible=span_is_impossible)) unique_id += 1 return features -def _improve_answer_span( - doc_tokens, input_start, input_end, tokenizer, orig_answer_text -): +def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, + orig_answer_text): """Returns tokenized answer spans that better match the annotated answer.""" # The SQuAD annotations are character based. 
We first project them to @@ -481,7 +443,7 @@ def _improve_answer_span( for new_start in range(input_start, input_end + 1): for new_end in range(input_end, new_start - 1, -1): - text_span = " ".join(doc_tokens[new_start : (new_end + 1)]) + text_span = " ".join(doc_tokens[new_start:(new_end + 1)]) if text_span == tok_answer_text: return (new_start, new_end) @@ -517,7 +479,8 @@ def _check_is_max_context(doc_spans, cur_span_index, position): continue num_left_context = position - doc_span.start num_right_context = end - position - score = min(num_left_context, num_right_context) + 0.01 * doc_span.length + score = min(num_left_context, num_right_context) + \ + 0.01 * doc_span.length if best_score is None or score > best_score: best_score = score best_span_index = span_index @@ -525,25 +488,14 @@ def _check_is_max_context(doc_spans, cur_span_index, position): return cur_span_index == best_span_index -RawResult = collections.namedtuple( - "RawResult", ["unique_id", "start_logits", "end_logits"] -) - - -def write_predictions( - all_examples, - all_features, - all_results, - n_best_size, - max_answer_length, - do_lower_case, - output_prediction_file, - output_nbest_file, - output_null_log_odds_file, - verbose_logging, - version_2_with_negative, - null_score_diff_threshold, -): +RawResult = collections.namedtuple("RawResult", + ["unique_id", "start_logits", "end_logits"]) + + +def write_predictions(all_examples, all_features, all_results, n_best_size, + max_answer_length, do_lower_case, output_prediction_file, + output_nbest_file, output_null_log_odds_file, verbose_logging, + version_2_with_negative, null_score_diff_threshold): """Write final predictions to the json file and log-odds of null if needed.""" logger.info("Writing predictions to: %s" % (output_prediction_file)) logger.info("Writing nbest to: %s" % (output_nbest_file)) @@ -558,8 +510,7 @@ def write_predictions( _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name "PrelimPrediction", - ["feature_index", "start_index", "end_index", "start_logit", "end_logit"], - ) + ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]) all_predictions = collections.OrderedDict() all_nbest_json = collections.OrderedDict() @@ -580,7 +531,8 @@ def write_predictions( end_indexes = _get_best_indexes(result.end_logits, n_best_size) # if we could have irrelevant answers, get the min score of irrelevant if version_2_with_negative: - feature_null_score = result.start_logits[0] + result.end_logits[0] + feature_null_score = result.start_logits[0] + \ + result.end_logits[0] if feature_null_score < score_null: score_null = feature_null_score min_null_feature_index = feature_index @@ -612,9 +564,7 @@ def write_predictions( start_index=start_index, end_index=end_index, start_logit=result.start_logits[start_index], - end_logit=result.end_logits[end_index], - ) - ) + end_logit=result.end_logits[end_index])) if version_2_with_negative: prelim_predictions.append( _PrelimPrediction( @@ -622,18 +572,14 @@ def write_predictions( start_index=0, end_index=0, start_logit=null_start_logit, - end_logit=null_end_logit, - ) - ) + end_logit=null_end_logit)) prelim_predictions = sorted( prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), - reverse=True, - ) + reverse=True) _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name - "NbestPrediction", ["text", "start_logit", "end_logit"] - ) + "NbestPrediction", ["text", "start_logit", "end_logit"]) seen_predictions = {} nbest = [] @@ -642,10 +588,12 @@ def 
write_predictions( break feature = features[pred.feature_index] if pred.start_index > 0: # this is a non-null prediction - tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)] + tok_tokens = feature.tokens[pred.start_index:( + pred.end_index + 1)] orig_doc_start = feature.token_to_orig_map[pred.start_index] orig_doc_end = feature.token_to_orig_map[pred.end_index] - orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)] + orig_tokens = example.doc_tokens[orig_doc_start:( + orig_doc_end + 1)] tok_text = " ".join(tok_tokens) # De-tokenize WordPieces that have been split off. @@ -658,8 +606,7 @@ def write_predictions( orig_text = " ".join(orig_tokens) final_text = get_final_text( - tok_text, orig_text, do_lower_case, verbose_logging - ) + tok_text, orig_text, do_lower_case, verbose_logging) if final_text in seen_predictions: continue @@ -672,29 +619,27 @@ def write_predictions( _NbestPrediction( text=final_text, start_logit=pred.start_logit, - end_logit=pred.end_logit, - ) - ) + end_logit=pred.end_logit)) # if we didn't include the empty option in the n-best, include it if version_2_with_negative: if "" not in seen_predictions: nbest.append( _NbestPrediction( - text="", start_logit=null_start_logit, end_logit=null_end_logit - ) - ) + text="", + start_logit=null_start_logit, + end_logit=null_end_logit)) # In very rare edge cases we could only have single null prediction. # So we just create a nonce prediction in this case to avoid failure. if len(nbest) == 1: - nbest.insert( - 0, _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0) - ) + nbest.insert(0, + _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) # In very rare edge cases we could have no valid predictions. So we # just create a nonce prediction in this case to avoid failure. 
if not nbest: - nbest.append(_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) + nbest.append( + _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) assert len(nbest) >= 1 @@ -723,11 +668,8 @@ def write_predictions( all_predictions[example.qas_id] = nbest_json[0]["text"] else: # predict "" iff the null score - the score of best non-null > threshold - score_diff = ( - score_null - - best_non_null_entry.start_logit - - (best_non_null_entry.end_logit) - ) + score_diff = score_null - best_non_null_entry.start_logit - ( + best_non_null_entry.end_logit) scores_diff_json[example.qas_id] = score_diff if score_diff > null_score_diff_threshold: all_predictions[example.qas_id] = "" @@ -749,48 +691,28 @@ def write_predictions( # For XLNet (and XLM which uses the same head) -RawResultExtended = collections.namedtuple( - "RawResultExtended", - [ - "unique_id", - "start_top_log_probs", - "start_top_index", - "end_top_log_probs", - "end_top_index", - "cls_logits", - ], -) - - -def write_predictions_extended( - all_examples, - all_features, - all_results, - n_best_size, - max_answer_length, - output_prediction_file, - output_nbest_file, - output_null_log_odds_file, - orig_data_file, - start_n_top, - end_n_top, - version_2_with_negative, - tokenizer, - verbose_logging, -): +RawResultExtended = collections.namedtuple("RawResultExtended", + ["unique_id", "start_top_log_probs", "start_top_index", + "end_top_log_probs", "end_top_index", "cls_logits"]) + + +def write_predictions_extended(all_examples, all_features, all_results, n_best_size, + max_answer_length, output_prediction_file, + output_nbest_file, + output_null_log_odds_file, orig_data_file, + start_n_top, end_n_top, version_2_with_negative, + tokenizer, verbose_logging): """ XLNet write prediction logic (more complex than Bert's). - Write final predictions to the json file and - log-odds of null if needed. + Write final predictions to the json file and log-odds of null if needed. 
Requires utils_squad_evaluate.py """ _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name "PrelimPrediction", - ["feature_index", "start_index", "end_index", "start_log_prob", "end_log_prob"], - ) + ["feature_index", "start_index", "end_index", + "start_log_prob", "end_log_prob"]) _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name - "NbestPrediction", ["text", "start_log_prob", "end_log_prob"] - ) + "NbestPrediction", ["text", "start_log_prob", "end_log_prob"]) logger.info("Writing predictions to: %s", output_prediction_file) # logger.info("Writing nbest to: %s" % (output_nbest_file)) @@ -854,15 +776,12 @@ def write_predictions_extended( start_index=start_index, end_index=end_index, start_log_prob=start_log_prob, - end_log_prob=end_log_prob, - ) - ) + end_log_prob=end_log_prob)) prelim_predictions = sorted( prelim_predictions, key=lambda x: (x.start_log_prob + x.end_log_prob), - reverse=True, - ) + reverse=True) seen_predictions = {} nbest = [] @@ -882,10 +801,10 @@ def write_predictions_extended( # final_text = paragraph_text[start_orig_pos: end_orig_pos + 1].strip() # Previously used Bert untokenizer - tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)] + tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)] orig_doc_start = feature.token_to_orig_map[pred.start_index] orig_doc_end = feature.token_to_orig_map[pred.end_index] - orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)] + orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)] tok_text = tokenizer.convert_tokens_to_string(tok_tokens) # Clean whitespace @@ -893,7 +812,8 @@ def write_predictions_extended( tok_text = " ".join(tok_text.split()) orig_text = " ".join(orig_tokens) - final_text = get_final_text(tok_text, orig_text, False, verbose_logging) + final_text = get_final_text(tok_text, orig_text, False, + verbose_logging) if final_text in seen_predictions: continue @@ -904,16 +824,14 @@ def write_predictions_extended( _NbestPrediction( text=final_text, start_log_prob=pred.start_log_prob, - end_log_prob=pred.end_log_prob, - ) - ) + end_log_prob=pred.end_log_prob)) # In very rare edge cases we could have no valid predictions. So we # just create a nonce prediction in this case to avoid failure. 
if not nbest: nbest.append( - _NbestPrediction(text="", start_log_prob=-1e6, end_log_prob=-1e6) - ) + _NbestPrediction(text="", start_log_prob=-1e6, + end_log_prob=-1e6)) total_scores = [] best_non_null_entry = None @@ -955,33 +873,26 @@ def write_predictions_extended( writer.write(json.dumps(scores_diff_json, indent=4) + "\n") if isinstance(orig_data_file, str): - with open(orig_data_file, "r", encoding="utf-8") as reader: + with open(orig_data_file, "r", encoding='utf-8') as reader: orig_data = json.load(reader) else: orig_data = orig_data_file qid_to_has_ans = make_qid_to_has_ans(orig_data) + has_ans_qids = [k for k, v in qid_to_has_ans.items() if v] + no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v] exact_raw, f1_raw = get_raw_scores(orig_data, all_predictions) out_eval = {} - find_all_best_thresh_v2( - out_eval, all_predictions, exact_raw, f1_raw, scores_diff_json, qid_to_has_ans - ) + find_all_best_thresh_v2(out_eval, all_predictions, + exact_raw, f1_raw, scores_diff_json, qid_to_has_ans) return all_predictions, all_nbest_json, scores_diff_json -def get_best_predictions( - all_examples, - all_features, - all_results, - n_best_size, - max_answer_length, - do_lower_case, - verbose_logging, - version_2_with_negative, - null_score_diff_threshold, -): +def get_best_predictions(all_examples, all_features, all_results, n_best_size, + max_answer_length, do_lower_case, verbose_logging, + version_2_with_negative, null_score_diff_threshold): example_index_to_features = collections.defaultdict(list) for feature in all_features: @@ -993,8 +904,7 @@ def get_best_predictions( _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name "PrelimPrediction", - ["feature_index", "start_index", "end_index", "start_logit", "end_logit"], - ) + ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]) all_predictions = collections.OrderedDict() all_nbest_json = collections.OrderedDict() @@ -1015,7 +925,8 @@ def get_best_predictions( end_indexes = _get_best_indexes(result.end_logits, n_best_size) # if we could have irrelevant answers, get the min score of irrelevant if version_2_with_negative: - feature_null_score = result.start_logits[0] + result.end_logits[0] + feature_null_score = result.start_logits[0] + \ + result.end_logits[0] if feature_null_score < score_null: score_null = feature_null_score min_null_feature_index = feature_index @@ -1047,9 +958,7 @@ def get_best_predictions( start_index=start_index, end_index=end_index, start_logit=result.start_logits[start_index], - end_logit=result.end_logits[end_index], - ) - ) + end_logit=result.end_logits[end_index])) if version_2_with_negative: prelim_predictions.append( _PrelimPrediction( @@ -1057,18 +966,14 @@ def get_best_predictions( start_index=0, end_index=0, start_logit=null_start_logit, - end_logit=null_end_logit, - ) - ) + end_logit=null_end_logit)) prelim_predictions = sorted( prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), - reverse=True, - ) + reverse=True) _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name - "NbestPrediction", ["text", "start_logit", "end_logit"] - ) + "NbestPrediction", ["text", "start_logit", "end_logit"]) seen_predictions = {} nbest = [] @@ -1077,10 +982,12 @@ def get_best_predictions( break feature = features[pred.feature_index] if pred.start_index > 0: # this is a non-null prediction - tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)] + tok_tokens = feature.tokens[pred.start_index:( + pred.end_index + 1)] orig_doc_start = 
feature.token_to_orig_map[pred.start_index] orig_doc_end = feature.token_to_orig_map[pred.end_index] - orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)] + orig_tokens = example.doc_tokens[orig_doc_start:( + orig_doc_end + 1)] tok_text = " ".join(tok_tokens) # De-tokenize WordPieces that have been split off. @@ -1093,8 +1000,7 @@ def get_best_predictions( orig_text = " ".join(orig_tokens) final_text = get_final_text( - tok_text, orig_text, do_lower_case, verbose_logging - ) + tok_text, orig_text, do_lower_case, verbose_logging) if final_text in seen_predictions: continue @@ -1107,29 +1013,27 @@ def get_best_predictions( _NbestPrediction( text=final_text, start_logit=pred.start_logit, - end_logit=pred.end_logit, - ) - ) + end_logit=pred.end_logit)) # if we didn't include the empty option in the n-best, include it if version_2_with_negative: if "" not in seen_predictions: nbest.append( _NbestPrediction( - text="", start_logit=null_start_logit, end_logit=null_end_logit - ) - ) + text="", + start_logit=null_start_logit, + end_logit=null_end_logit)) # In very rare edge cases we could only have single null prediction. # So we just create a nonce prediction in this case to avoid failure. if len(nbest) == 1: - nbest.insert( - 0, _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0) - ) + nbest.insert(0, + _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) # In very rare edge cases we could have no valid predictions. So we # just create a nonce prediction in this case to avoid failure. if not nbest: - nbest.append(_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) + nbest.append( + _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) assert len(nbest) >= 1 @@ -1158,11 +1062,8 @@ def get_best_predictions( all_predictions[example.qas_id] = nbest_json[0]["text"] else: # predict "" iff the null score - the score of best non-null > threshold - score_diff = ( - score_null - - best_non_null_entry.start_logit - - (best_non_null_entry.end_logit) - ) + score_diff = score_null - best_non_null_entry.start_logit - ( + best_non_null_entry.end_logit) scores_diff_json[example.qas_id] = score_diff if score_diff > null_score_diff_threshold: all_predictions[example.qas_id] = "" @@ -1170,38 +1071,25 @@ def get_best_predictions( all_predictions[example.qas_id] = best_non_null_entry.text all_nbest_json[example.qas_id] = nbest_json - all_best = [ - {"id": id, "answer": answers[0]["text"]} - for id, answers in all_nbest_json.items() - ] + all_best = [{'id': id, 'answer': answers[0]['text']} for id, answers in all_nbest_json.items()] return all_best -def get_best_predictions_extended( - all_examples, - all_features, - all_results, - n_best_size, - max_answer_length, - start_n_top, - end_n_top, - version_2_with_negative, - tokenizer, - verbose_logging, -): +def get_best_predictions_extended(all_examples, all_features, all_results, n_best_size, + max_answer_length, + start_n_top, end_n_top, version_2_with_negative, + tokenizer, verbose_logging): """ XLNet write prediction logic (more complex than Bert's). - Write final predictions to the json file and - log-odds of null if needed. + Write final predictions to the json file and log-odds of null if needed. 
Requires utils_squad_evaluate.py """ _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name "PrelimPrediction", - ["feature_index", "start_index", "end_index", "start_log_prob", "end_log_prob"], - ) + ["feature_index", "start_index", "end_index", + "start_log_prob", "end_log_prob"]) _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name - "NbestPrediction", ["text", "start_log_prob", "end_log_prob"] - ) + "NbestPrediction", ["text", "start_log_prob", "end_log_prob"]) example_index_to_features = collections.defaultdict(list) for feature in all_features: @@ -1262,15 +1150,12 @@ def get_best_predictions_extended( start_index=start_index, end_index=end_index, start_log_prob=start_log_prob, - end_log_prob=end_log_prob, - ) - ) + end_log_prob=end_log_prob)) prelim_predictions = sorted( prelim_predictions, key=lambda x: (x.start_log_prob + x.end_log_prob), - reverse=True, - ) + reverse=True) seen_predictions = {} nbest = [] @@ -1290,10 +1175,10 @@ def get_best_predictions_extended( # final_text = paragraph_text[start_orig_pos: end_orig_pos + 1].strip() # Previously used Bert untokenizer - tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)] + tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)] orig_doc_start = feature.token_to_orig_map[pred.start_index] orig_doc_end = feature.token_to_orig_map[pred.end_index] - orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)] + orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)] tok_text = tokenizer.convert_tokens_to_string(tok_tokens) # Clean whitespace @@ -1301,9 +1186,8 @@ def get_best_predictions_extended( tok_text = " ".join(tok_text.split()) orig_text = " ".join(orig_tokens) - final_text = get_final_text( - tok_text, orig_text, tokenizer.do_lower_case, verbose_logging - ) + final_text = get_final_text(tok_text, orig_text, tokenizer.do_lower_case, + verbose_logging) if final_text in seen_predictions: continue @@ -1314,16 +1198,14 @@ def get_best_predictions_extended( _NbestPrediction( text=final_text, start_log_prob=pred.start_log_prob, - end_log_prob=pred.end_log_prob, - ) - ) + end_log_prob=pred.end_log_prob)) # In very rare edge cases we could have no valid predictions. So we # just create a nonce prediction in this case to avoid failure. 
if not nbest: nbest.append( - _NbestPrediction(text="", start_log_prob=-1e6, end_log_prob=-1e6) - ) + _NbestPrediction(text="", start_log_prob=-1e6, + end_log_prob=-1e6)) total_scores = [] best_non_null_entry = None @@ -1354,28 +1236,21 @@ def get_best_predictions_extended( all_nbest_json[example.qas_id] = nbest_json - all_best = [ - {"id": id, "answer": answers[0]["text"]} - for id, answers in all_nbest_json.items() - ] + all_best = [{'id': id, 'answer': answers[0]['text']} for id, answers in all_nbest_json.items()] return all_best -def find_all_best_thresh_v2( - main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans -): +def find_all_best_thresh_v2(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans): best_exact, exact_thresh, has_ans_exact = find_best_thresh_v2( - preds, exact_raw, na_probs, qid_to_has_ans - ) + preds, exact_raw, na_probs, qid_to_has_ans) best_f1, f1_thresh, has_ans_f1 = find_best_thresh_v2( - preds, f1_raw, na_probs, qid_to_has_ans - ) - main_eval["best_exact"] = best_exact - main_eval["best_exact_thresh"] = exact_thresh - main_eval["best_f1"] = best_f1 - main_eval["best_f1_thresh"] = f1_thresh - main_eval["has_ans_exact"] = has_ans_exact - main_eval["has_ans_f1"] = has_ans_f1 + preds, f1_raw, na_probs, qid_to_has_ans) + main_eval['best_exact'] = best_exact + main_eval['best_exact_thresh'] = exact_thresh + main_eval['best_f1'] = best_f1 + main_eval['best_f1_thresh'] = f1_thresh + main_eval['has_ans_exact'] = has_ans_exact + main_eval['has_ans_f1'] = has_ans_f1 def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans): @@ -1409,18 +1284,14 @@ def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans): continue has_ans_score += scores[qid] - return ( - 100.0 * best_score / len(scores), - best_thresh, - 1.0 * has_ans_score / has_ans_cnt, - ) + return 100.0 * best_score / len(scores), best_thresh, 1.0 * has_ans_score / has_ans_cnt def make_qid_to_has_ans(dataset): qid_to_has_ans = {} for p in dataset: - for qa in p["qas"]: - qid_to_has_ans[qa["id"]] = bool(qa["answers"]) + for qa in p['qas']: + qid_to_has_ans[qa['id']] = bool(qa['answers']) return qid_to_has_ans @@ -1428,16 +1299,15 @@ def get_raw_scores(dataset, preds): exact_scores = {} f1_scores = {} for p in dataset: - for qa in p["qas"]: - qid = qa["id"] - gold_answers = [ - a["text"] for a in qa["answers"] if normalize_answer(a["text"]) - ] + for qa in p['qas']: + qid = qa['id'] + gold_answers = [a['text'] for a in qa['answers'] + if normalize_answer(a['text'])] if not gold_answers: # For unanswerable questions, only correct answer is empty string - gold_answers = [""] + gold_answers = [''] if qid not in preds: - logger.warning("Missing prediction for %s" % qid) + logger.warning('Missing prediction for %s' % qid) continue a_pred = preds[qid] # Take max over all gold answers @@ -1474,21 +1344,19 @@ def get_tokens(s): def normalize_answer(s): """Lower text and remove punctuation, articles and extra whitespace.""" - def remove_articles(text): - regex = re.compile(r"\b(a|an|the)\b", re.UNICODE) - return re.sub(regex, " ", text) + regex = re.compile(r'\b(a|an|the)\b', re.UNICODE) + return re.sub(regex, ' ', text) def white_space_fix(text): - return " ".join(text.split()) + return ' '.join(text.split()) def remove_punc(text): exclude = set(string.punctuation) - return "".join(ch for ch in text if ch not in exclude) + return ''.join(ch for ch in text if ch not in exclude) def lower(text): return text.lower() - return white_space_fix(remove_articles(remove_punc(lower(s)))) @@ -1542,7 +1410,8 @@ 
def _strip_spaces(text): start_position = tok_text.find(pred_text) if start_position == -1: if verbose_logging: - logger.info("Unable to find text: '%s' in '%s'" % (pred_text, orig_text)) + logger.info( + "Unable to find text: '%s' in '%s'" % (pred_text, orig_text)) return orig_text end_position = start_position + len(pred_text) - 1 @@ -1551,11 +1420,8 @@ def _strip_spaces(text): if len(orig_ns_text) != len(tok_ns_text): if verbose_logging: - logger.info( - "Length not equal after stripping spaces: '%s' vs '%s'", - orig_ns_text, - tok_ns_text, - ) + logger.info("Length not equal after stripping spaces: '%s' vs '%s'", + orig_ns_text, tok_ns_text) return orig_text # We then project the characters in `pred_text` back to `orig_text` using @@ -1586,13 +1452,14 @@ def _strip_spaces(text): logger.info("Couldn't map end position") return orig_text - output_text = orig_text[orig_start_position : (orig_end_position + 1)] + output_text = orig_text[orig_start_position:(orig_end_position + 1)] return output_text def _get_best_indexes(logits, n_best_size): """Get the n-best logits from a list.""" - index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True) + index_and_score = sorted( + enumerate(logits), key=lambda x: x[1], reverse=True) best_indexes = [] for i in range(len(index_and_score)): @@ -1625,6 +1492,10 @@ def _compute_softmax(scores): return probs +def to_list(tensor): + return tensor.detach().cpu().tolist() + + def build_examples(to_predict): """ Builds a list of dicts in input data format from a list of contexts and qas. @@ -1632,11 +1503,17 @@ def build_examples(to_predict): examples = [] for row in to_predict: - context = row["context"] - for qa in row["qas"]: - qa["answers"] = [{"text": " ", "answer_start": 0}] - qa["is_impossible"]: False - example = {"context": context, "qas": row["qas"]} + context = row['context'] + for qa in row['qas']: + qa['answers'] = [{ + 'text': ' ', + 'answer_start': 0 + }] + qa['is_impossible']: False + example = { + 'context': context, + 'qas': row['qas'] + } examples.append(example) return examples
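
A minimal usage sketch of build_examples() as restored above: it wraps the lightweight prediction input into SQuAD-style dicts by attaching a placeholder answer to every question. The 'question' and 'id' keys are assumed from the fields read by predict() and calculate_results() elsewhere in this patch; the example text itself is illustrative only.

    from simpletransformers.question_answering.question_answering_utils import build_examples

    # Prediction input: a list of {'context', 'qas'} dicts (no gold answers needed).
    to_predict = [
        {
            'context': 'Simple Transformers is a wrapper around the Transformers library.',
            'qas': [{'question': 'What is Simple Transformers a wrapper around?', 'id': '0'}],
        }
    ]

    # Each qa gains a dummy answer ({'text': ' ', 'answer_start': 0}) so the
    # downstream feature conversion can run without labels.
    examples = build_examples(to_predict)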