From d115872b38ed4bd68fbb9a007df9e9e4ba45ab3a Mon Sep 17 00:00:00 2001
From: Rodolfo De Nadai
Date: Fri, 24 Jul 2020 15:11:14 -0300
Subject: [PATCH 001/127] Create README.md (#6020)
* Create README.md
* Update README.md
* Update model_cards/rdenadai/BR_BERTo/README.md
Co-authored-by: Julien Chaumond
* Update model_cards/rdenadai/BR_BERTo/README.md
Co-authored-by: Julien Chaumond
---
model_cards/rdenadai/BR_BERTo/README.md | 54 +++++++++++++++++++++++++
1 file changed, 54 insertions(+)
create mode 100644 model_cards/rdenadai/BR_BERTo/README.md
diff --git a/model_cards/rdenadai/BR_BERTo/README.md b/model_cards/rdenadai/BR_BERTo/README.md
new file mode 100644
index 000000000000..acb625b7bd62
--- /dev/null
+++ b/model_cards/rdenadai/BR_BERTo/README.md
@@ -0,0 +1,54 @@
+---
+language: pt
+tags:
+- portuguese
+- brazil
+- pt_BR
+---
+
+# BR_BERTo
+
+Portuguese (Brazil) model for text inference.
+
+## Params
+
+Trained on a corpus of 5_258_624 sentences, with 132_807_374 non-unique tokens (992_418 unique tokens).
+
+But since my machine doesn't support a bigger model, it ended up with a vocab size of 54_000 tokens. The rest of the parameters are the defaults used in the HuggingFace tutorial.
+
+[How to train a new language model from scratch using Transformers and Tokenizers](https://huggingface.co/blog/how-to-train)
+
+## Results
+
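+The example below calls a `fill_mask` helper. A minimal sketch of how it can be created (assuming the checkpoint is published on the model hub as `rdenadai/BR_BERTo`):
+
+```python
+from transformers import pipeline
+
+# build a fill-mask pipeline on top of the BR_BERTo checkpoint
+fill_mask = pipeline(
+    "fill-mask",
+    model="rdenadai/BR_BERTo",
+    tokenizer="rdenadai/BR_BERTo",
+)
+```
+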
+```python
+fill_mask("gostei muito dessa ")
+
+#[{'sequence': 'gostei muito dessa experiência',
+# 'score': 0.0719294399023056,
+# 'token': 2322,
+# 'token_str': 'Ġexperiência'},
+# {'sequence': 'gostei muito dessa diferença',
+# 'score': 0.05286405608057976,
+# 'token': 3472,
+# 'token_str': 'Ġdiferença'},
+# {'sequence': 'gostei muito dessa atenção',
+# 'score': 0.027575725689530373,
+# 'token': 2557,
+# 'token_str': 'Ġatenção'},
+# {'sequence': 'gostei muito dessa história',
+# 'score': 0.026764703914523125,
+# 'token': 1329,
+# 'token_str': 'Ġhistória'},
+# {'sequence': 'gostei muito dessa razão',
+# 'score': 0.0250675268471241,
+# 'token': 3323,
+# 'token_str': 'Ġrazão'},
+# {'sequence': 'gostei muito dessa resposta',
+# 'score': 0.024784332141280174,
+# 'token': 2403,
+# 'token_str': 'Ġresposta'},
+# {'sequence': 'gostei muito dessa dose',
+# 'score': 0.01720510423183441,
+# 'token': 1042,
+# 'token_str': 'Ġdose'}]
+```
From 87a779dfa8afebf40297bcf214db9c639d483953 Mon Sep 17 00:00:00 2001
From: Manuel Romero
Date: Fri, 24 Jul 2020 20:12:09 +0200
Subject: [PATCH 002/127] Create README.md (#5951)
---
.../README.md | 74 +++++++++++++++++++
1 file changed, 74 insertions(+)
create mode 100644 model_cards/mrm8488/mobilebert-uncased-finetuned-squadv2/README.md
diff --git a/model_cards/mrm8488/mobilebert-uncased-finetuned-squadv2/README.md b/model_cards/mrm8488/mobilebert-uncased-finetuned-squadv2/README.md
new file mode 100644
index 000000000000..4e925af9c5d1
--- /dev/null
+++ b/model_cards/mrm8488/mobilebert-uncased-finetuned-squadv2/README.md
@@ -0,0 +1,74 @@
+---
+language: en
+datasets:
+- squad_v2
+---
+
+# MobileBERT + SQuAD v2 📱❓
+
+[mobilebert-uncased](https://huggingface.co/google/mobilebert-uncased) fine-tuned on [SQuAD v2.0 dataset](https://rajpurkar.github.io/SQuAD-explorer/explore/v2.0/dev/) for **Q&A** downstream task.
+
+## Details of the downstream task (Q&A) - Model 🧠
+
+**MobileBERT** is a thin version of *BERT_LARGE*, while equipped with bottleneck structures and a carefully designed balance between self-attentions and feed-forward networks.
+
+The checkpoint used here is the original MobileBERT Optimized Uncased English checkpoint (`uncased_L-24_H-128_B-512_A-4_F-4_OPT`).
+
+More about the model [here](https://arxiv.org/abs/2004.02984)
+
+## Details of the downstream task (Q&A) - Dataset 📚
+
+**SQuAD2.0** combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable questions written adversarially by crowdworkers to look similar to answerable ones. To do well on SQuAD2.0, systems must not only answer questions when possible, but also determine when no answer is supported by the paragraph and abstain from answering.
+
+## Model training 🏋️
+
+The model was trained on a Tesla P100 GPU and 25GB of RAM with the following command:
+
+```bash
+python transformers/examples/question-answering/run_squad.py \
+ --model_type bert \
+ --model_name_or_path 'google/mobilebert-uncased' \
+ --do_eval \
+ --do_train \
+ --do_lower_case \
+ --train_file '/content/dataset/train-v2.0.json' \
+ --predict_file '/content/dataset/dev-v2.0.json' \
+ --per_gpu_train_batch_size 16 \
+ --learning_rate 3e-5 \
+ --num_train_epochs 5 \
+ --max_seq_length 384 \
+ --doc_stride 128 \
+ --output_dir '/content/output' \
+ --overwrite_output_dir \
+ --save_steps 1000 \
+ --version_2_with_negative
+```
+
+It is important to note that this model converges much faster than other ones, so it is also cheap to fine-tune.
+
+## Test set Results 🧾
+
+| Metric | # Value |
+| ------ | --------- |
+| **EM** | **75.37** |
+| **F1** | **78.48** |
+| **Size**| **94 MB** |
+
+### Model in action 🚀
+
+Fast usage with **pipelines**:
+
+```python
+from transformers import pipeline
+QnA_pipeline = pipeline('question-answering', model='mrm8488/mobilebert-uncased-finetuned-squadv2')
+QnA_pipeline({
+ 'context': 'A new strain of flu that has the potential to become a pandemic has been identified in China by scientists.',
+ 'question': 'Who did identified it ?'
+ })
+
+# Output: {'answer': 'scientists.', 'end': 106, 'score': 0.41531604528427124, 'start': 96}
+```
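+
+If you prefer to work with the tokenizer and model objects directly, the following is a minimal sketch (not part of the original card; output handling may vary slightly across `transformers` versions):
+
+```python
+import torch
+from transformers import AutoTokenizer, AutoModelForQuestionAnswering
+
+tokenizer = AutoTokenizer.from_pretrained("mrm8488/mobilebert-uncased-finetuned-squadv2")
+model = AutoModelForQuestionAnswering.from_pretrained("mrm8488/mobilebert-uncased-finetuned-squadv2")
+
+context = "A new strain of flu that has the potential to become a pandemic has been identified in China by scientists."
+question = "Who did identified it ?"  # same question as in the pipeline example above
+
+# Encode question and context together; the model returns start/end logits for the answer span
+inputs = tokenizer(question, context, return_tensors="pt")
+outputs = model(**inputs)
+start_logits, end_logits = outputs[0], outputs[1]
+
+# Take the most likely start/end positions and decode the tokens in between
+start = torch.argmax(start_logits)
+end = torch.argmax(end_logits) + 1
+answer = tokenizer.convert_tokens_to_string(
+    tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][start:end].tolist())
+)
+print(answer)  # should roughly match the 'scientists' answer shown above
+```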
+
+> Created by [Manuel Romero/@mrm8488](https://twitter.com/mrm8488) | [LinkedIn](https://www.linkedin.com/in/manuel-romero-cs/)
+
+> Made with ♥ in Spain
From bd51f0a7ab1146542e37e106968f14f1487887e8 Mon Sep 17 00:00:00 2001
From: Manuel Romero
Date: Fri, 24 Jul 2020 20:12:14 +0200
Subject: [PATCH 003/127] Create README.md (#5952)
---
.../README.md | 74 +++++++++++++++++++
1 file changed, 74 insertions(+)
create mode 100644 model_cards/mrm8488/mobilebert-uncased-finetuned-squadv1/README.md
diff --git a/model_cards/mrm8488/mobilebert-uncased-finetuned-squadv1/README.md b/model_cards/mrm8488/mobilebert-uncased-finetuned-squadv1/README.md
new file mode 100644
index 000000000000..55ca9b6c75c4
--- /dev/null
+++ b/model_cards/mrm8488/mobilebert-uncased-finetuned-squadv1/README.md
@@ -0,0 +1,74 @@
+---
+language: en
+datasets:
+- squad
+---
+
+# MobileBERT + SQuAD (v1.1) 📱❓
+
+[mobilebert-uncased](https://huggingface.co/google/mobilebert-uncased) fine-tuned on [SQuAD v1.1 dataset](https://rajpurkar.github.io/SQuAD-explorer/explore/1.1/dev/) for **Q&A** downstream task.
+
+## Details of the downstream task (Q&A) - Model 🧠
+
+**MobileBERT** is a thin version of *BERT_LARGE*, while equipped with bottleneck structures and a carefully designed balance between self-attentions and feed-forward networks.
+
+The checkpoint used here is the original MobileBERT Optimized Uncased English checkpoint (`uncased_L-24_H-128_B-512_A-4_F-4_OPT`).
+
+More about the model [here](https://arxiv.org/abs/2004.02984)
+
+## Details of the downstream task (Q&A) - Dataset 📚
+
+**S**tanford **Q**uestion **A**nswering **D**ataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable.
+SQuAD v1.1 contains **100,000+** question-answer pairs on **500+** articles.
+
+## Model training 🏋️
+
+The model was trained on a Tesla P100 GPU and 25GB of RAM with the following command:
+
+```bash
+python transformers/examples/question-answering/run_squad.py \
+ --model_type bert \
+ --model_name_or_path 'google/mobilebert-uncased' \
+ --do_eval \
+ --do_train \
+ --do_lower_case \
+ --train_file '/content/dataset/train-v1.1.json' \
+ --predict_file '/content/dataset/dev-v1.1.json' \
+ --per_gpu_train_batch_size 16 \
+ --learning_rate 3e-5 \
+ --num_train_epochs 5 \
+ --max_seq_length 384 \
+ --doc_stride 128 \
+ --output_dir '/content/output' \
+ --overwrite_output_dir \
+ --save_steps 1000
+```
+
+It is important to note that this model converges much faster than other ones, so it is also cheap to fine-tune.
+
+## Test set Results 🧾
+
+| Metric | # Value |
+| ------ | --------- |
+| **EM** | **82.33** |
+| **F1** | **89.64** |
+| **Size**| **94 MB** |
+
+### Model in action 🚀
+
+Fast usage with **pipelines**:
+
+```python
+from transformers import pipeline
+QnA_pipeline = pipeline('question-answering', model='mrm8488/mobilebert-uncased-finetuned-squadv1')
+QnA_pipeline({
+ 'context': 'A new strain of flu that has the potential to become a pandemic has been identified in China by scientists.',
+ 'question': 'Who did identified it ?'
+ })
+
+# Output: {'answer': 'scientists.', 'end': 106, 'score': 0.7885545492172241, 'start': 96}
+```
+
+> Created by [Manuel Romero/@mrm8488](https://twitter.com/mrm8488) | [LinkedIn](https://www.linkedin.com/in/manuel-romero-cs/)
+
+> Made with ♥ in Spain
From 518361d69afd45bf6022f9fa1b1f760aeceb1730 Mon Sep 17 00:00:00 2001
From: Manuel Romero
Date: Fri, 24 Jul 2020 20:12:29 +0200
Subject: [PATCH 004/127] Create model card for RuPERTa-base (#6016)
* Update README.md
* Update model_cards/mrm8488/RuPERTa-base/README.md
Co-authored-by: Julien Chaumond
Co-authored-by: Julien Chaumond
---
model_cards/mrm8488/RuPERTa-base/README.md | 120 +++++++++++++++++++++
1 file changed, 120 insertions(+)
diff --git a/model_cards/mrm8488/RuPERTa-base/README.md b/model_cards/mrm8488/RuPERTa-base/README.md
index 313aa7572a83..b822c996b3a1 100644
--- a/model_cards/mrm8488/RuPERTa-base/README.md
+++ b/model_cards/mrm8488/RuPERTa-base/README.md
@@ -1,5 +1,125 @@
---
language: es
+thumbnail: https://i.imgur.com/DUlT077.jpg
widget:
- text: "España es un país muy <mask> en la UE"
---
+
+# RuPERTa: the Spanish RoBERTa 🎃
+
+RuPERTa-base (uncased) is a [RoBERTa model](https://github.com/pytorch/fairseq/tree/master/examples/roberta) trained on an *uncased* version of the [big Spanish corpus](https://github.com/josecannete/spanish-corpora).
+RoBERTa iterates on BERT's pretraining procedure, including training the model longer, with bigger batches over more data; removing the next sentence prediction objective; training on longer sequences; and dynamically changing the masking pattern applied to the training data.
+The architecture is the same as `roberta-base`:
+
+`roberta.base:` **RoBERTa** using the **BERT-base architecture**, **125M** params
+
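+As a quick sanity check of that parameter count (a sketch, not part of the original benchmarks), one could run:
+
+```python
+from transformers import AutoModelWithLMHead
+
+model = AutoModelWithLMHead.from_pretrained("mrm8488/RuPERTa-base")
+print(f"{sum(p.numel() for p in model.parameters()):,}")  # roughly 125M parameters
+```
+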
+## Benchmarks 🧾
+WIP (I am still working on it) 🚧
+
+| Task | F1 | Precision | Recall | Fine-tuned model | Reproduce it |
+| -------- | ----: | --------: | -----: | --------------------------------------------------------------------------------------: | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| POS | 97.39 | 97.47 | 97.32 | [RuPERTa-base-finetuned-pos](https://huggingface.co/mrm8488/RuPERTa-base-finetuned-pos) | [Open in Colab](https://colab.research.google.com/github/mrm8488/shared_colab_notebooks/blob/master/RuPERTa_base_finetuned_POS.ipynb) |
+| NER | 77.55 | 75.53 | 79.68 | [RuPERTa-base-finetuned-ner](https://huggingface.co/mrm8488/RuPERTa-base-finetuned-ner) | |
+| SQUAD-es v1 | to-do | | | | |
+| SQUAD-es v2 | to-do | | | | |
+
+## Model in action 🔨
+
+### Usage for POS and NER 🏷️
+
+```python
+import torch
+from transformers import AutoModelForTokenClassification, AutoTokenizer
+
+id2label = {
+ "0": "B-LOC",
+ "1": "B-MISC",
+ "2": "B-ORG",
+ "3": "B-PER",
+ "4": "I-LOC",
+ "5": "I-MISC",
+ "6": "I-ORG",
+ "7": "I-PER",
+ "8": "O"
+}
+
+tokenizer = AutoTokenizer.from_pretrained('mrm8488/RuPERTa-base-finetuned-ner')
+model = AutoModelForTokenClassification.from_pretrained('mrm8488/RuPERTa-base-finetuned-ner')
+
+text = "Julien, CEO de HF, nació en Francia."
+
+input_ids = torch.tensor(tokenizer.encode(text)).unsqueeze(0)
+
+outputs = model(input_ids)
+last_hidden_states = outputs[0]
+
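+# Note: outputs[0] holds the per-token classification logits; the loop below naively maps
+# token position i to the (i-1)-th whitespace-separated word, ignoring subword splits.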
+for m in last_hidden_states:
+ for index, n in enumerate(m):
+ if(index > 0 and index <= len(text.split(" "))):
+ print(text.split(" ")[index-1] + ": " + id2label[str(torch.argmax(n).item())])
+
+# Output:
+'''
+Julien,: I-PER
+CEO: O
+de: O
+HF,: B-ORG
+nació: I-PER
+en: I-PER
+Francia.: I-LOC
+'''
+```
+
+For **POS** just change the `id2label` dictionary and the model path to [mrm8488/RuPERTa-base-finetuned-pos](https://huggingface.co/mrm8488/RuPERTa-base-finetuned-pos), as sketched below.
+
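+A minimal sketch of that POS variant (assuming the fine-tuned checkpoint exposes its tag names via `id2label` in its config; otherwise reuse the loop above with the appropriate mapping):
+
+```python
+from transformers import pipeline
+
+pos_tagger = pipeline(
+    "ner",  # token-classification pipeline, reused here for POS tags
+    model="mrm8488/RuPERTa-base-finetuned-pos",
+    tokenizer="mrm8488/RuPERTa-base-finetuned-pos",
+)
+pos_tagger("Julien, CEO de HF, nació en Francia.")
+```
+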
+### Fast usage for LM with `pipelines` 🧪
+
+```python
+from transformers import AutoModelWithLMHead, AutoTokenizer
+model = AutoModelWithLMHead.from_pretrained('mrm8488/RuPERTa-base')
+tokenizer = AutoTokenizer.from_pretrained("mrm8488/RuPERTa-base", do_lower_case=True)
+
+from transformers import pipeline
+
+pipeline_fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)
+
+pipeline_fill_mask("España es un país muy <mask> en la UE")
+```
+
+```json
+[
+  {
+    "score": 0.1814306527376175,
+    "sequence": " españa es un país muy importante en la ue",
+    "token": 1560
+  },
+  {
+    "score": 0.024842597544193268,
+    "sequence": " españa es un país muy fuerte en la ue",
+    "token": 2854
+  },
+  {
+    "score": 0.02473250962793827,
+    "sequence": " españa es un país muy pequeño en la ue",
+    "token": 2948
+  },
+  {
+    "score": 0.023991240188479424,
+    "sequence": " españa es un país muy antiguo en la ue",
+    "token": 5240
+  },
+  {
+    "score": 0.0215945765376091,
+    "sequence": " españa es un país muy popular en la ue",
+    "token": 5782
+  }
+]
+```
+
+## Acknowledgments
+
+I thank the [🤗/transformers team](https://github.com/huggingface/transformers) for answering my questions and Google for helping me with the [TensorFlow Research Cloud](https://www.tensorflow.org/tfrc) program.
+
+> Created by [Manuel Romero/@mrm8488](https://twitter.com/mrm8488)
+
+> Made with ♥ in Spain
From 295466aae615a92d3a953fc5b6f6fb370b032b4c Mon Sep 17 00:00:00 2001
From: Julien Chaumond
Date: Fri, 24 Jul 2020 14:14:10 -0400
Subject: [PATCH 005/127] [model_card] Sample input for rdenadai/BR_BERTo
cc @rdenadai
---
model_cards/rdenadai/BR_BERTo/README.md | 2 ++
1 file changed, 2 insertions(+)
diff --git a/model_cards/rdenadai/BR_BERTo/README.md b/model_cards/rdenadai/BR_BERTo/README.md
index acb625b7bd62..5872e2aa359f 100644
--- a/model_cards/rdenadai/BR_BERTo/README.md
+++ b/model_cards/rdenadai/BR_BERTo/README.md
@@ -4,6 +4,8 @@ tags:
- portuguese
- brazil
- pt_BR
+widget:
+- text: gostei muito dessa <mask>
---
# BR_BERTo
From a884b7fa38de9082a6f3f7889b9f7348a8dadbf5 Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Date: Fri, 24 Jul 2020 14:15:37 -0400
Subject: [PATCH 006/127] Update the new model template (#6019)
---
templates/adding_a_new_model/README.md | 96 ++-
.../adding_a_new_model/configuration_xxx.py | 132 ++--
.../adding_a_new_model/modeling_tf_xxx.py | 673 ++++++++++++------
templates/adding_a_new_model/modeling_xxx.py | 522 ++++++++------
.../adding_a_new_model/tokenization_xxx.py | 137 +++-
5 files changed, 989 insertions(+), 571 deletions(-)
diff --git a/templates/adding_a_new_model/README.md b/templates/adding_a_new_model/README.md
index b7805bbf108a..ea97ff761285 100644
--- a/templates/adding_a_new_model/README.md
+++ b/templates/adding_a_new_model/README.md
@@ -1,64 +1,90 @@
-# How to add a new model in 🤗Transformers
+# How to add a new model in 🤗 Transformers
-This folder describes the process to add a new model in 🤗Transformers and provide templates for the required files.
+This folder describes the process to add a new model in 🤗 Transformers and provides templates for the required files.
-The library is designed to incorporate a variety of models and code bases. As such the process for adding a new model usually mostly consists in copy-pasting to relevant original code in the various sections of the templates included in the present repository.
+The library is designed to incorporate a variety of models and code bases. As such the process for adding a new model
+usually mostly consists in copy-pasting the relevant original code into the various sections of the templates included in
+the present repository.
One important point though is that the library has the following goals impacting the way models are incorporated:
-- one specific feature of the API is the capability to run the model and tokenizer inline. The tokenization code thus often have to be slightly adapted to allow for running in the python interpreter.
-- the package is also designed to be as self-consistent and with a small and reliable set of packages dependencies. In consequence, additional dependencies are usually not allowed when adding a model but can be allowed for the inclusion of a new tokenizer (recent examples of dependencies added for tokenizer specificities include `sentencepiece` and `sacremoses`). Please make sure to check the existing dependencies when possible before adding a new one.
+- One specific feature of the API is the capability to run the model and tokenizer inline. The tokenization code thus
+  often has to be slightly adapted to allow for running in the Python interpreter.
+- The package is also designed to be as self-consistent as possible, with a small and reliable set of package dependencies. In
+ consequence, additional dependencies are usually not allowed when adding a model but can be allowed for the
+ inclusion of a new tokenizer (recent examples of dependencies added for tokenizer specificities include
+ `sentencepiece` and `sacremoses`). Please make sure to check the existing dependencies when possible before adding a
+ new one.
-For a quick overview of the library organization, please check the [QuickStart section of the documentation](https://huggingface.co/transformers/quickstart.html).
+For a quick overview of the general philosophy of the library and its organization, please check the
+[QuickStart section of the documentation](https://huggingface.co/transformers/philosophy.html).
# Typical workflow for including a model
Here is an overview of the general workflow:
-- [ ] add model/configuration/tokenization classes
-- [ ] add conversion scripts
-- [ ] add tests
-- [ ] add @slow integration test
-- [ ] finalize
+- [ ] Add model/configuration/tokenization classes.
+- [ ] Add conversion scripts.
+- [ ] Add tests and a @slow integration test.
+- [ ] Document your model.
+- [ ] Finalize.
-Let's detail what should be done at each step
+Let's detail what should be done at each step.
## Adding model/configuration/tokenization classes
Here is the workflow for adding model/configuration/tokenization classes:
-- [ ] copy the python files from the present folder to the main folder and rename them, replacing `xxx` with your model name,
-- [ ] edit the files to replace `XXX` (with various casing) with your model name
-- [ ] copy-paste or create a simple configuration class for your model in the `configuration_...` file
-- [ ] copy-paste or create the code for your model in the `modeling_...` files (PyTorch and TF 2.0)
-- [ ] copy-paste or create a tokenizer class for your model in the `tokenization_...` file
+- [ ] Copy the python files from the present folder to the main folder and rename them, replacing `xxx` with your model
+ name.
+- [ ] Edit the files to replace `XXX` (with various casing) with your model name.
+- [ ] Copy-paste or create a simple configuration class for your model in the `configuration_...` file.
+- [ ] Copy-paste or create the code for your model in the `modeling_...` files (PyTorch and TF 2.0).
+- [ ] Copy-paste or create a tokenizer class for your model in the `tokenization_...` file.
-# Adding conversion scripts
+## Adding conversion scripts
Here is the workflow for the conversion scripts:
-- [ ] copy the conversion script (`convert_...`) from the present folder to the main folder.
-- [ ] edit this script to convert your original checkpoint weights to the current pytorch ones.
+- [ ] Copy the conversion script (`convert_...`) from the present folder to the main folder.
+- [ ] Edit this script to convert your original checkpoint weights to the current pytorch ones.
-# Adding tests:
+## Adding tests:
Here is the workflow for adding tests:
-- [ ] copy the python files from the `tests` sub-folder of the present folder to the `tests` subfolder of the main folder and rename them, replacing `xxx` with your model name,
-- [ ] edit the tests files to replace `XXX` (with various casing) with your model name
-- [ ] edit the tests code as needed
+- [ ] Copy the python files from the `tests` sub-folder of the present folder to the `tests` subfolder of the main
+ folder and rename them, replacing `xxx` with your model name.
+- [ ] Edit the tests files to replace `XXX` (with various casing) with your model name.
+- [ ] Edit the tests code as needed.
-# Final steps
+## Documenting your model:
+
+Here is the workflow for documentation:
+
+- [ ] Make sure all your arguments are properly documented in your configuration and tokenizer.
+- [ ] Most of the documentation of the models is automatically generated, you just have to make sure that
+ `XXX_START_DOCSTRING` contains an introduction to the model you're adding and a link to the original
+ article and that `XXX_INPUTS_DOCSTRING` contains all the inputs of your model.
+- [ ] Create a new page `xxx.rst` in the folder `docs/source/model_doc` and add this file in `docs/source/index.rst`.
+
+Make sure to check you have no sphinx warnings when building the documentation locally and follow our
+[documentation guide](https://github.com/huggingface/transformers/tree/master/docs#writing-documentation---specification).
+
+## Final steps
You can then finish the addition step by adding imports for your classes in the common files:
-- [ ] add import for all the relevant classes in `__init__.py`
-- [ ] add your configuration in `configuration_auto.py`
-- [ ] add your PyTorch and TF 2.0 model respectively in `modeling_auto.py` and `modeling_tf_auto.py`
-- [ ] add your tokenizer in `tokenization_auto.py`
-- [ ] add your models and tokenizer to `pipeline.py`
-- [ ] add a link to your conversion script in the main conversion utility (in `commands/convert.py`)
-- [ ] edit the PyTorch to TF 2.0 conversion script to add your model in the `convert_pytorch_checkpoint_to_tf2.py` file
-- [ ] add a mention of your model in the doc: `README.md` and the documentation itself at `docs/source/pretrained_models.rst`.
-- [ ] upload the pretrained weights, configurations and vocabulary files.
-- [ ] create model card(s) for your models on huggingface.co. For those last two steps, check the [model sharing documentation](https://github.com/huggingface/transformers#quick-tour-of-model-sharing).
+- [ ] Add import for all the relevant classes in `__init__.py`.
+- [ ] Add your configuration in `configuration_auto.py`.
+- [ ] Add your PyTorch and TF 2.0 model respectively in `modeling_auto.py` and `modeling_tf_auto.py`.
+- [ ] Add your tokenizer in `tokenization_auto.py`.
+- [ ] Add your models and tokenizer to `pipeline.py`.
+- [ ] Add a link to your conversion script in the main conversion utility (in `commands/convert.py`).
+- [ ] Edit the PyTorch to TF 2.0 conversion script to add your model in the `convert_pytorch_checkpoint_to_tf2.py`
+ file.
+- [ ] Add a mention of your model in the doc: `README.md` and the documentation itself
+ in `docs/source/index.rst` and `docs/source/pretrained_models.rst`.
+- [ ] Upload the pretrained weights, configurations and vocabulary files.
+- [ ] Create model card(s) for your models on huggingface.co. For those last two steps, check the
+ [model sharing documentation](https://huggingface.co/transformers/model_sharing.html).
diff --git a/templates/adding_a_new_model/configuration_xxx.py b/templates/adding_a_new_model/configuration_xxx.py
index 683b52628a12..5d59949e108b 100644
--- a/templates/adding_a_new_model/configuration_xxx.py
+++ b/templates/adding_a_new_model/configuration_xxx.py
@@ -16,6 +16,7 @@
import logging
+from typing import Callable, Union
from .configuration_utils import PretrainedConfig
@@ -30,85 +31,76 @@
class XxxConfig(PretrainedConfig):
r"""
- :class:`~transformers.XxxConfig` is the configuration class to store the configuration of a
- `XxxModel`.
+ This is the configuration class to store the configuration of a :class:`~transformers.XXXModel`.
+ It is used to instantiate a XXX model according to the specified arguments, defining the model
+ architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+    the XXX `xxx-base-uncased <https://huggingface.co/xxx-base-uncased>`__ architecture.
+ Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
+ to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
+ for more information.
- Arguments:
- vocab_size: Vocabulary size of `inputs_ids` in `XxxModel`.
- hidden_size: Size of the encoder layers and the pooler layer.
- num_hidden_layers: Number of hidden layers in the Transformer encoder.
- num_attention_heads: Number of attention heads for each attention layer in
- the Transformer encoder.
- intermediate_size: The size of the "intermediate" (i.e., feed-forward)
- layer in the Transformer encoder.
- hidden_act: The non-linear activation function (function or string) in the
- encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported.
- hidden_dropout_prob: The dropout probabilitiy for all fully connected
- layers in the embeddings, encoder, and pooler.
- attention_probs_dropout_prob: The dropout ratio for the attention
- probabilities.
- max_position_embeddings: The maximum sequence length that this model might
- ever be used with. Typically set this to something large just in case
- (e.g., 512 or 1024 or 2048).
- type_vocab_size: The vocabulary size of the `token_type_ids` passed into
- `XxxModel`.
- initializer_range: The sttdev of the truncated_normal_initializer for
- initializing all weight matrices.
- layer_norm_eps: The epsilon used by LayerNorm.
+
+ Args:
+ vocab_size (:obj:`int`, optional, defaults to 30522):
+ Vocabulary size of the XXX model. Defines the different tokens that
+ can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XXXModel`.
+ hidden_size (:obj:`int`, optional, defaults to 768):
+ Dimensionality of the encoder layers and the pooler layer.
+ num_hidden_layers (:obj:`int`, optional, defaults to 12):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (:obj:`int`, optional, defaults to 12):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ hidden_act (:obj:`str` or :obj:`function`, optional, defaults to :obj:`"gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler.
+
+ If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+ hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+ attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
+ The dropout ratio for the attention probabilities.
+ max_position_embeddings (:obj:`int`, optional, defaults to 512):
+ The maximum sequence length that this model might ever be used with.
+ Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+ type_vocab_size (:obj:`int`, optional, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed into :class:`~transformers.XXXModel`.
+ initializer_range (:obj:`float`, optional, defaults to 0.02):
+ The standard deviation of the :obj:`truncated_normal_initializer` for initializing all weight matrices.
+ layer_norm_eps (:obj:`float`, optional, defaults to 1e-5):
+ The epsilon used by the layer normalization layers.
+ gradient_checkpointing (:obj:`bool`, optional, defaults to :obj:`False`):
+ If :obj:`True`, use gradient checkpointing to save memory at the expense of slower backward pass.
+ kwargs:
+ Additional arguments for common configurations, passed to :class:`~transformers.PretrainedConfig`.
"""
model_type = "xxx"
def __init__(
self,
- vocab_size=50257,
- n_positions=1024,
- n_ctx=1024,
- n_embd=768,
- n_layer=12,
- n_head=12,
- resid_pdrop=0.1,
- embd_pdrop=0.1,
- attn_pdrop=0.1,
- layer_norm_epsilon=1e-5,
- initializer_range=0.02,
- summary_type="cls_index",
- summary_use_proj=True,
- summary_activation=None,
- summary_proj_to_labels=True,
- summary_first_dropout=0.1,
+ vocab_size: int = 50257,
+ hidden_size: int = 1024,
+ num_hidden_layers: int = 12,
+ num_attention_heads: int = 12,
+ hidden_act: Union[str, Callable] = "gelu",
+ hidden_dropout_prob: float = 0.1,
+ attention_probs_dropout_prob: float = 0.1,
+ max_position_embeddings: int = 512,
+ type_vocab_size: int = 2,
+ initializer_range: float = 0.02,
+ layer_norm_epsilon: float = 1e-5,
+ gradient_checkpointing: bool = False,
**kwargs
):
super().__init__(**kwargs)
self.vocab_size = vocab_size
- self.n_ctx = n_ctx
- self.n_positions = n_positions
- self.n_embd = n_embd
- self.n_layer = n_layer
- self.n_head = n_head
- self.resid_pdrop = resid_pdrop
- self.embd_pdrop = embd_pdrop
- self.attn_pdrop = attn_pdrop
- self.layer_norm_epsilon = layer_norm_epsilon
+ self.hidden_size = hidden_size
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.hidden_act = hidden_act
+ self.hidden_dropout_prob = hidden_dropout_prob
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
+ self.max_position_embeddings = max_position_embeddings
+ self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
- self.summary_type = summary_type
- self.summary_use_proj = summary_use_proj
- self.summary_activation = summary_activation
- self.summary_first_dropout = summary_first_dropout
- self.summary_proj_to_labels = summary_proj_to_labels
-
- @property
- def max_position_embeddings(self):
- return self.n_positions
-
- @property
- def hidden_size(self):
- return self.n_embd
-
- @property
- def num_attention_heads(self):
- return self.n_head
-
- @property
- def num_hidden_layers(self):
- return self.n_layer
+ self.layer_norm_epsilon = layer_norm_epsilon
+ self.gradient_checkpointing = gradient_checkpointing
diff --git a/templates/adding_a_new_model/modeling_tf_xxx.py b/templates/adding_a_new_model/modeling_tf_xxx.py
index 406a5d3c330c..07b47a10a8be 100644
--- a/templates/adding_a_new_model/modeling_tf_xxx.py
+++ b/templates/adding_a_new_model/modeling_tf_xxx.py
@@ -25,12 +25,29 @@
import tensorflow as tf
from .configuration_xxx import XxxConfig
-from .file_utils import add_start_docstrings
-from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list
+from .file_utils import (
+ MULTIPLE_CHOICE_DUMMY_INPUTS,
+ add_code_sample_docstrings,
+ add_start_docstrings,
+ add_start_docstrings_to_callable,
+)
+from .modeling_tf_utils import (
+ TFMaskedLanguageModelingLoss,
+ TFMultipleChoiceLoss,
+ TFPreTrainedModel,
+ TFQuestionAnsweringLoss,
+ TFSequenceClassificationLoss,
+ TFTokenClassificationLoss,
+ get_initializer,
+ shape_list,
+)
+from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__)
+_TOKENIZER_FOR_DOC = "XxxTokenizer"
+
####################################################
# This list contains shortcut names for some of
# the pretrained weights provided with the models
@@ -183,36 +200,33 @@ class TFXxxPreTrainedModel(TFPreTrainedModel):
base_model_prefix = "transformer"
-XXX_START_DOCSTRING = r""" The XXX model was proposed in
- `XXX: Pre-training of Deep Bidirectional Transformers for Language Understanding`_
- by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. It's a bidirectional transformer
- pre-trained using a combination of masked language modeling objective and next sentence prediction
- on a large corpus comprising the Toronto Book Corpus and Wikipedia.
+XXX_START_DOCSTRING = r"""
+ The XXX model was proposed in
+ `XXX: Pre-training of Deep Bidirectional Transformers for Language Understanding
+    <https://arxiv.org/abs/1810.04805>`__ by....
- This model is a tf.keras.Model `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and
+    This model is a `tf.keras.Model <https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model>`__ sub-class.
+ Use it as a regular TF 2.0 Keras Model and
refer to the TF 2.0 documentation for all matter related to general usage and behavior.
- .. _`XXX: Pre-training of Deep Bidirectional Transformers for Language Understanding`:
- https://arxiv.org/abs/1810.04805
+ .. note::
- .. _`tf.keras.Model`:
- https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model
-
- Note on the model inputs:
TF 2.0 models accept two formats as inputs:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional arguments.
- This second option is usefull when using `tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`.
+ This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having
+ all the tensors in the first argument of the model call function: :obj:`model(inputs)`.
- If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument :
+ If you choose this second option, there are three possibilities you can use to gather all the input Tensors
+ in the first positional argument :
- - a single Tensor with input_ids only and nothing else: `model(inputs_ids)
+ - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
- `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
- - a dictionary with one or several input Tensors associaed to the input names given in the docstring:
- `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
+ :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
+ - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+ :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
Parameters:
config (:class:`~transformers.XxxConfig`): Model configuration class with all the parameters of the model.
@@ -221,95 +235,84 @@ class TFXxxPreTrainedModel(TFPreTrainedModel):
"""
XXX_INPUTS_DOCSTRING = r"""
- Inputs:
- **input_ids**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
+ Args:
+ input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`{0}`):
Indices of input sequence tokens in the vocabulary.
- To match pre-training, XXX input sequence should be formatted with [CLS] and [SEP] tokens as follows:
-
- (a) For sequence pairs:
-
- ``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
-
- ``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1``
-
- (b) For single sequences:
-
- ``tokens: [CLS] the dog is hairy . [SEP]``
-
- ``token_type_ids: 0 0 0 0 0 0 0``
-
- Xxx is a model with absolute position embeddings so it's usually advised to pad the inputs on
- the right rather than the left.
Indices can be obtained using :class:`transformers.XxxTokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
- :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
- **attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
+ :func:`transformers.PreTrainedTokenizer.__call__` for details.
+
+ `What are input IDs? <../glossary.html#input-ids>`__
+ attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
Mask to avoid performing attention on padding token indices.
Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
- **token_type_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
+
+ `What are attention masks? <../glossary.html#attention-mask>`__
+ token_type_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
Segment token indices to indicate first and second portions of the inputs.
Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
corresponds to a `sentence B` token
- (see `XXX: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
- **position_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
+
+ `What are token type IDs? <../glossary.html#token-type-ids>`__
+ position_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
Indices of positions of each input sequence tokens in the position embeddings.
Selected in the range ``[0, config.max_position_embeddings - 1]``.
- **head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+
+ `What are position IDs? <../glossary.html#position-ids>`__
+ head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
- ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
- **inputs_embeds**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
- Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
+ :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
+ inputs_embeds (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, embedding_dim)`, `optional`, defaults to :obj:`None`):
+ Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
+ training (:obj:`boolean`, `optional`, defaults to :obj:`False`):
+ Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them
+ (if set to :obj:`False`) for evaluation.
+ output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
+ If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
"""
@add_start_docstrings(
- "The bare Xxx Model transformer outputing raw hidden-states without any specific head on top.",
+    "The bare XXX Model transformer outputting raw hidden-states without any specific head on top.",
XXX_START_DOCSTRING,
- XXX_INPUTS_DOCSTRING,
)
class TFXxxModel(TFXxxPreTrainedModel):
- r"""
- Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
- **last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
+ def __init__(self, config, *inputs, **kwargs):
+ super().__init__(config, *inputs, **kwargs)
+ self.transformer = TFXxxMainLayer(config, name="transformer")
+
+ @add_start_docstrings_to_callable(XXX_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+ @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xxx-base-cased")
+ def call(self, inputs, **kwargs):
+ r"""
+ Returns:
+ :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XxxConfig`) and inputs:
+ last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
- **pooler_output**: ``tf.Tensor`` of shape ``(batch_size, hidden_size)``
+ pooler_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`):
Last layer hidden-state of the first token of the sequence (classification token)
further processed by a Linear layer and a Tanh activation function. The Linear
layer weights are trained from the next sentence prediction (classification)
- objective during Xxx pretraining. This output is usually *not* a good summary
+ objective during XXX pretraining. This output is usually *not* a good summary
of the semantic content of the input, you're often better with averaging or pooling
the sequence of hidden-states for the whole input sequence.
- **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
- list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
- of shape ``(batch_size, sequence_length, hidden_size)``:
- Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- **attentions**: (`optional`, returned when ``output_attentions=True``)
- list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
- Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
- Examples::
-
- import tensorflow as tf
- from transformers import XxxTokenizer, TFXxxModel
-
- tokenizer = XxxTokenizer.from_pretrained('xxx-base-uncased')
- model = TFXxxModel.from_pretrained('xxx-base-uncased')
- input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
- outputs = model(input_ids)
- last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
+ hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+ tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape :obj:`(batch_size, sequence_length, hidden_size)`.
- """
-
- def __init__(self, config, *inputs, **kwargs):
- super().__init__(config, *inputs, **kwargs)
- self.transformer = TFXxxMainLayer(config, name="transformer")
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+ attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+ tuple of :obj:`tf.Tensor` (one for each layer) of shape
+ :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
- def call(self, inputs, **kwargs):
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ """
outputs = self.transformer(inputs, **kwargs)
return outputs
@@ -317,84 +320,89 @@ def call(self, inputs, **kwargs):
TFXxxMLMHead = tf.keras.layers.Layer
-@add_start_docstrings(
- """Xxx Model with a `language modeling` head on top. """, XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING
-)
-class TFXxxForMaskedLM(TFXxxPreTrainedModel):
- r"""
- Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
- **prediction_scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
- Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
- **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
- list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
- of shape ``(batch_size, sequence_length, hidden_size)``:
- Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- **attentions**: (`optional`, returned when ``output_attentions=True``)
- list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
- Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
- Examples::
-
- import tensorflow as tf
- from transformers import XxxTokenizer, TFXxxForMaskedLM
-
- tokenizer = XxxTokenizer.from_pretrained('xxx-base-uncased')
- model = TFXxxForMaskedLM.from_pretrained('xxx-base-uncased')
- input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
- outputs = model(input_ids)
- prediction_scores = outputs[0]
-
- """
-
+@add_start_docstrings("""Xxx Model with a `language modeling` head on top. """, XXX_START_DOCSTRING)
+class TFXxxForMaskedLM(TFXxxPreTrainedModel, TFMaskedLanguageModelingLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = TFXxxMainLayer(config, name="transformer")
self.mlm = TFXxxMLMHead(config, self.transformer.embeddings, name="mlm")
- def call(self, inputs, **kwargs):
- outputs = self.transformer(inputs, **kwargs)
+ @add_start_docstrings_to_callable(XXX_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+ @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xxx-base-cased")
+ def call(
+ self,
+ inputs=None,
+ attention_mask=None,
+ token_type_ids=None,
+ position_ids=None,
+ head_mask=None,
+ inputs_embeds=None,
+ output_attentions=None,
+ output_hidden_states=None,
+ labels=None,
+ training=False,
+ ):
+ r"""
+ labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+ Labels for computing the masked language modeling loss.
+ Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+ Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
+ in ``[0, ..., config.vocab_size]``
+
+ Return:
+ :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XxxConfig`) and inputs:
+ prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+ hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+ tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+ attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+ tuple of :obj:`tf.Tensor` (one for each layer) of shape
+ :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
+
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ """
+ if isinstance(inputs, (tuple, list)):
+ labels = inputs[8] if len(inputs) > 8 else labels
+ if len(inputs) > 8:
+ inputs = inputs[:8]
+ elif isinstance(inputs, (dict, BatchEncoding)):
+ labels = inputs.pop("labels", labels)
+
+ outputs = self.transformer(
+ inputs,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ training=training,
+ )
sequence_output = outputs[0]
- prediction_scores = self.mlm(sequence_output, training=kwargs.get("training", False))
+ prediction_scores = self.mlm(sequence_output, training=training)
outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here
- return outputs # prediction_scores, (hidden_states), (attentions)
+ if labels is not None:
+ loss = self.compute_loss(labels, prediction_scores)
+ outputs = (loss,) + outputs
+
+ return outputs # (loss), prediction_scores, (hidden_states), (attentions)
@add_start_docstrings(
- """Xxx Model transformer with a sequence classification/regression head on top (a linear layer on top of
+ """XXX Model transformer with a sequence classification/regression head on top (a linear layer on top of
the pooled output) e.g. for GLUE tasks. """,
XXX_START_DOCSTRING,
- XXX_INPUTS_DOCSTRING,
)
-class TFXxxForSequenceClassification(TFXxxPreTrainedModel):
- r"""
- Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
- **logits**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, config.num_labels)``
- Classification (or regression if config.num_labels==1) scores (before SoftMax).
- **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
- list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
- of shape ``(batch_size, sequence_length, hidden_size)``:
- Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- **attentions**: (`optional`, returned when ``output_attentions=True``)
- list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
- Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
- Examples::
-
- import tensorflow as tf
- from transformers import XxxTokenizer, TFXxxForSequenceClassification
-
- tokenizer = XxxTokenizer.from_pretrained('xxx-base-uncased')
- model = TFXxxForSequenceClassification.from_pretrained('xxx-base-uncased')
- input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
- outputs = model(input_ids)
- logits = outputs[0]
-
- """
-
+class TFXxxForSequenceClassification(TFXxxPreTrainedModel, TFSequenceClassificationLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
@@ -405,51 +413,216 @@ def __init__(self, config, *inputs, **kwargs):
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
- def call(self, inputs, **kwargs):
- outputs = self.transformer(inputs, **kwargs)
+ @add_start_docstrings_to_callable(XXX_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xxx-base-cased")
+ def call(
+ self,
+ inputs=None,
+ attention_mask=None,
+ token_type_ids=None,
+ position_ids=None,
+ head_mask=None,
+ inputs_embeds=None,
+ output_attentions=None,
+ output_hidden_states=None,
+ labels=None,
+ training=False,
+ ):
+ r"""
+ labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+ Labels for computing the sequence classification/regression loss.
+ Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
+ If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+ If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+
+ Return:
+ :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XxxConfig`) and inputs:
+ logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`):
+ Classification (or regression if config.num_labels==1) scores (before SoftMax).
+ hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+ tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+ attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+ tuple of :obj:`tf.Tensor` (one for each layer) of shape
+ :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
+
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ """
+ if isinstance(inputs, (tuple, list)):
+ labels = inputs[8] if len(inputs) > 8 else labels
+ if len(inputs) > 8:
+ inputs = inputs[:8]
+ elif isinstance(inputs, (dict, BatchEncoding)):
+ labels = inputs.pop("labels", labels)
+
+ outputs = self.transformer(
+ inputs,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ training=training,
+ )
pooled_output = outputs[1]
- pooled_output = self.dropout(pooled_output, training=kwargs.get("training", False))
+ pooled_output = self.dropout(pooled_output, training=training)
logits = self.classifier(pooled_output)
outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here
- return outputs # logits, (hidden_states), (attentions)
+ if labels is not None:
+ loss = self.compute_loss(labels, logits)
+ outputs = (loss,) + outputs
+
+ return outputs # (loss), logits, (hidden_states), (attentions)
@add_start_docstrings(
- """Xxx Model with a token classification head on top (a linear layer on top of
- the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
+ """XXX Model with a multiple choice classification head on top (a linear layer on top of
+ the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
XXX_START_DOCSTRING,
- XXX_INPUTS_DOCSTRING,
)
-class TFXxxForTokenClassification(TFXxxPreTrainedModel):
- r"""
- Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
- **scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
+class TFXxxForMultipleChoice(TFXxxPreTrainedModel, TFMultipleChoiceLoss):
+ def __init__(self, config, *inputs, **kwargs):
+ super().__init__(config, *inputs, **kwargs)
+
+ self.transformer = TFXxxMainLayer(config, name="transformer")
+ self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
+ self.classifier = tf.keras.layers.Dense(
+ 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
+ )
+
+ @property
+ def dummy_inputs(self):
+ """ Dummy inputs to build the network.
+
+ Returns:
+ tf.Tensor with dummy inputs
+ """
+ return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
+
+ @add_start_docstrings_to_callable(XXX_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
+ @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xxx-base-cased")
+ def call(
+ self,
+ inputs,
+ attention_mask=None,
+ token_type_ids=None,
+ position_ids=None,
+ head_mask=None,
+ inputs_embeds=None,
+ output_attentions=None,
+ output_hidden_states=None,
+ labels=None,
+ training=False,
+ ):
+ r"""
+ labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+ Labels for computing the multiple choice classification loss.
+ Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
+ of the input tensors. (see `input_ids` above)
+
+ Return:
+ :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XxxConfig`) and inputs:
+ classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`:
+ `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above).
+
Classification scores (before SoftMax).
- **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
- list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
- of shape ``(batch_size, sequence_length, hidden_size)``:
+ hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+ tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- **attentions**: (`optional`, returned when ``output_attentions=True``)
- list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
- Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+ attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+ tuple of :obj:`tf.Tensor` (one for each layer) of shape
+ :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
- Examples::
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ """
+ if isinstance(inputs, (tuple, list)):
+ input_ids = inputs[0]
+ attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
+ token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids
+ position_ids = inputs[3] if len(inputs) > 3 else position_ids
+ head_mask = inputs[4] if len(inputs) > 4 else head_mask
+ inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
+ output_attentions = inputs[6] if len(inputs) > 6 else output_attentions
+ output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states
+ labels = inputs[8] if len(inputs) > 8 else labels
+ assert len(inputs) <= 9, "Too many inputs."
+ elif isinstance(inputs, (dict, BatchEncoding)):
+ input_ids = inputs.get("input_ids")
+ attention_mask = inputs.get("attention_mask", attention_mask)
+ token_type_ids = inputs.get("token_type_ids", token_type_ids)
+ position_ids = inputs.get("position_ids", position_ids)
+ head_mask = inputs.get("head_mask", head_mask)
+ inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
+ output_attentions = inputs.get("output_attentions", output_attentions)
+ output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
+ labels = inputs.get("labels", labels)
+ assert len(inputs) <= 9, "Too many inputs."
+ else:
+ input_ids = inputs
+
+ if input_ids is not None:
+ num_choices = shape_list(input_ids)[1]
+ seq_length = shape_list(input_ids)[2]
+ else:
+ num_choices = shape_list(inputs_embeds)[1]
+ seq_length = shape_list(inputs_embeds)[2]
+
+ flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
+ flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
+ flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
+ flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
+ flat_inputs_embeds = (
+ tf.reshape(inputs_embeds, (-1, seq_length, shape_list(inputs_embeds)[3]))
+ if inputs_embeds is not None
+ else None
+ )
- import tensorflow as tf
- from transformers import XxxTokenizer, TFXxxForTokenClassification
+ flat_inputs = [
+ flat_input_ids,
+ flat_attention_mask,
+ flat_token_type_ids,
+ flat_position_ids,
+ head_mask,
+ flat_inputs_embeds,
+ output_attentions,
+ output_hidden_states,
+ ]
- tokenizer = XxxTokenizer.from_pretrained('xxx-base-uncased')
- model = TFXxxForTokenClassification.from_pretrained('xxx-base-uncased')
- input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
- outputs = model(input_ids)
- scores = outputs[0]
+ outputs = self.transformer(flat_inputs, training=training)
+
+ pooled_output = outputs[1]
+
+ pooled_output = self.dropout(pooled_output, training=training)
+ logits = self.classifier(pooled_output)
+ reshaped_logits = tf.reshape(logits, (-1, num_choices))
+
+ outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here
+
+ if labels is not None:
+ loss = self.compute_loss(labels, reshaped_logits)
+ outputs = (loss,) + outputs
+
+ return outputs # (loss), reshaped_logits, (hidden_states), (attentions)
- """
+@add_start_docstrings(
+ """XXX Model with a token classification head on top (a linear layer on top of
+ the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
+ XXX_START_DOCSTRING,
+)
+class TFXxxForTokenClassification(TFXxxPreTrainedModel, TFTokenClassificationLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
@@ -460,53 +633,81 @@ def __init__(self, config, *inputs, **kwargs):
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
- def call(self, inputs, **kwargs):
- outputs = self.transformer(inputs, **kwargs)
+ @add_start_docstrings_to_callable(XXX_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xxx-base-cased")
+ def call(
+ self,
+ inputs=None,
+ attention_mask=None,
+ token_type_ids=None,
+ position_ids=None,
+ head_mask=None,
+ inputs_embeds=None,
+ output_attentions=None,
+ output_hidden_states=None,
+ labels=None,
+ training=False,
+ ):
+ r"""
+ labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+ Labels for computing the token classification loss.
+ Indices should be in ``[0, ..., config.num_labels - 1]``.
+
+ Return:
+ :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XxxConfig`) and inputs:
+ scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
+ Classification scores (before SoftMax).
+ hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+ tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+ attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+ tuple of :obj:`tf.Tensor` (one for each layer) of shape
+ :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
+
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ """
+ if isinstance(inputs, (tuple, list)):
+ labels = inputs[8] if len(inputs) > 8 else labels
+ if len(inputs) > 8:
+ inputs = inputs[:8]
+ elif isinstance(inputs, (dict, BatchEncoding)):
+ labels = inputs.pop("labels", labels)
+
+ outputs = self.transformer(
+ inputs,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ training=training,
+ )
sequence_output = outputs[0]
- sequence_output = self.dropout(sequence_output, training=kwargs.get("training", False))
+ sequence_output = self.dropout(sequence_output, training=training)
logits = self.classifier(sequence_output)
outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here
- return outputs # scores, (hidden_states), (attentions)
+ if labels is not None:
+ loss = self.compute_loss(labels, logits)
+ outputs = (loss,) + outputs
+
+ return outputs # (loss), logits, (hidden_states), (attentions)
@add_start_docstrings(
- """Xxx Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
+ """XXX Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
the hidden-states output to compute `span start logits` and `span end logits`). """,
XXX_START_DOCSTRING,
- XXX_INPUTS_DOCSTRING,
)
-class TFXxxForQuestionAnswering(TFXxxPreTrainedModel):
- r"""
- Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
- **start_scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length,)``
- Span-start scores (before SoftMax).
- **end_scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length,)``
- Span-end scores (before SoftMax).
- **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
- list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
- of shape ``(batch_size, sequence_length, hidden_size)``:
- Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- **attentions**: (`optional`, returned when ``output_attentions=True``)
- list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
- Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
- Examples::
-
- import tensorflow as tf
- from transformers import XxxTokenizer, TFXxxForQuestionAnswering
-
- tokenizer = XxxTokenizer.from_pretrained('xxx-base-uncased')
- model = TFXxxForQuestionAnswering.from_pretrained('xxx-base-uncased')
- input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
- outputs = model(input_ids)
- start_scores, end_scores = outputs[:2]
-
- """
-
+class TFXxxForQuestionAnswering(TFXxxPreTrainedModel, TFQuestionAnsweringLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
@@ -516,8 +717,70 @@ def __init__(self, config, *inputs, **kwargs):
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
)
- def call(self, inputs, **kwargs):
- outputs = self.transformer(inputs, **kwargs)
+ @add_start_docstrings_to_callable(XXX_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xxx-base-cased")
+ def call(
+ self,
+ inputs=None,
+ attention_mask=None,
+ token_type_ids=None,
+ position_ids=None,
+ head_mask=None,
+ inputs_embeds=None,
+ output_attentions=None,
+ output_hidden_states=None,
+ start_positions=None,
+ end_positions=None,
+ training=False,
+ ):
+ r"""
+ start_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+ Labels for position (index) of the start of the labelled span for computing the token classification loss.
+ Positions are clamped to the length of the sequence (`sequence_length`).
+            Positions outside of the sequence are not taken into account for computing the loss.
+ end_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+ Labels for position (index) of the end of the labelled span for computing the token classification loss.
+ Positions are clamped to the length of the sequence (`sequence_length`).
+            Positions outside of the sequence are not taken into account for computing the loss.
+
+ Return:
+ :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XxxConfig`) and inputs:
+ start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
+ Span-start scores (before SoftMax).
+ end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
+ Span-end scores (before SoftMax).
+ hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+ tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
+ of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+ attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+ tuple of :obj:`tf.Tensor` (one for each layer) of shape
+ :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
+
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ """
+ if isinstance(inputs, (tuple, list)):
+ start_positions = inputs[8] if len(inputs) > 8 else start_positions
+ end_positions = inputs[9] if len(inputs) > 9 else end_positions
+ if len(inputs) > 8:
+ inputs = inputs[:8]
+ elif isinstance(inputs, (dict, BatchEncoding)):
+ start_positions = inputs.pop("start_positions", start_positions)
+            end_positions = inputs.pop("end_positions", end_positions)
+
+ outputs = self.transformer(
+ inputs,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ training=training,
+ )
sequence_output = outputs[0]
@@ -528,4 +791,10 @@ def call(self, inputs, **kwargs):
outputs = (start_logits, end_logits,) + outputs[2:]
- return outputs # start_logits, end_logits, (hidden_states), (attentions)
+ if start_positions is not None and end_positions is not None:
+ labels = {"start_position": start_positions}
+ labels["end_position"] = end_positions
+ loss = self.compute_loss(labels, outputs[:2])
+ outputs = (loss,) + outputs
+
+ return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions)
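For reference (not part of the patch), a minimal sketch of how the reworked TF heads are meant to be called once the template is filled in. `TFXxxForTokenClassification`, `XxxTokenizer` and the `xxx-base-cased` checkpoint are placeholders taken from the template, not a real importable model:

```python
import tensorflow as tf
from transformers import XxxTokenizer, TFXxxForTokenClassification  # template placeholders

tokenizer = XxxTokenizer.from_pretrained("xxx-base-cased")
model = TFXxxForTokenClassification.from_pretrained("xxx-base-cased")

input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # batch size 1
labels = tf.zeros_like(input_ids)  # dummy per-token labels

# With the explicit keyword signature, passing `labels` prepends the loss to the output tuple.
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
```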
diff --git a/templates/adding_a_new_model/modeling_xxx.py b/templates/adding_a_new_model/modeling_xxx.py
index 6b43993ff09b..f1e031bc3228 100644
--- a/templates/adding_a_new_model/modeling_xxx.py
+++ b/templates/adding_a_new_model/modeling_xxx.py
@@ -27,12 +27,23 @@
from torch.nn import CrossEntropyLoss, MSELoss
from .configuration_xxx import XxxConfig
-from .file_utils import add_start_docstrings
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
+from .modeling_outputs import (
+ BaseModelOutputWithPooling,
+ MaskedLMOutput,
+ MultipleChoiceModelOutput,
+ QuestionAnsweringModelOutput,
+ SequenceClassifierOutput,
+ TokenClassifierOutput,
+)
from .modeling_utils import PreTrainedModel
logger = logging.getLogger(__name__)
+_CONFIG_FOR_DOC = "XxxConfig"
+_TOKENIZER_FOR_DOC = "XxxTokenizer"
+
####################################################
# This list contains shortcut names for some of
# the pretrained weights provided with the models
@@ -197,19 +208,12 @@ def _init_weights(self, module):
XXX_START_DOCSTRING = r""" The XXX model was proposed in
- `XXX: Pre-training of Deep Bidirectional Transformers for Language Understanding`_
- by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. It's a bidirectional transformer
- pre-trained using a combination of masked language modeling objective and next sentence prediction
- on a large corpus comprising the Toronto Book Corpus and Wikipedia.
-
- This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
- refer to the PyTorch documentation for all matter related to general usage and behavior.
-
- .. _`XXX: Pre-training of Deep Bidirectional Transformers for Language Understanding`:
- https://arxiv.org/abs/1810.04805
+ `XXX: Pre-training of Deep Bidirectional Transformers for Language Understanding
+    <https://arxiv.org/abs/1810.04805>`__ by....
- .. _`torch.nn.Module`:
- https://pytorch.org/docs/stable/nn.html#module
+    This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#module>`_ sub-class.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general
+ usage and behavior.
Parameters:
config (:class:`~transformers.XxxConfig`): Model configuration class with all the parameters of the model.
@@ -219,86 +223,53 @@ def _init_weights(self, module):
XXX_INPUTS_DOCSTRING = r"""
Inputs:
- **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+ input_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`):
Indices of input sequence tokens in the vocabulary.
- To match pre-training, XXX input sequence should be formatted with [CLS] and [SEP] tokens as follows:
-
- (a) For sequence pairs:
-
- ``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
-
- ``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1``
-
- (b) For single sequences:
-
- ``tokens: [CLS] the dog is hairy . [SEP]``
-
- ``token_type_ids: 0 0 0 0 0 0 0``
-
- Xxx is a model with absolute position embeddings so it's usually advised to pad the inputs on
- the right rather than the left.
Indices can be obtained using :class:`transformers.XxxTokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
- :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
- **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
+ :func:`transformers.PreTrainedTokenizer.__call__` for details.
+
+ `What are input IDs? <../glossary.html#input-ids>`__
+ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
Mask to avoid performing attention on padding token indices.
Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
- **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+
+ `What are attention masks? <../glossary.html#attention-mask>`__
+ token_type_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
Segment token indices to indicate first and second portions of the inputs.
Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
corresponds to a `sentence B` token
- (see `XXX: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
- **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+
+ `What are token type IDs? <../glossary.html#token-type-ids>`_
+ position_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
Indices of positions of each input sequence tokens in the position embeddings.
Selected in the range ``[0, config.max_position_embeddings - 1]``.
- **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+
+ `What are position IDs? <../glossary.html#position-ids>`_
+ head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
- ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
- **inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
- Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
+ :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
+ inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
+ Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
+ output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
+            If set to ``True``, the attention tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
+ output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
+ If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
+ return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
+ If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
"""
@add_start_docstrings(
- "The bare Xxx Model transformer outputting raw hidden-states without any specific head on top.",
+ "The bare XXX Model transformer outputting raw hidden-states without any specific head on top.",
XXX_START_DOCSTRING,
- XXX_INPUTS_DOCSTRING,
)
class XxxModel(XxxPreTrainedModel):
- r"""
- Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
- **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
- Sequence of hidden-states at the output of the last layer of the model.
- **pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)``
- Last layer hidden-state of the first token of the sequence (classification token)
- further processed by a Linear layer and a Tanh activation function. The Linear
- layer weights are trained from the next sentence prediction (classification)
- objective during Xxx pretraining. This output is usually *not* a good summary
- of the semantic content of the input, you're often better with averaging or pooling
- the sequence of hidden-states for the whole input sequence.
- **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
- list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
- of shape ``(batch_size, sequence_length, hidden_size)``:
- Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- **attentions**: (`optional`, returned when ``output_attentions=True``)
- list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
- Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
- Examples::
-
- tokenizer = XxxTokenizer.from_pretrained('xxx-base-uncased')
- model = XxxModel.from_pretrained('xxx-base-uncased')
- input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
- outputs = model(input_ids)
- last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
-
- """
-
def __init__(self, config):
super().__init__(config)
@@ -322,6 +293,13 @@ def _prune_heads(self, heads_to_prune):
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
+ @add_start_docstrings_to_callable(XXX_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+ @add_code_sample_docstrings(
+ tokenizer_class=_TOKENIZER_FOR_DOC,
+ checkpoint="xxx-base-uncased",
+ output_type=BaseModelOutputWithPooling,
+ config_class=_CONFIG_FOR_DOC,
+ )
def forward(
self,
input_ids=None,
@@ -330,7 +308,16 @@ def forward(
position_ids=None,
head_mask=None,
inputs_embeds=None,
+ output_attentions=None,
+ output_hidden_states=None,
+ return_tuple=None,
):
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
@@ -362,45 +349,21 @@ def forward(
)
encoder_outputs = self.encoder(embedding_output, extended_attention_mask, head_mask=head_mask)
sequence_output = encoder_outputs[0]
- outputs = (sequence_output,) + encoder_outputs[1:] # add hidden_states and attentions if they are here
+ pooled_output = self.pooler(sequence_output)
- return outputs # sequence_output, (hidden_states), (attentions)
+ if return_tuple:
+ return (sequence_output, pooled_output) + encoder_outputs[1:]
+ return BaseModelOutputWithPooling(
+ last_hidden_state=sequence_output,
+ pooler_output=pooled_output,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
-@add_start_docstrings(
- """Xxx Model with a `language modeling` head on top. """, XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING
-)
-class XxxForMaskedLM(XxxPreTrainedModel):
- r"""
- **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
- Labels for computing the masked language modeling loss.
- Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
- Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
- in ``[0, ..., config.vocab_size]``
-
- Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
- **loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
- Masked language modeling loss.
- **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
- Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
- **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
- list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
- of shape ``(batch_size, sequence_length, hidden_size)``:
- Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- **attentions**: (`optional`, returned when ``output_attentions=True``)
- list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
- Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
- Examples::
-
- tokenizer = XxxTokenizer.from_pretrained('xxx-base-uncased')
- model = XxxForMaskedLM.from_pretrained('xxx-base-uncased')
- input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
- outputs = model(input_ids, masked_lm_labels=input_ids)
- loss, prediction_scores = outputs[:2]
-
- """
+@add_start_docstrings("""XXX Model with a `language modeling` head on top. """, XXX_START_DOCSTRING)
+class XxxForMaskedLM(XxxPreTrainedModel):
def __init__(self, config):
super().__init__(config)
@@ -412,6 +375,13 @@ def __init__(self, config):
def get_output_embeddings(self):
return self.lm_head
+ @add_start_docstrings_to_callable(XXX_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+ @add_code_sample_docstrings(
+ tokenizer_class=_TOKENIZER_FOR_DOC,
+ checkpoint="xxx-base-uncased",
+ output_type=MaskedLMOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
def forward(
self,
input_ids=None,
@@ -420,8 +390,19 @@ def forward(
position_ids=None,
head_mask=None,
inputs_embeds=None,
- masked_lm_labels=None,
+ labels=None,
+ output_attentions=None,
+ output_hidden_states=None,
+ return_tuple=None,
):
+ r"""
+ labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+ Labels for computing the masked language modeling loss.
+ Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+ Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
+ in ``[0, ..., config.vocab_size]``
+ """
+ return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
outputs = self.transformer(
input_ids,
@@ -430,58 +411,37 @@ def forward(
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_tuple=return_tuple,
)
sequence_output = outputs[0]
prediction_scores = self.cls(sequence_output)
- outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here
- if masked_lm_labels is not None:
- loss_fct = CrossEntropyLoss()
- masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
- outputs = (masked_lm_loss,) + outputs
-
- return outputs # (masked_lm_loss), prediction_scores, (hidden_states), (attentions)
+ masked_lm_loss = None
+ if labels is not None:
+ loss_fct = CrossEntropyLoss() # -100 index = padding token
+ masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
+
+ if return_tuple:
+ output = (prediction_scores,) + outputs[2:]
+ return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+ return MaskedLMOutput(
+ loss=masked_lm_loss,
+ logits=prediction_scores,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
@add_start_docstrings(
- """Xxx Model transformer with a sequence classification/regression head on top (a linear layer on top of
+ """XXX Model transformer with a sequence classification/regression head on top (a linear layer on top of
the pooled output) e.g. for GLUE tasks. """,
XXX_START_DOCSTRING,
- XXX_INPUTS_DOCSTRING,
)
class XxxForSequenceClassification(XxxPreTrainedModel):
- r"""
- **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
- Labels for computing the sequence classification/regression loss.
- Indices should be in ``[0, ..., config.num_labels - 1]``.
- If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
- If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
-
- Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
- **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
- Classification (or regression if config.num_labels==1) loss.
- **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
- Classification (or regression if config.num_labels==1) scores (before SoftMax).
- **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
- list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
- of shape ``(batch_size, sequence_length, hidden_size)``:
- Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- **attentions**: (`optional`, returned when ``output_attentions=True``)
- list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
- Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
- Examples::
-
- tokenizer = XxxTokenizer.from_pretrained('xxx-base-uncased')
- model = XxxForSequenceClassification.from_pretrained('xxx-base-uncased')
- input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
- labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
- outputs = model(input_ids, labels=labels)
- loss, logits = outputs[:2]
-
- """
-
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
@@ -492,6 +452,13 @@ def __init__(self, config):
self.init_weights()
+ @add_start_docstrings_to_callable(XXX_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+ @add_code_sample_docstrings(
+ tokenizer_class=_TOKENIZER_FOR_DOC,
+ checkpoint="xxx-base-uncased",
+ output_type=SequenceClassifierOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
def forward(
self,
input_ids=None,
@@ -501,7 +468,18 @@ def forward(
head_mask=None,
inputs_embeds=None,
labels=None,
+ output_attentions=None,
+ output_hidden_states=None,
+ return_tuple=None,
):
+ r"""
+ labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+ Labels for computing the sequence classification/regression loss.
+ Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
+ If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+ If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ """
+ return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
outputs = self.transformer(
input_ids,
@@ -510,6 +488,9 @@ def forward(
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_tuple=return_tuple,
)
pooled_output = outputs[1]
@@ -517,8 +498,7 @@ def forward(
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
- outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here
-
+ loss = None
if labels is not None:
if self.num_labels == 1:
# We are doing regression
@@ -527,47 +507,108 @@ def forward(
else:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
- outputs = (loss,) + outputs
- return outputs # (loss), logits, (hidden_states), (attentions)
+ if return_tuple:
+ output = (logits,) + outputs[2:]
+ return ((loss,) + output) if loss is not None else output
+
+ return SequenceClassifierOutput(
+ loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+ )
@add_start_docstrings(
- """Xxx Model with a token classification head on top (a linear layer on top of
- the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
+ """XXX Model with a multiple choice classification head on top (a linear layer on top of
+ the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
XXX_START_DOCSTRING,
- XXX_INPUTS_DOCSTRING,
)
-class XxxForTokenClassification(XxxPreTrainedModel):
- r"""
- **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
- Labels for computing the token classification loss.
- Indices should be in ``[0, ..., config.num_labels - 1]``.
+class XxxForMultipleChoice(XxxPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
- Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
- **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
- Classification loss.
- **scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
- Classification scores (before SoftMax).
- **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
- list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
- of shape ``(batch_size, sequence_length, hidden_size)``:
- Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- **attentions**: (`optional`, returned when ``output_attentions=True``)
- list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
- Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
- Examples::
-
- tokenizer = XxxTokenizer.from_pretrained('xxx-base-uncased')
- model = XxxForTokenClassification.from_pretrained('xxx-base-uncased')
- input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
- labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
- outputs = model(input_ids, labels=labels)
- loss, scores = outputs[:2]
+ self.transformer = XxxModel(config)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+ self.classifier = nn.Linear(config.hidden_size, 1)
- """
+ self.init_weights()
+
+ @add_start_docstrings_to_callable(XXX_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
+ @add_code_sample_docstrings(
+ tokenizer_class=_TOKENIZER_FOR_DOC,
+ checkpoint="xxx-base-uncased",
+ output_type=MultipleChoiceModelOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids=None,
+ attention_mask=None,
+ token_type_ids=None,
+ position_ids=None,
+ head_mask=None,
+ inputs_embeds=None,
+ labels=None,
+ output_attentions=None,
+ output_hidden_states=None,
+ return_tuple=None,
+ ):
+ r"""
+ labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+ Labels for computing the multiple choice classification loss.
+ Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension
+ of the input tensors. (see `input_ids` above)
+ """
+ return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
+
+ input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
+ attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
+ token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
+ position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
+ inputs_embeds = (
+ inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
+ if inputs_embeds is not None
+ else None
+ )
+
+ outputs = self.transformer(
+ input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_tuple=return_tuple,
+ )
+
+ pooled_output = outputs[1]
+
+ pooled_output = self.dropout(pooled_output)
+ logits = self.classifier(pooled_output)
+ reshaped_logits = logits.view(-1, num_choices)
+
+ loss = None
+ if labels is not None:
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(reshaped_logits, labels)
+
+ if return_tuple:
+ output = (reshaped_logits,) + outputs[2:]
+ return ((loss,) + output) if loss is not None else output
+
+ return MultipleChoiceModelOutput(
+ loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+ )
+
+@add_start_docstrings(
+ """XXX Model with a token classification head on top (a linear layer on top of
+ the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
+ XXX_START_DOCSTRING,
+)
+class XxxForTokenClassification(XxxPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
@@ -578,6 +619,13 @@ def __init__(self, config):
self.init_weights()
+ @add_start_docstrings_to_callable(XXX_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+ @add_code_sample_docstrings(
+ tokenizer_class=_TOKENIZER_FOR_DOC,
+ checkpoint="xxx-base-uncased",
+ output_type=TokenClassifierOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
def forward(
self,
input_ids=None,
@@ -587,7 +635,16 @@ def forward(
head_mask=None,
inputs_embeds=None,
labels=None,
+ output_attentions=None,
+ output_hidden_states=None,
+ return_tuple=None,
):
+ r"""
+ labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+ Labels for computing the token classification loss.
+ Indices should be in ``[0, ..., config.num_labels - 1]``.
+ """
+ return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
outputs = self.transformer(
input_ids,
@@ -596,6 +653,9 @@ def forward(
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_tuple=return_tuple,
)
sequence_output = outputs[0]
@@ -603,70 +663,35 @@ def forward(
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)
- outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here
+ loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
# Only keep active parts of the loss
if attention_mask is not None:
active_loss = attention_mask.view(-1) == 1
- active_logits = logits.view(-1, self.num_labels)[active_loss]
- active_labels = labels.view(-1)[active_loss]
+ active_logits = logits.view(-1, self.num_labels)
+ active_labels = torch.where(
+ active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
+ )
loss = loss_fct(active_logits, active_labels)
else:
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
- outputs = (loss,) + outputs
- return outputs # (loss), scores, (hidden_states), (attentions)
+ if return_tuple:
+ output = (logits,) + outputs[2:]
+ return ((loss,) + output) if loss is not None else output
+
+ return TokenClassifierOutput(
+ loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+ )
@add_start_docstrings(
- """Xxx Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
- the hidden-states output to compute `span start logits` and `span end logits`). """,
+ """XXX Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
+ layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """,
XXX_START_DOCSTRING,
- XXX_INPUTS_DOCSTRING,
)
class XxxForQuestionAnswering(XxxPreTrainedModel):
- r"""
- **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
- Labels for position (index) of the start of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (`sequence_length`).
- Position outside of the sequence are not taken into account for computing the loss.
- **end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
- Labels for position (index) of the end of the labelled span for computing the token classification loss.
- Positions are clamped to the length of the sequence (`sequence_length`).
- Position outside of the sequence are not taken into account for computing the loss.
-
- Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
- **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
- Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
- **start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
- Span-start scores (before SoftMax).
- **end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
- Span-end scores (before SoftMax).
- **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
- list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
- of shape ``(batch_size, sequence_length, hidden_size)``:
- Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- **attentions**: (`optional`, returned when ``output_attentions=True``)
- list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
- Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
- Examples::
-
- tokenizer = XxxTokenizer.from_pretrained('xxx-base-uncased')
- model = XxxForQuestionAnswering.from_pretrained('xxx-large-uncased-whole-word-masking-finetuned-squad')
- question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
- input_text = "[CLS] " + question + " [SEP] " + text + " [SEP]"
- input_ids = tokenizer.encode(input_text)
- token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))]
- start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids]))
- all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
- print(' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]))
- # a nice puppet
-
-
- """
-
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
@@ -676,6 +701,13 @@ def __init__(self, config):
self.init_weights()
+ @add_start_docstrings_to_callable(XXX_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+ @add_code_sample_docstrings(
+ tokenizer_class=_TOKENIZER_FOR_DOC,
+ checkpoint="xxx-base-uncased",
+ output_type=QuestionAnsweringModelOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
def forward(
self,
input_ids=None,
@@ -686,7 +718,21 @@ def forward(
inputs_embeds=None,
start_positions=None,
end_positions=None,
+ output_attentions=None,
+ output_hidden_states=None,
+ return_tuple=None,
):
+ r"""
+ start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+ Labels for position (index) of the start of the labelled span for computing the token classification loss.
+ Positions are clamped to the length of the sequence (`sequence_length`).
+            Positions outside of the sequence are not taken into account for computing the loss.
+ end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+ Labels for position (index) of the end of the labelled span for computing the token classification loss.
+ Positions are clamped to the length of the sequence (`sequence_length`).
+            Positions outside of the sequence are not taken into account for computing the loss.
+ """
+ return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
outputs = self.transformer(
input_ids,
@@ -695,6 +741,9 @@ def forward(
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_tuple=return_tuple,
)
sequence_output = outputs[0]
@@ -704,7 +753,7 @@ def forward(
start_logits = start_logits.squeeze(-1)
end_logits = end_logits.squeeze(-1)
- outputs = (start_logits, end_logits,) + outputs[2:]
+ total_loss = None
if start_positions is not None and end_positions is not None:
# If we are on multi-GPU, split add a dimension
if len(start_positions.size()) > 1:
@@ -720,6 +769,15 @@ def forward(
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
- outputs = (total_loss,) + outputs
- return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions)
+ if return_tuple:
+ output = (start_logits, end_logits) + outputs[2:]
+ return ((total_loss,) + output) if total_loss is not None else output
+
+ return QuestionAnsweringModelOutput(
+ loss=total_loss,
+ start_logits=start_logits,
+ end_logits=end_logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
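A companion sketch for the PyTorch side (same placeholder caveat as above): the heads now return dataclass outputs such as `SequenceClassifierOutput` by default, and `return_tuple=True` restores the legacy tuple format.

```python
import torch
from transformers import XxxTokenizer, XxxForSequenceClassification  # template placeholders

tokenizer = XxxTokenizer.from_pretrained("xxx-base-uncased")
model = XxxForSequenceClassification.from_pretrained("xxx-base-uncased")

input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # batch size 1
labels = torch.tensor([1]).unsqueeze(0)  # batch size 1

# Default: a SequenceClassifierOutput dataclass with named fields.
output = model(input_ids, labels=labels)
loss, logits = output.loss, output.logits

# Opt back into the plain tuple format.
loss, logits = model(input_ids, labels=labels, return_tuple=True)[:2]
```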
diff --git a/templates/adding_a_new_model/tokenization_xxx.py b/templates/adding_a_new_model/tokenization_xxx.py
index 91dc7f8c0b90..c45873a9f303 100644
--- a/templates/adding_a_new_model/tokenization_xxx.py
+++ b/templates/adding_a_new_model/tokenization_xxx.py
@@ -18,6 +18,7 @@
import collections
import logging
import os
+from typing import List, Optional
from .tokenization_utils import PreTrainedTokenizer
@@ -77,12 +78,37 @@ def load_vocab(vocab_file):
class XxxTokenizer(PreTrainedTokenizer):
r"""
- Constructs a XxxTokenizer.
- :class:`~transformers.XxxTokenizer` runs end-to-end tokenization: punctuation splitting + wordpiece
+ Constructs a XXX tokenizer. Based on XXX.
+
+ This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
+ should refer to the superclass for more information regarding methods.
Args:
- vocab_file: Path to a one-wordpiece-per-line vocabulary file
- do_lower_case: Whether to lower case the input. Only has an effect when do_basic_tokenize=True
+ vocab_file (:obj:`str`):
+ File containing the vocabulary.
+ do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ Whether to lowercase the input when tokenizing.
+ do_basic_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ Whether to do basic tokenization before WordPiece.
+ never_split (:obj:`Iterable`, `optional`, defaults to :obj:`None`):
+ Collection of tokens which will never be split during tokenization. Only has an effect when
+ :obj:`do_basic_tokenize=True`
+ unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`):
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+ token instead.
+ sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
+ The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
+ for sequence classification or for a text and a question for question answering.
+ It is also used as the last token of a sequence built with special tokens.
+ pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`):
+ The token used for padding, for example when batching sequences of different lengths.
+ cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
+ The classifier token which is used when doing sequence classification (classification of the whole
+ sequence instead of per-token classification). It is the first token of the sequence when built with
+ special tokens.
+ mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
+ The token used for masking values. This is the token used when training this model with masked language
+ modeling. This is the token which the model will try to predict.
"""
vocab_files_names = VOCAB_FILES_NAMES
@@ -94,21 +120,16 @@ def __init__(
self,
vocab_file,
do_lower_case=True,
+ do_basic_tokenize=True,
+ never_split=None,
unk_token="[UNK]",
sep_token="[SEP]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
+ tokenize_chinese_chars=True,
**kwargs
):
- """Constructs a XxxTokenizer.
-
- Args:
- **vocab_file**: Path to a one-wordpiece-per-line vocabulary file
- **do_lower_case**: (`optional`) boolean (default True)
- Whether to lower case the input
- Only has an effect when do_basic_tokenize=True
- """
super().__init__(
unk_token=unk_token,
sep_token=sep_token,
@@ -121,22 +142,35 @@ def __init__(
if not os.path.isfile(vocab_file):
raise ValueError(
"Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
- "model use `tokenizer = XxxTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)
+ "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)
)
self.vocab = load_vocab(vocab_file)
+ self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
+ self.do_basic_tokenize = do_basic_tokenize
+ # Replace and adapt
+ # if do_basic_tokenize:
+ # self.basic_tokenizer = BasicTokenizer(
+ # do_lower_case=do_lower_case, never_split=never_split, tokenize_chinese_chars=tokenize_chinese_chars
+ # )
+ # self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
@property
def vocab_size(self):
return len(self.vocab)
+ def get_vocab(self):
+ return dict(self.vocab, **self.added_tokens_encoder)
+
def _tokenize(self, text):
- """ Take as input a string and return a list of strings (tokens) for words/sub-words
- """
split_tokens = []
if self.do_basic_tokenize:
for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
- for sub_token in self.wordpiece_tokenizer.tokenize(token):
- split_tokens.append(sub_token)
+
+ # If the token is part of the never_split set
+ if token in self.basic_tokenizer.never_split:
+ split_tokens.append(token)
+ else:
+ split_tokens += self.wordpiece_tokenizer.tokenize(token)
else:
split_tokens = self.wordpiece_tokenizer.tokenize(text)
return split_tokens
@@ -154,13 +188,25 @@ def convert_tokens_to_string(self, tokens):
out_string = " ".join(tokens).replace(" ##", "").strip()
return out_string
- def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+ def build_inputs_with_special_tokens(
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+ ) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks
by concatenating and adding special tokens.
A BERT sequence has the following format:
- single sequence: [CLS] X [SEP]
- pair of sequences: [CLS] A [SEP] B [SEP]
+
+ - single sequence: ``[CLS] X [SEP]``
+ - pair of sequences: ``[CLS] A [SEP] B [SEP]``
+
+ Args:
+ token_ids_0 (:obj:`List[int]`):
+ List of IDs to which the special tokens will be added
+ token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+ Optional second list of IDs for sequence pairs.
+
+ Returns:
+ :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
@@ -168,20 +214,23 @@ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + token_ids_1 + sep
- def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
+ def get_special_tokens_mask(
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+ ) -> List[int]:
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
- special tokens using the tokenizer ``prepare_for_model`` methods.
+ special tokens using the tokenizer ``prepare_for_model`` method.
Args:
- token_ids_0: list of ids (must not contain special tokens)
- token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
- for sequence pairs
- already_has_special_tokens: (default False) Set to True if the token list is already formated with
- special tokens for the model
+ token_ids_0 (:obj:`List[int]`):
+ List of ids.
+ token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+ Optional second list of IDs for sequence pairs.
+ already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ Set to True if the token list is already formatted with special tokens for the model
Returns:
- A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+ :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
@@ -196,14 +245,29 @@ def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_spe
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1]
- def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
+ def create_token_type_ids_from_sequences(
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+ ) -> List[int]:
"""
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
A BERT sequence pair mask has the following format:
- 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
- | first sequence | second sequence
+
+ ::
+
+ 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+ | first sequence | second sequence |
if token_ids_1 is None, only returns the first portion of the mask (0's).
+
+ Args:
+ token_ids_0 (:obj:`List[int]`):
+ List of ids.
+ token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+ Optional second list of IDs for sequence pairs.
+
+ Returns:
+ :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+ sequence(s).
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
@@ -212,7 +276,16 @@ def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
def save_vocabulary(self, vocab_path):
- """Save the tokenizer vocabulary to a directory or file."""
+ """
+        Save the tokenizer vocabulary (one token per line) to a directory or file.
+
+ Args:
+ vocab_path (:obj:`str`):
+ The directory in which to save the vocabulary.
+
+ Returns:
+ :obj:`Tuple(str)`: Paths to the files saved.
+ """
index = 0
if os.path.isdir(vocab_path):
vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"])
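A short illustration of the special-token layout documented in the new tokenizer docstrings, reusing the sequence-pair example the old docstring spelled out (placeholder tokenizer, BERT-style `[CLS]`/`[SEP]` conventions assumed):

```python
from transformers import XxxTokenizer  # template placeholder

tokenizer = XxxTokenizer.from_pretrained("xxx-base-uncased")

ids_a = tokenizer.convert_tokens_to_ids(["is", "this", "jack", "##son", "##ville", "?"])
ids_b = tokenizer.convert_tokens_to_ids(["no", "it", "is", "not", "."])

# [CLS] A [SEP] B [SEP]
pair_ids = tokenizer.build_inputs_with_special_tokens(ids_a, ids_b)
# 0s over "[CLS] A [SEP]", 1s over "B [SEP]"
token_type_ids = tokenizer.create_token_type_ids_from_sequences(ids_a, ids_b)
assert len(pair_ids) == len(token_type_ids)
```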
From c69ea5efc4eac65b183e8d07b1bf91d20bbe0c8c Mon Sep 17 00:00:00 2001
From: Sam Shleifer
Date: Fri, 24 Jul 2020 15:34:16 -0400
Subject: [PATCH 007/127] [CI] Don't test apex (#6021)
---
examples/seq2seq/test_bash_script.py | 3 +--
examples/seq2seq/test_seq2seq_examples.py | 2 +-
2 files changed, 2 insertions(+), 3 deletions(-)
diff --git a/examples/seq2seq/test_bash_script.py b/examples/seq2seq/test_bash_script.py
index f3470113bd96..a9cb6e3a0926 100644
--- a/examples/seq2seq/test_bash_script.py
+++ b/examples/seq2seq/test_bash_script.py
@@ -55,9 +55,8 @@ def test_train_mbart_cc25_enro_script():
if CUDA_AVAILABLE:
gpus = 1 # torch.cuda.device_count()
else:
- bash_script = bash_script.replace("--fp16", "")
gpus = 0
-
+ bash_script = bash_script.replace("--fp16", "")
testargs = (
["finetune.py"]
+ bash_script.split()
diff --git a/examples/seq2seq/test_seq2seq_examples.py b/examples/seq2seq/test_seq2seq_examples.py
index e25fb0b0e7ed..191bbfac70fd 100644
--- a/examples/seq2seq/test_seq2seq_examples.py
+++ b/examples/seq2seq/test_seq2seq_examples.py
@@ -43,7 +43,7 @@
"student_decoder_layers": 1,
"val_check_interval": 1.0,
"output_dir": "",
- "fp16": CUDA_AVAILABLE,
+ "fp16": False, # TODO(SS): set this to CUDA_AVAILABLE if ci installs apex or start using native amp
"no_teacher": False,
"fp16_opt_level": "O1",
"gpus": 1 if CUDA_AVAILABLE else 0,
From daa5dd12025764943ff4a8d788859d320ffbf116 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Sun, 26 Jul 2020 11:09:14 -0700
Subject: [PATCH 008/127] add a summary report flag for run_examples on CI
(#6035)
Currently, it's hard to tell which example tests were run on CI and which weren't. Adding the `-rA` flag to `pytest` will now include a summary like:
```
==================================================================== short test summary info =====================================================================
PASSED examples/test_examples.py::ExamplesTests::test_generation
PASSED examples/test_examples.py::ExamplesTests::test_run_glue
PASSED examples/test_examples.py::ExamplesTests::test_run_language_modeling
PASSED examples/test_examples.py::ExamplesTests::test_run_squad
FAILED examples/test_examples.py::ExamplesTests::test_run_pl_glue - AttributeError: 'Namespace' object has no attribute 'gpus'
============================================================ 1 failed, 4 passed, 8 warnings in 42.96s ============================================================
```
which makes it easier to validate whether a given example is covered by CI.
---
.circleci/config.yml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.circleci/config.yml b/.circleci/config.yml
index 4f681c6454a9..4d76bb6ae27e 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -73,7 +73,7 @@ jobs:
- checkout
- run: sudo pip install .[sklearn,torch,testing]
- run: sudo pip install -r examples/requirements.txt
- - run: python -m pytest -n 8 --dist=loadfile -s ./examples/ | tee output.txt
+ - run: python -m pytest -n 8 --dist=loadfile -rA -s ./examples/ | tee output.txt
- store_artifacts:
path: ~/transformers/output.txt
destination: test_output.txt
From fb0589a03d762c426c5e996b1fb5c2c2ef822bc2 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Sun, 26 Jul 2020 11:29:54 -0700
Subject: [PATCH 009/127] don't complain about missing W&B when
WANDB_DISABLED=true (#6036)
* don't complain about missing W&B when WANDB_DISABLED=true
* reformat to elif
* typo
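For context, a minimal sketch of how a user silences the hint after this change; the only assumption is that `WANDB_DISABLED` is set before the Trainer is constructed:

```python
import os

# With the new `elif`, the "W&B is not installed" message is only logged when
# WANDB_DISABLED is not set to "true", so this opt-out keeps the logs quiet.
os.environ["WANDB_DISABLED"] = "true"
```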
---
src/transformers/trainer.py | 2 +-
src/transformers/trainer_tf.py | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index df39eec5c3f0..7a6db778878f 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -203,7 +203,7 @@ def __init__(
)
if is_wandb_available():
self.setup_wandb()
- else:
+ elif os.environ.get("WANDB_DISABLED") != "true":
logger.info(
"You are instantiating a Trainer but W&B is not installed. To use wandb logging, "
"run `pip install wandb; wandb login` see https://docs.wandb.com/huggingface."
diff --git a/src/transformers/trainer_tf.py b/src/transformers/trainer_tf.py
index accbf0c7cf27..8e56251b27a0 100644
--- a/src/transformers/trainer_tf.py
+++ b/src/transformers/trainer_tf.py
@@ -90,7 +90,7 @@ def __init__(
if is_wandb_available():
self._setup_wandb()
- else:
+ elif os.environ.get("WANDB_DISABLED") != "true":
logger.info(
"You are instantiating a Trainer but W&B is not installed. To use wandb logging, "
"run `pip install wandb; wandb login` see https://docs.wandb.com/huggingface."
From f7f03b22dc15543317635770f312adf4513303d0 Mon Sep 17 00:00:00 2001
From: Rodolfo De Nadai
Date: Sun, 26 Jul 2020 18:31:49 -0300
Subject: [PATCH 010/127] Update README.md of my model (#6042)
---
model_cards/rdenadai/BR_BERTo/README.md | 42 ++++---------------------
1 file changed, 6 insertions(+), 36 deletions(-)
diff --git a/model_cards/rdenadai/BR_BERTo/README.md b/model_cards/rdenadai/BR_BERTo/README.md
index 5872e2aa359f..59609b9fb71a 100644
--- a/model_cards/rdenadai/BR_BERTo/README.md
+++ b/model_cards/rdenadai/BR_BERTo/README.md
@@ -16,41 +16,11 @@ Portuguese (Brazil) model for text inference.
Trained on a corpus of 5_258_624 sentences, with 132_807_374 non unique tokens (992_418 unique tokens).
-But since my machine doesn`t support bigger model, at the end it has a vocab size of 54_000 tokens. The rest of the parameters are the default used in the HuggingFace tutorial.
+- Vocab size: 220_000
+- RobertaForMaskedLM size: 32
+- Num train epochs: 2
+- Time to train: ~23 hours (on GCP with an Nvidia T4)
-[How to train a new language model from scratch using Transformers and Tokenizers](https://huggingface.co/blog/how-to-train)
-
-## Results
+I followed the great tutorial from the HuggingFace team:
-```python
-fill_mask("gostei muito dessa ")
-
-#[{'sequence': 'gostei muito dessa experiência',
-# 'score': 0.0719294399023056,
-# 'token': 2322,
-# 'token_str': 'Ä experiência'},
-# {'sequence': 'gostei muito dessa diferença',
-# 'score': 0.05286405608057976,
-# 'token': 3472,
-# 'token_str': 'Ä diferença'},
-# {'sequence': 'gostei muito dessa atenção',
-# 'score': 0.027575725689530373,
-# 'token': 2557,
-# 'token_str': 'Ä atenção'},
-# {'sequence': 'gostei muito dessa história',
-# 'score': 0.026764703914523125,
-# 'token': 1329,
-# 'token_str': 'Ä história'},
-# {'sequence': 'gostei muito dessa razão',
-# 'score': 0.0250675268471241,
-# 'token': 3323,
-# 'token_str': 'Ä razão'},
-# {'sequence': 'gostei muito dessa resposta',
-# 'score': 0.024784332141280174,
-# 'token': 2403,
-# 'token_str': 'Ä resposta'},
-# {'sequence': 'gostei muito dessa dose',
-# 'score': 0.01720510423183441,
-# 'token': 1042,
-# 'token_str': 'Ä dose'}]
-```
+[How to train a new language model from scratch using Transformers and Tokenizers](https://huggingface.co/blog/how-to-train)
From a9585fd10728a863b04bcaf479b81552b3efb956 Mon Sep 17 00:00:00 2001
From: Vamsi995 <52487689+Vamsi995@users.noreply.github.com>
Date: Mon, 27 Jul 2020 14:42:45 +0530
Subject: [PATCH 011/127] Model card for Vamsi/T5_Paraphrase_Paws (#6037)
* Model card for Vamsi/T5_Paraphrase_Paws
* Update model_cards/Vamsi/T5_Paraphrase_Paws/README.md
Co-authored-by: Julien Chaumond
---
.../Vamsi/T5_Paraphrase_Paws/README.md | 51 +++++++++++++++++++
1 file changed, 51 insertions(+)
create mode 100644 model_cards/Vamsi/T5_Paraphrase_Paws/README.md
diff --git a/model_cards/Vamsi/T5_Paraphrase_Paws/README.md b/model_cards/Vamsi/T5_Paraphrase_Paws/README.md
new file mode 100644
index 000000000000..8d54e825d795
--- /dev/null
+++ b/model_cards/Vamsi/T5_Paraphrase_Paws/README.md
@@ -0,0 +1,51 @@
+---
+language: "en"
+tags:
+- paraphrase-generation
+- text-generation
+- Conditional Generation
+inference: false
+---
+​
+# Paraphrase-Generation
+​
+## Model description
+​
+T5 model for generating paraphrases of English sentences. Trained on the [Google PAWS](https://github.com/google-research-datasets/paws) dataset.
+​
+## How to use
+​
+PyTorch and TF models are available.
+​
+```python
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+​
+tokenizer = AutoTokenizer.from_pretrained("Vamsi/T5_Paraphrase_Paws")
+model = AutoModelForSeq2SeqLM.from_pretrained("Vamsi/T5_Paraphrase_Paws").to("cuda")  # move the model to the GPU so it matches the .to("cuda") inputs below
+​
+sentence = "This is something which i cannot understand at all"
+
+text = "paraphrase: " + sentence + " </s>"
+
+encoding = tokenizer.encode_plus(text, pad_to_max_length=True, return_tensors="pt")
+input_ids, attention_masks = encoding["input_ids"].to("cuda"), encoding["attention_mask"].to("cuda")
+
+
+outputs = model.generate(
+ input_ids=input_ids, attention_mask=attention_masks,
+ max_length=256,
+ do_sample=True,
+ top_k=120,
+ top_p=0.95,
+ early_stopping=True,
+ num_return_sequences=5
+)
+
+for output in outputs:
+ line = tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+ print(line)
+​
+
+```
+
+For more reference on training your own T5 model or using this model, do check out [Paraphrase Generation](https://github.com/Vamsi995/Paraphrase-Generator).
From 7969e96f4a8bd9d84e526e18d4d79ed51d0a64bd Mon Sep 17 00:00:00 2001
From: Pavel Soriano
Date: Mon, 27 Jul 2020 11:15:08 +0200
Subject: [PATCH 012/127] draft etalab QA model (#6040)
---
.../README.md | 99 +++++++++++++++++++
1 file changed, 99 insertions(+)
create mode 100644 model_cards/etalab-ia/camembert-base-squadFR-fquad-piaf/README.md
diff --git a/model_cards/etalab-ia/camembert-base-squadFR-fquad-piaf/README.md b/model_cards/etalab-ia/camembert-base-squadFR-fquad-piaf/README.md
new file mode 100644
index 000000000000..38c2411a7ce1
--- /dev/null
+++ b/model_cards/etalab-ia/camembert-base-squadFR-fquad-piaf/README.md
@@ -0,0 +1,99 @@
+---
+language: fr
+---
+
+# camembert-base-squadFR-fquad-piaf
+
+## Description
+
+French question-answering model, using base [CamemBERT](https://camembert-model.fr/) fine-tuned on a combination of three French Q&A datasets:
+
+1. [PIAFv1.1](https://www.data.gouv.fr/en/datasets/piaf-le-dataset-francophone-de-questions-reponses/)
+2. [FQuADv1.0](https://fquad.illuin.tech/)
+3. [SQuAD-FR (SQuAD automatically translated to French)](https://github.com/Alikabbadj/French-SQuAD)
+
+## Training hyperparameters
+
+```shell
+python run_squad.py \
+--model_type camembert \
+--model_name_or_path camembert-base \
+--do_train --do_eval \
+--train_file data/SQuAD+fquad+piaf.json \
+--predict_file data/fquad_valid.json \
+--per_gpu_train_batch_size 12 \
+--learning_rate 3e-5 \
+--num_train_epochs 4 \
+--max_seq_length 384 \
+--doc_stride 128 \
+--save_steps 10000
+```
+
+## Evaluation results
+### Fquad v1.0 Evaluation
+```shell
+{"f1": 79.81, "exact_match": 55.14}
+```
+### SQuAD-FR Evaluation
+```shell
+{"f1": 59.54, "exact_match": 80.61}
+```
+
+## Usage
+
+```python
+from transformers import pipeline
+
+nlp = pipeline('question-answering', model='etalab-ia/camembert-base-squadFR-fquad-piaf', tokenizer='etalab-ia/camembert-base-squadFR-fquad-piaf')
+
+nlp({
+ 'question': "Qui est Claude Monet?",
+ 'context': "Claude Monet, né le 14 novembre 1840 à Paris et mort le 5 décembre 1926 à Giverny, est un peintre français et l’un des fondateurs de l'impressionnisme."
+})
+```
+
+## Citation
+
+### PIAF
+```
+@inproceedings{KeraronLBAMSSS20,
+ author = {Rachel Keraron and
+ Guillaume Lancrenon and
+ Mathilde Bras and
+ Fr{\'{e}}d{\'{e}}ric Allary and
+ Gilles Moyse and
+ Thomas Scialom and
+ Edmundo{-}Pavel Soriano{-}Morales and
+ Jacopo Staiano},
+ title = {Project {PIAF:} Building a Native French Question-Answering Dataset},
+ booktitle = {{LREC}},
+ pages = {5481--5490},
+ publisher = {European Language Resources Association},
+ year = {2020}
+}
+
+```
+
+### Fquad
+```
+@article{dHoffschmidt2020FQuADFQ,
+ title={FQuAD: French Question Answering Dataset},
+ author={Martin d'Hoffschmidt and Maxime Vidal and Wacim Belblidia and Tom Brendl'e and Quentin Heinrich},
+ journal={ArXiv},
+ year={2020},
+ volume={abs/2002.06071}
+}
+```
+
+### SQuAD-FR
+```
+ @MISC{maldives,
+ author = "Kabbadj, Ali",
+ title = "Something new in French Text Mining and Information Extraction (Universal Chatbot): Largest Q&A French training dataset (110 000+) ",
+ editor = "linkedin.com",
+ month = "November",
+ year = "2018",
+ url = "\url{https://www.linkedin.com/pulse/something-new-french-text-mining-information-chatbot-largest-kabbadj/}",
+ note = "[Online; posted 11-November-2018]",
+ }
+ ```
From b21993b3625354a2e6255be09b0a9acec068ec11 Mon Sep 17 00:00:00 2001
From: Gong Linyuan
Date: Mon, 27 Jul 2020 17:31:37 +0800
Subject: [PATCH 013/127] Allow to set Adam beta1, beta2 in TrainingArgs
(#5592)
* Add Adam beta1, beta2 to trainier
* Make style consistent
---
src/transformers/optimization_tf.py | 14 +++++++++++---
src/transformers/trainer.py | 7 ++++++-
src/transformers/trainer_tf.py | 2 ++
src/transformers/training_args.py | 2 ++
4 files changed, 21 insertions(+), 4 deletions(-)
diff --git a/src/transformers/optimization_tf.py b/src/transformers/optimization_tf.py
index 21c3557ae895..7043bcdf1f3e 100644
--- a/src/transformers/optimization_tf.py
+++ b/src/transformers/optimization_tf.py
@@ -84,6 +84,8 @@ def create_optimizer(
num_train_steps: int,
num_warmup_steps: int,
min_lr_ratio: float = 0.0,
+ adam_beta1: float = 0.9,
+ adam_beta2: float = 0.999,
adam_epsilon: float = 1e-8,
weight_decay_rate: float = 0.0,
include_in_weight_decay: Optional[List[str]] = None,
@@ -100,6 +102,10 @@ def create_optimizer(
The number of warmup steps.
min_lr_ratio (:obj:`float`, `optional`, defaults to 0):
The final learning rate at the end of the linear decay will be :obj:`init_lr * min_lr_ratio`.
+ adam_beta1 (:obj:`float`, `optional`, defaults to 0.9):
+ The beta1 to use in Adam.
+ adam_beta2 (:obj:`float`, `optional`, defaults to 0.999):
+ The beta2 to use in Adam.
adam_epsilon (:obj:`float`, `optional`, defaults to 1e-8):
The epsilon to use in Adam.
weight_decay_rate (:obj:`float`, `optional`, defaults to 0):
@@ -122,14 +128,16 @@ def create_optimizer(
optimizer = AdamWeightDecay(
learning_rate=lr_schedule,
weight_decay_rate=weight_decay_rate,
- beta_1=0.9,
- beta_2=0.999,
+ beta_1=adam_beta1,
+ beta_2=adam_beta2,
epsilon=adam_epsilon,
exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
include_in_weight_decay=include_in_weight_decay,
)
else:
- optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule, epsilon=adam_epsilon)
+ optimizer = tf.keras.optimizers.Adam(
+ learning_rate=lr_schedule, beta_1=adam_beta1, beta_2=adam_beta2, epsilon=adam_epsilon
+ )
# We return the optimizer and the LR scheduler in order to better track the
# evolution of the LR independently of the optimizer.
return optimizer, lr_schedule
diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 7a6db778878f..06d467a354e2 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -343,7 +343,12 @@ def get_optimizers(
"weight_decay": 0.0,
},
]
- optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.learning_rate, eps=self.args.adam_epsilon)
+ optimizer = AdamW(
+ optimizer_grouped_parameters,
+ lr=self.args.learning_rate,
+ betas=(self.args.adam_beta1, self.args.adam_beta2),
+ eps=self.args.adam_epsilon,
+ )
scheduler = get_linear_schedule_with_warmup(
optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=num_training_steps
)
diff --git a/src/transformers/trainer_tf.py b/src/transformers/trainer_tf.py
index 8e56251b27a0..bd3bf1e92526 100644
--- a/src/transformers/trainer_tf.py
+++ b/src/transformers/trainer_tf.py
@@ -171,6 +171,8 @@ def get_optimizers(
self.args.learning_rate,
num_training_steps,
self.args.warmup_steps,
+ adam_beta1=self.args.adam_beta1,
+ adam_beta2=self.args.adam_beta2,
adam_epsilon=self.args.adam_epsilon,
weight_decay_rate=self.args.weight_decay,
)
diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index 90fc8f266d5c..e6506d97632f 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -160,6 +160,8 @@ class TrainingArguments:
learning_rate: float = field(default=5e-5, metadata={"help": "The initial learning rate for Adam."})
weight_decay: float = field(default=0.0, metadata={"help": "Weight decay if we apply some."})
+ adam_beta1: float = field(default=0.9, metadata={"help": "Beta1 for Adam optimizer"})
+ adam_beta2: float = field(default=0.999, metadata={"help": "Beta2 for Adam optimizer"})
adam_epsilon: float = field(default=1e-8, metadata={"help": "Epsilon for Adam optimizer."})
max_grad_norm: float = field(default=1.0, metadata={"help": "Max gradient norm."})
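A short usage sketch of the new arguments; the output directory and beta values are placeholders, not recommendations:

```python
from transformers import TrainingArguments

# adam_beta1/adam_beta2 now sit next to the existing Adam options.
args = TrainingArguments(
    output_dir="./out",
    learning_rate=5e-5,
    adam_beta1=0.9,
    adam_beta2=0.98,
    adam_epsilon=1e-8,
)
```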
From b9b11795cfdce7bb8dd8a01ec5efa602589a78b2 Mon Sep 17 00:00:00 2001
From: Xin Wen
Date: Mon, 27 Jul 2020 17:34:02 +0800
Subject: [PATCH 014/127] Update model_summary.rst (#5737)
Add '-' to make the reference to Transformer-XL more accurate and formal.
---
docs/source/model_summary.rst | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docs/source/model_summary.rst b/docs/source/model_summary.rst
index f87a488f216f..5371492dea21 100644
--- a/docs/source/model_summary.rst
+++ b/docs/source/model_summary.rst
@@ -195,7 +195,7 @@ tokens in the sentence, then allows the model to use the last n tokens to predic
with a mask, the sentence is actually fed in the model in the right order, but instead of masking the first n tokens
for n+1, XLNet uses a mask that hides the previous tokens in some given permutation of 1,...,sequence length.
-XLNet also uses the same recurrence mechanism as TransformerXL to build long-term dependencies.
+XLNet also uses the same recurrence mechanism as Transformer-XL to build long-term dependencies.
The library provides a version of the model for language modeling, token classification, sentence classification,
multiple choice classification and question answering.
From 3b64ad5d5c5b6099e0af41d721ba5a10e8a8dfb8 Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Date: Mon, 27 Jul 2020 08:31:24 -0400
Subject: [PATCH 015/127] Remove unused file (#6023)
---
deploy_multi_version_doc.sh | 23 -----------------------
1 file changed, 23 deletions(-)
delete mode 100644 deploy_multi_version_doc.sh
diff --git a/deploy_multi_version_doc.sh b/deploy_multi_version_doc.sh
deleted file mode 100644
index 37c5de114f0c..000000000000
--- a/deploy_multi_version_doc.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-cd docs
-
-function deploy_doc(){
- echo "Creating doc at commit $1 and pushing to folder $2"
- git checkout $1
- if [ ! -z "$2" ]
- then
- echo "Pushing version" $2
- make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html $doc:$dir/$2
- else
- echo "Pushing master"
- make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir
- fi
-}
-
-deploy_doc "master"
-deploy_doc "b33a385" v1.0.0
-deploy_doc "fe02e45" v1.1.0
-deploy_doc "89fd345" v1.2.0
-deploy_doc "fc9faa8" v2.0.0
-deploy_doc "3ddce1d" v2.1.1
-deploy_doc "f2f3294" v2.2.0
-deploy_doc "d0f8b9a" v2.3.0
From 1246b20f6d81bcd949078d26cf5ab3d0f3acccc6 Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Date: Mon, 27 Jul 2020 09:18:59 -0400
Subject: [PATCH 016/127] Fix the return documentation rendering for all model
outputs (#6022)
* Fix the return documentation rendering for all model outputs
* Formatting
---
src/transformers/file_utils.py | 39 ++++++++++++++++++++++++-
src/transformers/modeling_transfo_xl.py | 2 --
2 files changed, 38 insertions(+), 3 deletions(-)
diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py
index 57b8c3d31028..5a46fb062467 100644
--- a/src/transformers/file_utils.py
+++ b/src/transformers/file_utils.py
@@ -189,10 +189,46 @@ def docstring_decorator(fn):
RETURN_INTRODUCTION = r"""
Returns:
- :class:`~{full_output_type}` or :obj:`tuple(torch.FloatTensor)` (if ``return_tuple=True`` is passed or when ``config.return_tuple=True``) comprising various elements depending on the configuration (:class:`~transformers.{config_class}`) and inputs:
+ :class:`~{full_output_type}` or :obj:`tuple(torch.FloatTensor)`:
+ A :class:`~{full_output_type}` or a tuple of :obj:`torch.FloatTensor` (if ``return_tuple=True`` is passed or
+ when ``config.return_tuple=True``) comprising various elements depending on the configuration
+ (:class:`~transformers.{config_class}`) and inputs.
+
"""
+def _get_indent(t):
+ """Returns the indentation in the first line of t"""
+ search = re.search(r"^(\s*)\S", t)
+ return "" if search is None else search.groups()[0]
+
+
+def _convert_output_args_doc(output_args_doc):
+ """Convert output_args_doc to display properly."""
+ # Split output_arg_doc in blocks argument/description
+ indent = _get_indent(output_args_doc)
+ blocks = []
+ current_block = ""
+ for line in output_args_doc.split("\n"):
+ # If the indent is the same as the beginning, the line is the name of new arg.
+ if _get_indent(line) == indent:
+ if len(current_block) > 0:
+ blocks.append(current_block[:-1])
+ current_block = f"{line}\n"
+ else:
+ # Otherwise it's part of the description of the current arg.
+ # We need to remove 2 spaces from the indentation.
+ current_block += f"{line[2:]}\n"
+ blocks.append(current_block[:-1])
+
+ # Format each block for proper rendering
+ for i in range(len(blocks)):
+ blocks[i] = re.sub(r"^(\s+)(\S+)(\s+)", r"\1- **\2**\3", blocks[i])
+ blocks[i] = re.sub(r":\s*\n\s*(\S)", r" -- \1", blocks[i])
+
+ return "\n".join(blocks)
+
+
def _prepare_output_docstrings(output_type, config_class):
"""
Prepares the return part of the docstring using `output_type`.
@@ -206,6 +242,7 @@ def _prepare_output_docstrings(output_type, config_class):
i += 1
if i < len(lines):
docstrings = "\n".join(lines[(i + 1) :])
+ docstrings = _convert_output_args_doc(docstrings)
# Add the return introduction
full_output_type = f"{output_type.__module__}.{output_type.__name__}"
diff --git a/src/transformers/modeling_transfo_xl.py b/src/transformers/modeling_transfo_xl.py
index ba8285f388b8..ca98fe5abc5d 100644
--- a/src/transformers/modeling_transfo_xl.py
+++ b/src/transformers/modeling_transfo_xl.py
@@ -629,8 +629,6 @@ class TransfoXLLMHeadModelOutput(ModelOutput):
Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
Args:
-
- Language modeling loss (for next-token prediction).
losses (:obj:`torch.FloatTensor` of shape `(batch_size, sequence_length-1)`, `optional`, returned when ``labels`` is provided)
Language modeling losses (not reduced).
prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
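To make the conversion above concrete, here is a hedged sketch of what `_convert_output_args_doc` does to a small args-style block; the docstring is invented, the private helper is imported directly only for illustration, and the exact whitespace of the result may differ:

```python
from transformers.file_utils import _convert_output_args_doc

doc = (
    "    loss (:obj:`torch.FloatTensor`):\n"
    "        The loss value.\n"
)
# Each argument line becomes a bullet and its description is folded onto the same line.
print(_convert_output_args_doc(doc))
# roughly: "    - **loss** (:obj:`torch.FloatTensor`) -- The loss value."
```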
From 3deffc1d67e78a57282a66508ef229945e996c9f Mon Sep 17 00:00:00 2001
From: Joe Davison
Date: Mon, 27 Jul 2020 07:42:58 -0600
Subject: [PATCH 017/127] Zero shot classification pipeline (#5760)
* add initial zero-shot pipeline
* change default args
* update default template
* add label string splitting
* add str labels support, remove nli from name
* style
* add input validation and working tf defaults
* tests
* quality check
* add docstring to __call__
* add slow tests
* Change truncation to only_first
also lower precision on tests for readibility
* style
---
src/transformers/pipelines.py | 163 ++++++++++++++++++++++++++++++++++
tests/test_pipelines.py | 132 +++++++++++++++++++++++++++
2 files changed, 295 insertions(+)
diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py
index 164f94b33c10..10c682d0a96a 100755
--- a/src/transformers/pipelines.py
+++ b/src/transformers/pipelines.py
@@ -816,6 +816,159 @@ def __call__(self, *args, **kwargs):
]
+class ZeroShotClassificationArgumentHandler(ArgumentHandler):
+ """
+ Handles arguments for zero-shot for text classification by turning each possible label into an NLI
+ premise/hypothesis pair.
+ """
+
+ def _parse_labels(self, labels):
+ if isinstance(labels, str):
+ labels = [label.strip() for label in labels.split(",")]
+ return labels
+
+ def __call__(self, sequences, labels, hypothesis_template):
+ if len(labels) == 0 or len(sequences) == 0:
+ raise ValueError("You must include at least one label and at least one sequence.")
+ if hypothesis_template.format(labels[0]) == hypothesis_template:
+ raise ValueError(
+ (
+ 'The provided hypothesis_template "{}" was not able to be formatted with the target labels. '
+ "Make sure the passed template includes formatting syntax such as {{}} where the label should go."
+ ).format(hypothesis_template)
+ )
+
+ if isinstance(sequences, str):
+ sequences = [sequences]
+ labels = self._parse_labels(labels)
+
+ sequence_pairs = []
+ for sequence in sequences:
+ sequence_pairs.extend([[sequence, hypothesis_template.format(label)] for label in labels])
+
+ return sequence_pairs
+
+
+class ZeroShotClassificationPipeline(Pipeline):
+ """
+ NLI-based zero-shot classification pipeline using a ModelForSequenceClassification head with models trained on
+ NLI tasks.
+
+ Any combination of sequences and labels can be passed and each combination will be posed as a premise/hypothesis
+ pair and passed to the pre-trained model. Then logit for `entailment` is then taken as the logit for the
+ candidate label being valid. Any NLI model can be used as long as the first output logit corresponds to
+ `contradiction` and the last to `entailment`.
+
+ This pipeline can currently be loaded from the :func:`~transformers.pipeline` method using the following task
+ identifier(s):
+
+ - "zero-shot-classification"
+
+ The models that this pipeline can use are models that have been fine-tuned on a Natural Language Inference task.
+ See the up-to-date list of available models on
+ `huggingface.co/models <https://huggingface.co/models>`__.
+
+ Arguments:
+ model (:obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`):
+ The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
+ :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
+ TensorFlow.
+ tokenizer (:obj:`~transformers.PreTrainedTokenizer`):
+ The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
+ :class:`~transformers.PreTrainedTokenizer`.
+ modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
+ Model card attributed to the model for this pipeline.
+ framework (:obj:`str`, `optional`, defaults to :obj:`None`):
+ The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
+ installed.
+
+ If no framework is specified, will default to the one currently installed. If no framework is specified
+ and both frameworks are installed, will default to PyTorch.
+ args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
+ Reference to the object in charge of parsing supplied pipeline parameters.
+ device (:obj:`int`, `optional`, defaults to :obj:`-1`):
+ Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
+ on the associated CUDA device id.
+ """
+
+ def __init__(self, args_parser=ZeroShotClassificationArgumentHandler(), *args, **kwargs):
+ super().__init__(*args, args_parser=args_parser, **kwargs)
+
+ def _parse_and_tokenize(self, *args, padding=True, add_special_tokens=True, **kwargs):
+ """
+ Parse arguments and tokenize with truncation="only_first" so that the hypothesis (label) is not truncated
+ """
+ inputs = self._args_parser(*args, **kwargs)
+ inputs = self.tokenizer(
+ inputs,
+ add_special_tokens=add_special_tokens,
+ return_tensors=self.framework,
+ padding=padding,
+ truncation="only_first",
+ )
+
+ return inputs
+
+ def __call__(self, sequences, candidate_labels, hypothesis_template="This example is {}.", multi_class=False):
+ """
+ NLI-based zero-shot classification. Any combination of sequences and labels can be passed and each
+ combination will be posed as a premise/hypothesis pair and passed to the pre-trained model. The logit for
+ `entailment` is then taken as the logit for the candidate label being valid. Any NLI model can be used as
+ long as the first output logit corresponds to `contradiction` and the last to `entailment`.
+
+ Args:
+ sequences (:obj:`str` or :obj:`List`):
+ The sequence or sequences to classify. Truncated if model input is too large.
+ candidate_labels (:obj:`str` or :obj:`List`):
+ The set of possible class labels to classify each sequence into. Can be a single label, a string of
+ comma-separated labels, or a list of labels.
+ hypothesis_template (:obj:`str`, defaults to "This example is {}."):
+ The template used to turn each label into an NLI-style hypothesis. This template must include a {}
+ or similar syntax for the candidate label to be inserted into the template. For example, the default
+ template is "This example is {}." With the candidate label "sports", this would be fed into the model
+ like `<cls> sequence to classify <sep> This example is sports . <sep>`. The default template works
+ well in many cases, but it may be worthwhile to experiment with different templates depending on the
+ task setting.
+ multi_class (:obj:`bool`, defaults to False):
+ When False, it is assumed that only one candidate label can be true, and the scores are normalized
+ such that the sum of the label likelihoods for each sequence is 1. When True, the labels are
+ considered independent and probabilities are normalized for each candidate by doing a softmax of
+ the entailment score vs. the contradiction score.
+ """
+ outputs = super().__call__(sequences, candidate_labels, hypothesis_template)
+ num_sequences = 1 if isinstance(sequences, str) else len(sequences)
+ candidate_labels = self._args_parser._parse_labels(candidate_labels)
+ reshaped_outputs = outputs.reshape((num_sequences, len(candidate_labels), -1))
+
+ if len(candidate_labels) == 1:
+ multi_class = True
+
+ if not multi_class:
+ # softmax the "entailment" logits over all candidate labels
+ entail_logits = reshaped_outputs[..., -1]
+ scores = np.exp(entail_logits) / np.exp(entail_logits).sum(-1, keepdims=True)
+ else:
+ # softmax over the entailment vs. contradiction dim for each label independently
+ entail_contr_logits = reshaped_outputs[..., [0, -1]]
+ scores = np.exp(entail_contr_logits) / np.exp(entail_contr_logits).sum(-1, keepdims=True)
+ scores = scores[..., 1]
+
+ result = []
+ for iseq in range(num_sequences):
+ top_inds = list(reversed(scores[iseq].argsort()))
+ result.append(
+ {
+ "sequence": sequences if num_sequences == 1 else sequences[iseq],
+ "labels": [candidate_labels[i] for i in top_inds],
+ "scores": scores[iseq][top_inds].tolist(),
+ }
+ )
+
+ if len(result) == 1:
+ return result[0]
+ return result
+
+
class FillMaskPipeline(Pipeline):
"""
Masked language modeling prediction pipeline using ModelWithLMHead head. See the
@@ -1813,6 +1966,16 @@ def __call__(
"pt": AutoModelWithLMHead if is_torch_available() else None,
"default": {"model": {"pt": "gpt2", "tf": "gpt2"}},
},
+ "zero-shot-classification": {
+ "impl": ZeroShotClassificationPipeline,
+ "tf": TFAutoModelForSequenceClassification if is_tf_available() else None,
+ "pt": AutoModelForSequenceClassification if is_torch_available() else None,
+ "default": {
+ "model": {"pt": "facebook/bart-large-mnli", "tf": "roberta-large-mnli"},
+ "config": {"pt": "facebook/bart-large-mnli", "tf": "roberta-large-mnli"},
+ "tokenizer": {"pt": "facebook/bart-large-mnli", "tf": "roberta-large-mnli"},
+ },
+ },
}
diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index 1e94b2e67d02..3f2dd55afbfd 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -318,6 +318,138 @@ def test_tf_text_generation(self):
QA_FINETUNED_MODELS = ["sshleifer/tiny-distilbert-base-cased-distilled-squad"]
+class ZeroShotClassificationPipelineTests(unittest.TestCase):
+ def _test_scores_sum_to_one(self, result):
+ sum = 0.0
+ for score in result["scores"]:
+ sum += score
+ self.assertAlmostEqual(sum, 1.0)
+
+ def _test_zero_shot_pipeline(self, nlp):
+ output_keys = {"sequence", "labels", "scores"}
+ valid_mono_inputs = [
+ {"sequences": "Who are you voting for in 2020?", "candidate_labels": "politics"},
+ {"sequences": "Who are you voting for in 2020?", "candidate_labels": ["politics"]},
+ {"sequences": "Who are you voting for in 2020?", "candidate_labels": "politics, public health"},
+ {"sequences": "Who are you voting for in 2020?", "candidate_labels": ["politics", "public health"]},
+ {"sequences": ["Who are you voting for in 2020?"], "candidate_labels": "politics"},
+ {
+ "sequences": "Who are you voting for in 2020?",
+ "candidate_labels": "politics",
+ "hypothesis_template": "This text is about {}",
+ },
+ ]
+ valid_multi_input = {
+ "sequences": ["Who are you voting for in 2020?", "What is the capital of Spain?"],
+ "candidate_labels": "politics",
+ }
+ invalid_inputs = [
+ {"sequences": None, "candidate_labels": "politics"},
+ {"sequences": "", "candidate_labels": "politics"},
+ {"sequences": "Who are you voting for in 2020?", "candidate_labels": None},
+ {"sequences": "Who are you voting for in 2020?", "candidate_labels": ""},
+ {
+ "sequences": "Who are you voting for in 2020?",
+ "candidate_labels": "politics",
+ "hypothesis_template": None,
+ },
+ {
+ "sequences": "Who are you voting for in 2020?",
+ "candidate_labels": "politics",
+ "hypothesis_template": "",
+ },
+ {
+ "sequences": "Who are you voting for in 2020?",
+ "candidate_labels": "politics",
+ "hypothesis_template": "Template without formatting syntax.",
+ },
+ ]
+ self.assertIsNotNone(nlp)
+
+ for mono_input in valid_mono_inputs:
+ mono_result = nlp(**mono_input)
+ self.assertIsInstance(mono_result, dict)
+ if len(mono_result["labels"]) > 1:
+ self._test_scores_sum_to_one(mono_result)
+
+ for key in output_keys:
+ self.assertIn(key, mono_result)
+
+ multi_result = nlp(**valid_multi_input)
+ self.assertIsInstance(multi_result, list)
+ self.assertIsInstance(multi_result[0], dict)
+ self.assertEqual(len(multi_result), len(valid_multi_input["sequences"]))
+
+ for result in multi_result:
+ for key in output_keys:
+ self.assertIn(key, result)
+
+ if len(result["labels"]) > 1:
+ self._test_scores_sum_to_one(result)
+
+ for bad_input in invalid_inputs:
+ self.assertRaises(Exception, nlp, **bad_input)
+
+ def _test_zero_shot_pipeline_outputs(self, nlp):
+ inputs = [
+ {
+ "sequences": "Who are you voting for in 2020?",
+ "candidate_labels": ["politics", "public health", "science"],
+ },
+ {
+ "sequences": "The dominant sequence transduction models are based on complex recurrent or convolutional neural networks in an encoder-decoder configuration. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task, improving over the existing best results, including ensembles by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature. We show that the Transformer generalizes well to other tasks by applying it successfully to English constituency parsing both with large and limited training data.",
+ "candidate_labels": ["machine learning", "statistics", "translation", "vision"],
+ "multi_class": True,
+ },
+ ]
+
+ expected_outputs = [
+ {
+ "sequence": "Who are you voting for in 2020?",
+ "labels": ["politics", "public health", "science"],
+ "scores": [0.975, 0.015, 0.008],
+ },
+ {
+ "sequence": "The dominant sequence transduction models are based on complex recurrent or convolutional neural networks in an encoder-decoder configuration. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task, improving over the existing best results, including ensembles by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature. We show that the Transformer generalizes well to other tasks by applying it successfully to English constituency parsing both with large and limited training data.",
+ "labels": ["translation", "machine learning", "vision", "statistics"],
+ "scores": [0.817, 0.712, 0.018, 0.017],
+ },
+ ]
+
+ for input, expected_output in zip(inputs, expected_outputs):
+ output = nlp(**input)
+ for key in output:
+ if key == "scores":
+ for output_score, expected_score in zip(output[key], expected_output[key]):
+ self.assertAlmostEqual(output_score, expected_score, places=2)
+ else:
+ self.assertEqual(output[key], expected_output[key])
+
+ @require_torch
+ def test_torch_zero_shot_classification(self):
+ for model_name in TEXT_CLASSIF_FINETUNED_MODELS:
+ nlp = pipeline(task="zero-shot-classification", model=model_name, tokenizer=model_name)
+ self._test_zero_shot_pipeline(nlp)
+
+ @require_tf
+ def test_tf_zero_shot_classification(self):
+ for model_name in TEXT_CLASSIF_FINETUNED_MODELS:
+ nlp = pipeline(task="zero-shot-classification", model=model_name, tokenizer=model_name, framework="tf")
+ self._test_zero_shot_pipeline(nlp)
+
+ @slow
+ @require_torch
+ def test_torch_zero_shot_outputs(self):
+ nlp = pipeline(task="zero-shot-classification", model="roberta-large-mnli")
+ self._test_zero_shot_pipeline_outputs(nlp)
+
+ @slow
+ @require_tf
+ def test_tf_zero_shot_outputs(self):
+ nlp = pipeline(task="zero-shot-classification", model="roberta-large-mnli", framework="tf")
+ self._test_zero_shot_pipeline_outputs(nlp)
+
+
class QAPipelineTests(unittest.TestCase):
def _test_qa_pipeline(self, nlp):
output_keys = {"score", "answer", "start", "end"}
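End to end, the new task is used like any other pipeline. A usage sketch; the checkpoint name is the PyTorch default registered in the diff above, and the sentence and labels are invented:

```python
from transformers import pipeline

classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
result = classifier(
    "Who are you voting for in 2020?",
    candidate_labels=["politics", "public health", "science"],
    hypothesis_template="This example is {}.",
)
# Labels come back sorted by score, highest first.
print(result["labels"][0], result["scores"][0])
```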
From d1d15d6f2de9e2cde48ff3ea2072add3311ce2ac Mon Sep 17 00:00:00 2001
From: Suraj Patil
Date: Mon, 27 Jul 2020 19:40:43 +0530
Subject: [PATCH 018/127] [examples (seq2seq)] fix preparing decoder_input_ids
for T5 (#5994)
---
examples/seq2seq/finetune.py | 12 +++++++++---
1 file changed, 9 insertions(+), 3 deletions(-)
diff --git a/examples/seq2seq/finetune.py b/examples/seq2seq/finetune.py
index a0014b983500..1866042cbb3b 100644
--- a/examples/seq2seq/finetune.py
+++ b/examples/seq2seq/finetune.py
@@ -14,7 +14,7 @@
from torch.utils.data import DataLoader
from lightning_base import BaseTransformer, add_generic_args, generic_train
-from transformers import MBartTokenizer, get_linear_schedule_with_warmup
+from transformers import MBartTokenizer, T5ForConditionalGeneration, get_linear_schedule_with_warmup
try:
@@ -131,8 +131,14 @@ def ids_to_clean_text(self, generated_ids: List[int]):
def _step(self, batch: dict) -> Tuple:
pad_token_id = self.tokenizer.pad_token_id
source_ids, source_mask, target_ids = batch["input_ids"], batch["attention_mask"], batch["decoder_input_ids"]
- decoder_input_ids = target_ids[:, :-1].contiguous() # Why this line?
- lm_labels = target_ids[:, 1:].clone() # why clone?
+
+ if isinstance(self.model, T5ForConditionalGeneration):
+ decoder_input_ids = self.model._shift_right(target_ids)
+ lm_labels = target_ids
+ else:
+ decoder_input_ids = target_ids[:, :-1].contiguous() # Why this line?
+ lm_labels = target_ids[:, 1:].clone() # why clone?
+
outputs = self(source_ids, attention_mask=source_mask, decoder_input_ids=decoder_input_ids, use_cache=False)
if self.hparams.label_smoothing == 0:
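The reason for the special case is that T5 builds `decoder_input_ids` by prepending the decoder start token rather than by dropping the last label position. A small sketch (the tensor values are made up; for T5 the decoder start token is the pad token, id 0):

```python
import torch
from transformers import T5ForConditionalGeneration

model = T5ForConditionalGeneration.from_pretrained("t5-small")
target_ids = torch.tensor([[100, 200, 300]])

# _shift_right prepends the decoder start token and drops the final position,
# so the unshifted target_ids can be used directly as labels.
print(model._shift_right(target_ids))  # e.g. tensor([[  0, 100, 200]])
```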
From 5779e5434d327a5debd79a4a028f48d5f3b872b9 Mon Sep 17 00:00:00 2001
From: Cola <43774355+Colanim@users.noreply.github.com>
Date: Mon, 27 Jul 2020 23:55:15 +0900
Subject: [PATCH 019/127] :pencil2: Fix typo (#5734)
---
src/transformers/trainer_tf.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/transformers/trainer_tf.py b/src/transformers/trainer_tf.py
index bd3bf1e92526..9a2c8181ebe4 100644
--- a/src/transformers/trainer_tf.py
+++ b/src/transformers/trainer_tf.py
@@ -575,4 +575,4 @@ def save_model(self, output_dir: Optional[str] = None):
if not isinstance(self.model, TFPreTrainedModel):
raise ValueError("Trainer.model appears to not be a PreTrainedModel")
- self.model.save_pretrained(self.args.output_dir)
+ self.model.save_pretrained(output_dir)
From c8bdf7f4ecd73680cb0751d9efc8fa3a992c2c2d Mon Sep 17 00:00:00 2001
From: Suraj Patil
Date: Mon, 27 Jul 2020 21:20:08 +0530
Subject: [PATCH 020/127] Add new AutoModel classes in pipeline (#6062)
* use new AutoModel classes
* make style and quality
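In practice the change means each pipeline default now resolves to a task-specific auto class instead of the catch-all `AutoModelWithLMHead`. A short sketch of the classes involved; the checkpoint names are the pipeline defaults from the diff below:

```python
from transformers import AutoModelForMaskedLM, AutoModelForSeq2SeqLM, AutoModelForCausalLM

fill_mask_model = AutoModelForMaskedLM.from_pretrained("distilroberta-base")  # fill-mask
translation_model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")          # translation_en_to_*
generation_model = AutoModelForCausalLM.from_pretrained("gpt2")               # text-generation
```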
---
src/transformers/pipelines.py | 21 ++++++++++++---------
1 file changed, 12 insertions(+), 9 deletions(-)
diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py
index 10c682d0a96a..16589469256a 100755
--- a/src/transformers/pipelines.py
+++ b/src/transformers/pipelines.py
@@ -60,13 +60,14 @@
AutoModelForSequenceClassification,
AutoModelForQuestionAnswering,
AutoModelForTokenClassification,
- AutoModelWithLMHead,
AutoModelForSeq2SeqLM,
- MODEL_WITH_LM_HEAD_MAPPING,
+ AutoModelForCausalLM,
+ AutoModelForMaskedLM,
MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
MODEL_FOR_QUESTION_ANSWERING_MAPPING,
MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
+ MODEL_FOR_MASKED_LM_MAPPING,
)
if TYPE_CHECKING:
@@ -1029,7 +1030,7 @@ def __init__(
task=task,
)
- self.check_model_type(TF_MODEL_WITH_LM_HEAD_MAPPING if self.framework == "tf" else MODEL_WITH_LM_HEAD_MAPPING)
+ self.check_model_type(TF_MODEL_WITH_LM_HEAD_MAPPING if self.framework == "tf" else MODEL_FOR_MASKED_LM_MAPPING)
self.topk = topk
@@ -1817,7 +1818,9 @@ class TranslationPipeline(Pipeline):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
- self.check_model_type(TF_MODEL_WITH_LM_HEAD_MAPPING if self.framework == "tf" else MODEL_WITH_LM_HEAD_MAPPING)
+ self.check_model_type(
+ TF_MODEL_WITH_LM_HEAD_MAPPING if self.framework == "tf" else MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
+ )
def __call__(
self, *args, return_tensors=False, return_text=True, clean_up_tokenization_spaces=False, **generate_kwargs
@@ -1933,7 +1936,7 @@ def __call__(
"fill-mask": {
"impl": FillMaskPipeline,
"tf": TFAutoModelWithLMHead if is_tf_available() else None,
- "pt": AutoModelWithLMHead if is_torch_available() else None,
+ "pt": AutoModelForMaskedLM if is_torch_available() else None,
"default": {"model": {"pt": "distilroberta-base", "tf": "distilroberta-base"}},
},
"summarization": {
@@ -1945,25 +1948,25 @@ def __call__(
"translation_en_to_fr": {
"impl": TranslationPipeline,
"tf": TFAutoModelWithLMHead if is_tf_available() else None,
- "pt": AutoModelWithLMHead if is_torch_available() else None,
+ "pt": AutoModelForSeq2SeqLM if is_torch_available() else None,
"default": {"model": {"pt": "t5-base", "tf": "t5-base"}},
},
"translation_en_to_de": {
"impl": TranslationPipeline,
"tf": TFAutoModelWithLMHead if is_tf_available() else None,
- "pt": AutoModelWithLMHead if is_torch_available() else None,
+ "pt": AutoModelForSeq2SeqLM if is_torch_available() else None,
"default": {"model": {"pt": "t5-base", "tf": "t5-base"}},
},
"translation_en_to_ro": {
"impl": TranslationPipeline,
"tf": TFAutoModelWithLMHead if is_tf_available() else None,
- "pt": AutoModelWithLMHead if is_torch_available() else None,
+ "pt": AutoModelForSeq2SeqLM if is_torch_available() else None,
"default": {"model": {"pt": "t5-base", "tf": "t5-base"}},
},
"text-generation": {
"impl": TextGenerationPipeline,
"tf": TFAutoModelWithLMHead if is_tf_available() else None,
- "pt": AutoModelWithLMHead if is_torch_available() else None,
+ "pt": AutoModelForCausalLM if is_torch_available() else None,
"default": {"model": {"pt": "gpt2", "tf": "gpt2"}},
},
"zero-shot-classification": {
From 4302ace5bd6a0dba6be90e580b4718e270384bb0 Mon Sep 17 00:00:00 2001
From: Sam Shleifer
Date: Mon, 27 Jul 2020 12:14:23 -0400
Subject: [PATCH 021/127] [pack_dataset] don't sort before packing, only pack
train (#5954)
---
examples/requirements.txt | 1 +
examples/seq2seq/minify_dataset.py | 19 +++++++++++++++++++
examples/seq2seq/pack_dataset.py | 19 +++++++------------
3 files changed, 27 insertions(+), 12 deletions(-)
create mode 100644 examples/seq2seq/minify_dataset.py
diff --git a/examples/requirements.txt b/examples/requirements.txt
index 028dd7f8fdcd..a27ba790565b 100644
--- a/examples/requirements.txt
+++ b/examples/requirements.txt
@@ -13,3 +13,4 @@ streamlit
elasticsearch
pandas
nlp
+fire
diff --git a/examples/seq2seq/minify_dataset.py b/examples/seq2seq/minify_dataset.py
new file mode 100644
index 000000000000..da70ced60ab4
--- /dev/null
+++ b/examples/seq2seq/minify_dataset.py
@@ -0,0 +1,19 @@
+from pathlib import Path
+
+import fire
+
+
+def minify(src_dir: str, dest_dir: str, n: int):
+ """Write first n lines of each file f in src_dir to dest_dir/f """
+ src_dir = Path(src_dir)
+ dest_dir = Path(dest_dir)
+ dest_dir.mkdir(exist_ok=True)
+ for path in src_dir.iterdir():
+ new = [x.rstrip() for x in list(path.open().readlines())][:n]
+ dest_path = dest_dir.joinpath(path.name)
+ print(dest_path)
+ dest_path.open("w").write("\n".join(new))
+
+
+if __name__ == "__main__":
+ fire.Fire(minify)
diff --git a/examples/seq2seq/pack_dataset.py b/examples/seq2seq/pack_dataset.py
index 599d133a6234..1609c2d1cc0c 100644
--- a/examples/seq2seq/pack_dataset.py
+++ b/examples/seq2seq/pack_dataset.py
@@ -6,6 +6,7 @@
"""
import argparse
+import shutil
from pathlib import Path
from tqdm import tqdm
@@ -17,7 +18,7 @@ def pack_examples(tok, src_examples, tgt_examples, max_tokens=1024):
finished_src, finished_tgt = [], []
- sorted_examples = list(sorted(zip(src_examples, tgt_examples), key=lambda x: len(x[0])))
+ sorted_examples = list(zip(src_examples, tgt_examples))
new_src, new_tgt = sorted_examples[0]
def is_too_big(strang):
@@ -42,20 +43,10 @@ def is_too_big(strang):
return finished_src, finished_tgt
-def minify(src_dir: Path, dest_dir: Path, n: int):
- """Write first n lines of each file f in src_dir to dest_dir/f"""
- dest_dir.mkdir(exist_ok=True)
- for path in src_dir.iterdir():
- new = [x.rstrip() for x in list(path.open().readlines())][:n]
- dest_path = dest_dir.joinpath(path.name)
- print(dest_path)
- dest_path.open("w").write("\n".join(new))
-
-
def pack_data_dir(tok, data_dir: Path, max_tokens, save_path):
save_path = Path(save_path)
save_path.mkdir(exist_ok=True)
- for split in ["val", "test", "train"]:
+ for split in ["train"]:
src_path, tgt_path = data_dir / f"{split}.source", data_dir / f"{split}.target"
src_docs = [x.rstrip() for x in Path(src_path).open().readlines()]
tgt_docs = [x.rstrip() for x in Path(tgt_path).open().readlines()]
@@ -63,6 +54,10 @@ def pack_data_dir(tok, data_dir: Path, max_tokens, save_path):
print(f"packed {split} split from {len(src_docs)} examples -> {len(packed_src)}.")
Path(save_path / f"{split}.source").open("w").write("\n".join(packed_src))
Path(save_path / f"{split}.target").open("w").write("\n".join(packed_tgt))
+ for split in ["val", "test"]:
+ src_path, tgt_path = data_dir / f"{split}.source", data_dir / f"{split}.target"
+ shutil.copyfile(src_path, save_path / f"{split}.source")
+ shutil.copyfile(tgt_path, save_path / f"{split}.target")
def packer_cli():
From 11792d7826854979bb532b6da09bc3796b09ea6a Mon Sep 17 00:00:00 2001
From: Sam Shleifer
Date: Mon, 27 Jul 2020 12:21:25 -0400
Subject: [PATCH 022/127] CL util to convert models to fp16 before upload
(#5953)
---
examples/seq2seq/convert_model_to_fp16.py | 21 +++++++++++++++++++++
1 file changed, 21 insertions(+)
create mode 100644 examples/seq2seq/convert_model_to_fp16.py
diff --git a/examples/seq2seq/convert_model_to_fp16.py b/examples/seq2seq/convert_model_to_fp16.py
new file mode 100644
index 000000000000..24042cc0e7e5
--- /dev/null
+++ b/examples/seq2seq/convert_model_to_fp16.py
@@ -0,0 +1,21 @@
+from typing import Union
+
+import fire
+import torch
+from tqdm import tqdm
+
+
+def convert(src_path: str, map_location: str = "cpu", save_path: Union[str, None] = None) -> None:
+ """Convert a pytorch_model.bin or model.pt file to torch.float16 for faster downloads, less disk space."""
+ state_dict = torch.load(src_path, map_location=map_location)
+ for k, v in tqdm(state_dict.items()):
+ if not isinstance(v, torch.Tensor):
+ raise TypeError("FP16 conversion only works on paths that are saved state dicts, like pytorch_model.bin")
+ state_dict[k] = v.half()
+ if save_path is None: # overwrite src_path
+ save_path = src_path
+ torch.save(state_dict, save_path)
+
+
+if __name__ == "__main__":
+ fire.Fire(convert)
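A hedged sketch of what the script amounts to without the `fire` CLI wrapper; the file names are placeholders:

```python
import torch

# Load a full-precision checkpoint, cast every tensor to fp16, and save it back.
state_dict = torch.load("pytorch_model.bin", map_location="cpu")
state_dict = {k: v.half() for k, v in state_dict.items()}
torch.save(state_dict, "pytorch_model_fp16.bin")
```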
From fd347e0da786b45c80cb64b1c6a02a80c4142db5 Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Date: Mon, 27 Jul 2020 15:17:33 -0400
Subject: [PATCH 023/127] Add fire to setup.cfg to make isort happy (#6066)
---
setup.cfg | 1 +
1 file changed, 1 insertion(+)
diff --git a/setup.cfg b/setup.cfg
index d8272abd10bf..d630f0b3a9f9 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -9,6 +9,7 @@ known_third_party =
fairseq
faiss
fastprogress
+ fire
git
h5py
matplotlib
From 769e6ba01f7b91f244ac55235c53ca3d68408006 Mon Sep 17 00:00:00 2001
From: Ramsri Goutham Golla
Date: Tue, 28 Jul 2020 01:55:37 +0530
Subject: [PATCH 024/127] Create README.md (#6032)
Adding model card - readme
---
.../ramsrigouthamg/t5_paraphraser/README.md | 84 +++++++++++++++++++
1 file changed, 84 insertions(+)
create mode 100644 model_cards/ramsrigouthamg/t5_paraphraser/README.md
diff --git a/model_cards/ramsrigouthamg/t5_paraphraser/README.md b/model_cards/ramsrigouthamg/t5_paraphraser/README.md
new file mode 100644
index 000000000000..7816c0dd087d
--- /dev/null
+++ b/model_cards/ramsrigouthamg/t5_paraphraser/README.md
@@ -0,0 +1,84 @@
+## Model in Action 🚀
+
+```python
+import torch
+from transformers import T5ForConditionalGeneration,T5Tokenizer
+
+
+def set_seed(seed):
+ torch.manual_seed(seed)
+ if torch.cuda.is_available():
+ torch.cuda.manual_seed_all(seed)
+
+set_seed(42)
+
+model = T5ForConditionalGeneration.from_pretrained('ramsrigouthamg/t5_paraphraser')
+tokenizer = T5Tokenizer.from_pretrained('ramsrigouthamg/t5_paraphraser')
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print ("device ",device)
+model = model.to(device)
+
+sentence = "Which course should I take to get started in data science?"
+# sentence = "What are the ingredients required to bake a perfect cake?"
+# sentence = "What is the best possible approach to learn aeronautical engineering?"
+# sentence = "Do apples taste better than oranges in general?"
+
+
+text = "paraphrase: " + sentence + " </s>"
+
+
+max_len = 256
+
+encoding = tokenizer.encode_plus(text,pad_to_max_length=True, return_tensors="pt")
+input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)
+
+
+# set top_k = 50 and set top_p = 0.95 and num_return_sequences = 3
+beam_outputs = model.generate(
+ input_ids=input_ids, attention_mask=attention_masks,
+ do_sample=True,
+ max_length=256,
+ top_k=120,
+ top_p=0.98,
+ early_stopping=True,
+ num_return_sequences=10
+)
+
+
+print ("\nOriginal Question ::")
+print (sentence)
+print ("\n")
+print ("Paraphrased Questions :: ")
+final_outputs =[]
+for beam_output in beam_outputs:
+ sent = tokenizer.decode(beam_output, skip_special_tokens=True,clean_up_tokenization_spaces=True)
+ if sent.lower() != sentence.lower() and sent not in final_outputs:
+ final_outputs.append(sent)
+
+for i, final_output in enumerate(final_outputs):
+ print("{}: {}".format(i, final_output))
+
+```
+## Output
+```
+Original Question ::
+Which course should I take to get started in data science?
+
+
+Paraphrased Questions ::
+0: What should I learn to become a data scientist?
+1: How do I get started with data science?
+2: How would you start a data science career?
+3: How can I start learning data science?
+4: How do you get started in data science?
+5: What's the best course for data science?
+6: Which course should I start with for data science?
+7: What courses should I follow to get started in data science?
+8: What degree should be taken by a data scientist?
+9: Which course should I follow to become a Data Scientist?
+```
+
+## Detailed blog post available here:
+https://towardsdatascience.com/paraphrase-any-question-with-t5-text-to-text-transfer-transformer-pretrained-model-and-cbb9e35f1555
+
From 9d0d3a6645384e236c55d311f3f8b7dd67d58562 Mon Sep 17 00:00:00 2001
From: sgugger
Date: Mon, 27 Jul 2020 18:03:09 -0400
Subject: [PATCH 025/127] Pin TF while we wait for a fix
---
setup.py | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/setup.py b/setup.py
index c4fc91ab3170..2ea0954b425f 100644
--- a/setup.py
+++ b/setup.py
@@ -70,14 +70,14 @@
# keras2onnx and onnxconverter-common version is specific through a commit until 1.7.0 lands on pypi
extras["tf"] = [
- "tensorflow",
+ "tensorflow<=2.2",
# "onnxconverter-common",
# "keras2onnx"
"onnxconverter-common @ git+git://github.com/microsoft/onnxconverter-common.git@f64ca15989b6dc95a1f3507ff6e4c395ba12dff5#egg=onnxconverter-common",
"keras2onnx @ git+git://github.com/onnx/keras-onnx.git@cbdc75cb950b16db7f0a67be96a278f8d2953b48#egg=keras2onnx"
]
extras["tf-cpu"] = [
- "tensorflow-cpu",
+ "tensorflow-cpu<=2.2",
# "onnxconverter-common",
# "keras2onnx"
"onnxconverter-common @ git+git://github.com/microsoft/onnxconverter-common.git@f64ca15989b6dc95a1f3507ff6e4c395ba12dff5#egg=onnxconverter-common",
@@ -86,7 +86,7 @@
extras["torch"] = ["torch"]
extras["serving"] = ["pydantic", "uvicorn", "fastapi", "starlette"]
-extras["all"] = extras["serving"] + ["tensorflow", "torch"]
+extras["all"] = extras["serving"] + ["tensorflow<=2.2", "torch"]
extras["testing"] = ["pytest", "pytest-xdist", "timeout-decorator", "psutil"]
# sphinx-rtd-theme==0.5.0 introduced big changes in the style.
@@ -97,7 +97,7 @@
"isort @ git+git://github.com/timothycrosley/isort.git@e63ae06ec7d70b06df9e528357650281a3d3ec22#egg=isort",
"flake8",
]
-extras["dev"] = extras["testing"] + extras["quality"] + ["mecab-python3<1", "scikit-learn", "tensorflow", "torch"]
+extras["dev"] = extras["testing"] + extras["quality"] + ["mecab-python3<1", "scikit-learn", "tensorflow<=2.2", "torch"]
setup(
name="transformers",
From 1e00ef681d213938cfafd678b9ec11c786405bbf Mon Sep 17 00:00:00 2001
From: Sam Shleifer
Date: Mon, 27 Jul 2020 18:26:00 -0400
Subject: [PATCH 026/127] [s2s] dont document packing because it hurts
performance (#6077)
---
examples/seq2seq/README.md | 12 +-----------
1 file changed, 1 insertion(+), 11 deletions(-)
diff --git a/examples/seq2seq/README.md b/examples/seq2seq/README.md
index ed24f5939484..5029f38361cb 100644
--- a/examples/seq2seq/README.md
+++ b/examples/seq2seq/README.md
@@ -27,17 +27,7 @@ this should make a directory called `cnn_dm/` with files like `test.source`.
```
WMT16 English-Romanian Translation Data:
-
-This dataset comes in two formats. The "packed" version merges short training examples into examples of <200 tokens to increase GPU utilization (and also improves validation performance).
-
-```bash
-cd examples/seq2seq
-wget https://s3.amazonaws.com/datasets.huggingface.co/translation/wmt_en_ro_packed_train_200.tgz
-tar -xzvf wmt_en_ro_packed_200.tgz
-export ENRO_DIR=wmt_en_ro_packed_train_200
-```
-
-The original data can also be downloaded with this command:
+Download with this command:
```bash
wget https://s3.amazonaws.com/datasets.huggingface.co/translation/wmt_en_ro.tar.gz
tar -xzvf wmt_en_ro.tar.gz
From b7345d22d0b59ccfda8df840a918af33cf95a189 Mon Sep 17 00:00:00 2001
From: Sam Shleifer
Date: Mon, 27 Jul 2020 20:00:44 -0400
Subject: [PATCH 027/127] [fix] no warning for position_ids buffer (#6063)
---
src/transformers/modeling_bert.py | 2 ++
src/transformers/modeling_mobilebert.py | 2 ++
src/transformers/modeling_openai.py | 1 +
src/transformers/modeling_xlm.py | 4 +++-
4 files changed, 8 insertions(+), 1 deletion(-)
diff --git a/src/transformers/modeling_bert.py b/src/transformers/modeling_bert.py
index e27ba7539cc5..757eb7c9c7d1 100644
--- a/src/transformers/modeling_bert.py
+++ b/src/transformers/modeling_bert.py
@@ -699,6 +699,8 @@ class BertModel(BertPreTrainedModel):
"""
+ authorized_missing_keys = [r"position_ids"]
+
def __init__(self, config):
super().__init__(config)
self.config = config
diff --git a/src/transformers/modeling_mobilebert.py b/src/transformers/modeling_mobilebert.py
index b01c29df291e..4d78ca0396f6 100644
--- a/src/transformers/modeling_mobilebert.py
+++ b/src/transformers/modeling_mobilebert.py
@@ -788,6 +788,8 @@ class MobileBertModel(MobileBertPreTrainedModel):
https://arxiv.org/pdf/2004.02984.pdf
"""
+ authorized_missing_keys = [r"position_ids"]
+
def __init__(self, config):
super().__init__(config)
self.config = config
diff --git a/src/transformers/modeling_openai.py b/src/transformers/modeling_openai.py
index e346219c3d2c..3efa7d353f6e 100644
--- a/src/transformers/modeling_openai.py
+++ b/src/transformers/modeling_openai.py
@@ -272,6 +272,7 @@ class OpenAIGPTPreTrainedModel(PreTrainedModel):
config_class = OpenAIGPTConfig
load_tf_weights = load_tf_weights_in_openai_gpt
base_model_prefix = "transformer"
+ authorized_missing_keys = [r"position_ids"]
def _init_weights(self, module):
""" Initialize the weights.
diff --git a/src/transformers/modeling_xlm.py b/src/transformers/modeling_xlm.py
index 932bf807a5a6..e7396df689e7 100644
--- a/src/transformers/modeling_xlm.py
+++ b/src/transformers/modeling_xlm.py
@@ -375,7 +375,9 @@ class XLMForQuestionAnsweringOutput(ModelOutput):
XLM_START_DOCSTRING,
)
class XLMModel(XLMPreTrainedModel):
- def __init__(self, config): # , dico, is_encoder, with_output):
+ authorized_missing_keys = [r"position_ids"]
+
+ def __init__(self, config):
super().__init__(config)
# encoder / decoder, output layer
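The patch above registers `position_ids` as an authorized missing key so that this buffer, which is absent from older checkpoints, no longer triggers a spurious warning at load time. A minimal sketch of how the effect can be observed (checkpoint name is illustrative):

```python
from transformers import BertModel

# Sketch: inspect the loading report returned by from_pretrained.
# With authorized_missing_keys = [r"position_ids"], the position_ids buffer
# is filtered out of missing_keys and no longer logged as a warning.
model, loading_info = BertModel.from_pretrained("bert-base-uncased", output_loading_info=True)
assert not any("position_ids" in key for key in loading_info["missing_keys"])
```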
From 7a68d401388bc68f10dfeb591709352736a6c0b6 Mon Sep 17 00:00:00 2001
From: Sam Shleifer
Date: Mon, 27 Jul 2020 20:07:21 -0400
Subject: [PATCH 028/127] [s2s] Don't mention packed data in README (#6079)
---
examples/seq2seq/README.md | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/examples/seq2seq/README.md b/examples/seq2seq/README.md
index 5029f38361cb..a579d728b59f 100644
--- a/examples/seq2seq/README.md
+++ b/examples/seq2seq/README.md
@@ -89,20 +89,20 @@ Then you can finetune mbart_cc25 on english-romanian with the following command.
Best performing command:
```bash
# optionally
-export ENRO_DIR='wmt_en_ro_packed_train_200' # Download instructions above
+export ENRO_DIR='wmt_en_ro' # Download instructions above
# export WANDB_PROJECT="MT" # optional
export MAX_LEN=200
export BS=4
export GAS=8 # gradient accumulation steps
./train_mbart_cc25_enro.sh --output_dir enro_finetune_baseline --label_smoothing 0.1 --fp16_opt_level=O1 --logger_name wandb --sortish_sampler
```
-This should take < 2h/epoch on a 16GB v100 and achieve val_avg_ BLEU score above 25. (you can see in wandb or metrics.json).
+This should take < 6h/epoch on a 16GB v100 and achieve a val_avg_bleu score above 25 (you can see metrics in wandb or metrics.json).
To get results in line with fairseq, you need to do some postprocessing.
MultiGPU command
(using 8 GPUS as an example)
```bash
-export ENRO_DIR='wmt_en_ro_packed_train_200' # Download instructions above
+export ENRO_DIR='wmt_en_ro' # Download instructions above
# export WANDB_PROJECT="MT" # optional
export MAX_LEN=200
export BS=4
From 7214954db42ec96603ea596c5f68b16f574fba89 Mon Sep 17 00:00:00 2001
From: Lysandre Debut
Date: Tue, 28 Jul 2020 03:14:31 -0400
Subject: [PATCH 029/127] Should return a tuple for serialization (#6061)
---
src/transformers/pipelines.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py
index 16589469256a..bc1d885cb324 100755
--- a/src/transformers/pipelines.py
+++ b/src/transformers/pipelines.py
@@ -2122,6 +2122,6 @@ def pipeline(
"Model might be a PyTorch model (ending with `.bin`) but PyTorch is not available. "
"Trying to load the model with Tensorflow."
)
- model = model_class.from_pretrained(model, config=config, **model_kwargs)
+ model = model_class.from_pretrained(model, config=config, return_tuple=True, **model_kwargs)
return task_class(model=model, tokenizer=tokenizer, modelcard=modelcard, framework=framework, task=task, **kwargs)
From 018d61fa245c32b1f4f0ff44975ce1292b37aa59 Mon Sep 17 00:00:00 2001
From: Andrés Felipe Cruz
Date: Tue, 28 Jul 2020 01:19:17 -0700
Subject: [PATCH 030/127] Moving transformers package import statements to
relative imports in some files (#5796)
* Moving `from transformers` import statements to relative imports in some files under src/
* Import order
Co-authored-by: Lysandre Debut
---
src/transformers/configuration_encoder_decoder.py | 2 +-
src/transformers/modeling_encoder_decoder.py | 6 +++---
src/transformers/modeling_marian.py | 4 ++--
src/transformers/modeling_mobilebert.py | 3 +--
src/transformers/modeling_tf_electra.py | 3 +--
src/transformers/testing_utils.py | 2 +-
src/transformers/tokenization_auto.py | 3 +--
7 files changed, 10 insertions(+), 13 deletions(-)
diff --git a/src/transformers/configuration_encoder_decoder.py b/src/transformers/configuration_encoder_decoder.py
index 261fa505e938..ae71dbecc175 100644
--- a/src/transformers/configuration_encoder_decoder.py
+++ b/src/transformers/configuration_encoder_decoder.py
@@ -69,7 +69,7 @@ def __init__(self, **kwargs):
decoder_config = kwargs.pop("decoder")
decoder_model_type = decoder_config.pop("model_type")
- from transformers import AutoConfig
+ from .configuration_auto import AutoConfig
self.encoder = AutoConfig.for_model(encoder_model_type, **encoder_config)
self.decoder = AutoConfig.for_model(decoder_model_type, **decoder_config)
diff --git a/src/transformers/modeling_encoder_decoder.py b/src/transformers/modeling_encoder_decoder.py
index 6db2d176e2bf..52d7058d1b2b 100644
--- a/src/transformers/modeling_encoder_decoder.py
+++ b/src/transformers/modeling_encoder_decoder.py
@@ -56,12 +56,12 @@ def __init__(
super().__init__(config)
if encoder is None:
- from transformers import AutoModel
+ from .modeling_auto import AutoModel
encoder = AutoModel.from_config(config.encoder)
if decoder is None:
- from transformers import AutoModelForCausalLM
+ from .modeling_auto import AutoModelForCausalLM
decoder = AutoModelForCausalLM.from_config(config.decoder)
@@ -159,7 +159,7 @@ def from_encoder_decoder_pretrained(
from .modeling_auto import AutoModelForCausalLM
if "config" not in kwargs_decoder:
- from transformers import AutoConfig
+ from .configuration_auto import AutoConfig
decoder_config = AutoConfig.from_pretrained(decoder_pretrained_model_name_or_path)
if decoder_config.is_decoder is False:
diff --git a/src/transformers/modeling_marian.py b/src/transformers/modeling_marian.py
index 16ebf838f80b..e747ee515a13 100644
--- a/src/transformers/modeling_marian.py
+++ b/src/transformers/modeling_marian.py
@@ -15,8 +15,8 @@
"""PyTorch MarianMTModel model, ported from the Marian C++ repo."""
-from transformers.configuration_marian import MarianConfig
-from transformers.modeling_bart import BartForConditionalGeneration
+from .configuration_marian import MarianConfig
+from .modeling_bart import BartForConditionalGeneration
MARIAN_PRETRAINED_MODEL_ARCHIVE_LIST = [
diff --git a/src/transformers/modeling_mobilebert.py b/src/transformers/modeling_mobilebert.py
index 4d78ca0396f6..13c9ade0270c 100644
--- a/src/transformers/modeling_mobilebert.py
+++ b/src/transformers/modeling_mobilebert.py
@@ -32,8 +32,6 @@
from torch import nn
from torch.nn import CrossEntropyLoss, MSELoss
-from transformers.modeling_bert import BertIntermediate
-
from .activations import gelu, gelu_new, swish
from .configuration_mobilebert import MobileBertConfig
from .file_utils import (
@@ -43,6 +41,7 @@
add_start_docstrings_to_callable,
replace_return_docstrings,
)
+from .modeling_bert import BertIntermediate
from .modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPooling,
diff --git a/src/transformers/modeling_tf_electra.py b/src/transformers/modeling_tf_electra.py
index 595482ee275e..3d04e22a2a95 100644
--- a/src/transformers/modeling_tf_electra.py
+++ b/src/transformers/modeling_tf_electra.py
@@ -2,8 +2,7 @@
import tensorflow as tf
-from transformers import ElectraConfig
-
+from .configuration_electra import ElectraConfig
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_bert import ACT2FN, TFBertEncoder, TFBertPreTrainedModel
from .modeling_tf_utils import (
diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py
index d6fbabcbdfc1..1fb9e83868ce 100644
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -2,7 +2,7 @@
import unittest
from distutils.util import strtobool
-from transformers.file_utils import _tf_available, _torch_available, _torch_tpu_available
+from .file_utils import _tf_available, _torch_available, _torch_tpu_available
SMALL_MODEL_IDENTIFIER = "julien-c/bert-xsmall-dummy"
diff --git a/src/transformers/tokenization_auto.py b/src/transformers/tokenization_auto.py
index 7e2992c78285..f16324ab7878 100644
--- a/src/transformers/tokenization_auto.py
+++ b/src/transformers/tokenization_auto.py
@@ -18,8 +18,6 @@
import logging
from collections import OrderedDict
-from transformers.configuration_mobilebert import MobileBertConfig
-
from .configuration_auto import (
AlbertConfig,
AutoConfig,
@@ -44,6 +42,7 @@
XLNetConfig,
)
from .configuration_marian import MarianConfig
+from .configuration_mobilebert import MobileBertConfig
from .configuration_utils import PretrainedConfig
from .tokenization_albert import AlbertTokenizer
from .tokenization_bart import BartTokenizer, MBartTokenizer
From 842eb456062a0200963f3bca3cff41ddfd3da479 Mon Sep 17 00:00:00 2001
From: Tanmay Thakur
Date: Tue, 28 Jul 2020 13:55:12 +0530
Subject: [PATCH 031/127] New Community NB Add (#5824)
Signed-off-by: lordtt13
---
notebooks/README.md | 1 +
1 file changed, 1 insertion(+)
diff --git a/notebooks/README.md b/notebooks/README.md
index c2e57c63715b..b1c7df1070f5 100644
--- a/notebooks/README.md
+++ b/notebooks/README.md
@@ -41,3 +41,4 @@ Pull Request so it can be included under the Community notebooks.
|[Fine-tune T5 for Summarization](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb)|How to fine-tune T5 for summarization in PyTorch and track experiments with WandB|[Abhishek Kumar Mishra](https://github.com/abhimishra91) |[](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb)|
|[Speed up Fine-Tuning in Transformers with Dynamic Padding / Bucketing](https://github.com/ELS-RD/transformers-notebook/blob/master/Divide_Hugging_Face_Transformers_training_time_by_2_or_more.ipynb)|How to speed up fine-tuning by a factor of 2 using dynamic padding / bucketing|[Michael Benesty](https://github.com/pommedeterresautee) |[](https://colab.research.google.com/drive/1CBfRU1zbfu7-ijiOqAAQUA-RJaxfcJoO?usp=sharing)|
|[Pretrain Reformer for Masked Language Modeling](https://github.com/patrickvonplaten/notebooks/blob/master/Reformer_For_Masked_LM.ipynb)| How to train a Reformer model with bi-directional self-attention layers | [Patrick von Platen](https://github.com/patrickvonplaten) | [](https://colab.research.google.com/drive/1tzzh0i8PgDQGV3SMFUGxM7_gGae3K-uW?usp=sharing)|
+|[Expand and Fine Tune Sci-BERT](https://github.com/lordtt13/word-embeddings/blob/master/COVID-19%20Research%20Data/COVID-SciBERT.ipynb)| How to increase vocabulary of a pretrained SciBERT model from AllenAI on the CORD dataset and pipeline it. | [Tanmay Thakur](https://github.com/lordtt13) | [](https://colab.research.google.com/drive/1rqAR40goxbAfez1xvF3hBJphSCsvXmh8)|
From 3c7fbf35a6c9237e8bbceb5b4f315980ed10d8a0 Mon Sep 17 00:00:00 2001
From: Sam Shleifer
Date: Tue, 28 Jul 2020 08:18:11 -0400
Subject: [PATCH 032/127] MBART: support summarization tasks where max_src_len
> max_tgt_len (#6003)
* MBART: support summarization tasks
* fix test
* Style
* add tokenizer test
---
examples/seq2seq/README.md | 2 ++
examples/seq2seq/finetune.py | 13 +++++++------
examples/seq2seq/finetune_t5.sh | 1 +
examples/seq2seq/test_seq2seq_examples.py | 15 ++++++++-------
examples/seq2seq/utils.py | 4 +++-
src/transformers/tokenization_bart.py | 6 +++++-
tests/test_tokenization_mbart.py | 12 ++++++++++++
7 files changed, 38 insertions(+), 15 deletions(-)
diff --git a/examples/seq2seq/README.md b/examples/seq2seq/README.md
index a579d728b59f..1e12242cbb2d 100644
--- a/examples/seq2seq/README.md
+++ b/examples/seq2seq/README.md
@@ -180,6 +180,8 @@ python run_eval.py sshleifer/distilbart-cnn-12-6 $DATA_DIR/val.source dbart_val_
--task summarization \
--n_obs 100 \
--device cuda \
+ --max_source_length 1024 \
+ --max_target_length 56 \
--fp16 \
--bs 32
```
diff --git a/examples/seq2seq/finetune.py b/examples/seq2seq/finetune.py
index 1866042cbb3b..e2e9ecffa26b 100644
--- a/examples/seq2seq/finetune.py
+++ b/examples/seq2seq/finetune.py
@@ -105,7 +105,13 @@ def __init__(self, hparams, **kwargs):
self.hparams.git_sha = get_git_info()["repo_sha"]
self.num_workers = hparams.num_workers
self.decoder_start_token_id = None
- self.dataset_class = Seq2SeqDataset
+ if self.model.config.decoder_start_token_id is None and isinstance(self.tokenizer, MBartTokenizer):
+ self.decoder_start_token_id = self.tokenizer.lang_code_to_id[hparams.tgt_lang]
+ self.model.config.decoder_start_token_id = self.decoder_start_token_id
+ if isinstance(self.tokenizer, MBartTokenizer):
+ self.dataset_class = MBartDataset
+ else:
+ self.dataset_class = Seq2SeqDataset
def freeze_embeds(self):
"""Freeze token embeddings and positional embeddings for bart, just token embeddings for t5."""
@@ -331,11 +337,6 @@ def __init__(self, hparams, **kwargs):
super().__init__(hparams, **kwargs)
self.dataset_kwargs["src_lang"] = hparams.src_lang
self.dataset_kwargs["tgt_lang"] = hparams.tgt_lang
- if self.model.config.decoder_start_token_id is None and isinstance(self.tokenizer, MBartTokenizer):
- self.decoder_start_token_id = self.tokenizer.lang_code_to_id[hparams.tgt_lang]
- self.model.config.decoder_start_token_id = self.decoder_start_token_id
- if isinstance(self.tokenizer, MBartTokenizer):
- self.dataset_class = MBartDataset
def calc_generative_metrics(self, preds, target) -> dict:
return calculate_bleu_score(preds, target)
diff --git a/examples/seq2seq/finetune_t5.sh b/examples/seq2seq/finetune_t5.sh
index ed8d26634cf6..0021107bb623 100755
--- a/examples/seq2seq/finetune_t5.sh
+++ b/examples/seq2seq/finetune_t5.sh
@@ -8,6 +8,7 @@ python finetune.py \
--eval_batch_size=$BS \
--output_dir=$OUTPUT_DIR \
--max_source_length=512 \
+--max_target_length=56 \
--val_check_interval=0.1 --n_val=200 \
--do_train --do_predict \
$@
diff --git a/examples/seq2seq/test_seq2seq_examples.py b/examples/seq2seq/test_seq2seq_examples.py
index 191bbfac70fd..44e3d6c703dc 100644
--- a/examples/seq2seq/test_seq2seq_examples.py
+++ b/examples/seq2seq/test_seq2seq_examples.py
@@ -300,14 +300,17 @@ def test_mbart_dataset_truncation():
tmp_dir = make_test_data_dir()
max_len_source = max(len(tokenizer.encode(a)) for a in ARTICLES)
max_len_target = max(len(tokenizer.encode(a)) for a in SUMMARIES)
- trunc = 4
+ max_src_len = 4
+ max_tgt_len = 8
+ assert max_len_target > max_src_len # Truncated
+ assert max_len_source > max_src_len
src_lang, tgt_lang = "ro_RO", "de_DE" # NOT WHAT IT WAS TRAINED ON
train_dataset = MBartDataset(
tokenizer,
data_dir=tmp_dir,
type_path="train",
- max_source_length=trunc,
- max_target_length=1000, # ignored
+ max_source_length=max_src_len,
+ max_target_length=max_tgt_len, # ignored
src_lang=src_lang,
tgt_lang=tgt_lang,
)
@@ -316,17 +319,15 @@ def test_mbart_dataset_truncation():
assert isinstance(batch, dict)
assert batch["attention_mask"].shape == batch["input_ids"].shape
# show that articles were trimmed.
- assert batch["input_ids"].shape[1] == trunc
+ assert batch["input_ids"].shape[1] == max_src_len
# show that targets are the same len
- assert batch["decoder_input_ids"].shape[1] == trunc
+ assert batch["decoder_input_ids"].shape[1] == max_tgt_len
# check language codes in correct place
assert batch["decoder_input_ids"][0, 0].item() == tokenizer.lang_code_to_id[tgt_lang]
assert batch["decoder_input_ids"][0, -1].item() == tokenizer.eos_token_id
assert batch["input_ids"][0, -2].item() == tokenizer.eos_token_id
assert batch["input_ids"][0, -1].item() == tokenizer.lang_code_to_id[src_lang]
- assert max_len_target > trunc # Truncated
- assert max_len_source > trunc
break # No need to test every batch
diff --git a/examples/seq2seq/utils.py b/examples/seq2seq/utils.py
index c2c484735f39..49910ab62162 100644
--- a/examples/seq2seq/utils.py
+++ b/examples/seq2seq/utils.py
@@ -157,7 +157,8 @@ def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
if self.max_source_length != self.max_target_length:
warnings.warn(
- f"Mbart will ignore max_target_length = {self.max_target_length} and use {self.max_source_length} for both sides."
+ f"Mbart is using sequence lengths {self.max_source_length}, {self.max_target_length}. "
+ f"Imbalanced sequence lengths may be undesired for translation tasks"
)
def __getitem__(self, index) -> Dict[str, str]:
@@ -178,6 +179,7 @@ def collate_fn(self, batch) -> Dict[str, torch.Tensor]:
tgt_texts=[x["tgt_texts"] for x in batch],
tgt_lang=self.tgt_lang,
max_length=self.max_source_length,
+ max_target_length=self.max_target_length,
)
return batch_encoding.data
diff --git a/src/transformers/tokenization_bart.py b/src/transformers/tokenization_bart.py
index 90353ddd317b..c83ad0d333e2 100644
--- a/src/transformers/tokenization_bart.py
+++ b/src/transformers/tokenization_bart.py
@@ -193,6 +193,7 @@ def prepare_translation_batch(
tgt_texts: Optional[List[str]] = None,
tgt_lang: str = "ro_RO",
max_length: Optional[int] = None,
+ max_target_length: Optional[int] = None,
padding: str = "longest",
return_tensors: str = "pt",
**kwargs,
@@ -224,13 +225,16 @@ def prepare_translation_batch(
)
if tgt_texts is None:
return model_inputs
+ # Process tgt_texts
+ if max_target_length is None:
+ max_target_length = max_length
self.set_tgt_lang_special_tokens(tgt_lang)
decoder_inputs: BatchEncoding = self(
tgt_texts,
add_special_tokens=True,
return_tensors=return_tensors,
padding=padding,
- max_length=max_length,
+ max_length=max_target_length,
truncation=True,
**kwargs,
)
diff --git a/tests/test_tokenization_mbart.py b/tests/test_tokenization_mbart.py
index d45a5ee6026b..14566ac97598 100644
--- a/tests/test_tokenization_mbart.py
+++ b/tests/test_tokenization_mbart.py
@@ -137,6 +137,18 @@ def test_enro_tokenizer_prepare_translation_batch(self):
self.assertEqual(self.tokenizer.prefix_tokens, [])
self.assertEqual(self.tokenizer.suffix_tokens, [self.tokenizer.eos_token_id, EN_CODE])
+ def test_max_target_length(self):
+
+ batch = self.tokenizer.prepare_translation_batch(
+ self.src_text, tgt_texts=self.tgt_text, max_length=3, max_target_length=10
+ )
+ self.assertEqual(batch.input_ids.shape[1], 3)
+ self.assertEqual(batch.decoder_input_ids.shape[1], 10)
+ # max_target_length will default to max_length if not specified
+ batch = self.tokenizer.prepare_translation_batch(self.src_text, tgt_texts=self.tgt_text, max_length=3)
+ self.assertEqual(batch.input_ids.shape[1], 3)
+ self.assertEqual(batch.decoder_input_ids.shape[1], 3)
+
def test_enro_tokenizer_batch_encode_plus(self):
ids = self.tokenizer.batch_encode_plus(self.src_text).input_ids[0]
self.assertListEqual(self.expected_src_tokens, ids)
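To make the intent of the new `max_target_length` argument concrete, here is a minimal usage sketch (checkpoint name and texts are illustrative): source and target sequences can now be truncated to different lengths, which matters for summarization where the source is much longer than the target.

```python
from transformers import MBartTokenizer

tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-cc25")
batch = tokenizer.prepare_translation_batch(
    ["A fairly long source document ..."],
    tgt_texts=["A short summary."],
    max_length=1024,       # truncation length for the source side
    max_target_length=56,  # truncation length for the target side; defaults to max_length
)
print(batch["input_ids"].shape, batch["decoder_input_ids"].shape)
```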
From 4f814fd587811b49d1c3b07050c88d2e393a5bd3 Mon Sep 17 00:00:00 2001
From: Pavel Soriano
Date: Tue, 28 Jul 2020 14:33:52 +0200
Subject: [PATCH 033/127] [Model Card] camembert-base-squadFR-fquad-piaf
(#6087)
---
.../camembert-base-squadFR-fquad-piaf/README.md | 15 ++++++++++++---
1 file changed, 12 insertions(+), 3 deletions(-)
diff --git a/model_cards/etalab-ia/camembert-base-squadFR-fquad-piaf/README.md b/model_cards/etalab-ia/camembert-base-squadFR-fquad-piaf/README.md
index 38c2411a7ce1..44f78e876148 100644
--- a/model_cards/etalab-ia/camembert-base-squadFR-fquad-piaf/README.md
+++ b/model_cards/etalab-ia/camembert-base-squadFR-fquad-piaf/README.md
@@ -1,5 +1,14 @@
---
language: fr
+datasets:
+- PIAF
+- FQuAD
+- SQuAD-FR
+widget:
+- text: "Comment s'appelle le portail open data du gouvernement ?"
+ context: "Etalab est une administration publique française qui fait notamment office de Chief Data Officer de l'État et coordonne la conception et la mise en œuvre de sa stratégie dans le domaine de la donnée (ouverture et partage des données publiques ou open data, exploitation des données et intelligence artificielle...). Ainsi, Etalab développe et maintient le portail des données ouvertes du gouvernement français data.gouv.fr.
+Etalab promeut également une plus grande ouverture l'administration sur la société (gouvernement ouvert) : transparence de l'action publique, innovation ouverte, participation citoyenne... elle promeut l’innovation, l’expérimentation, les méthodes de travail ouvertes, agiles et itératives, ainsi que les synergies avec la société civile pour décloisonner l’administration et favoriser l’adoption des meilleures pratiques professionnelles dans le domaine du numérique. À ce titre elle étudie notamment l’opportunité de recourir à des technologies en voie de maturation issues du monde de la recherche.
+Cette entité chargée de l'innovation au sein de l'administration doit contribuer à l'amélioration du service public grâce au numérique. Elle est rattachée à la Direction interministérielle du numérique, dont les missions et l’organisation ont été fixées par le décret du 30 octobre 2019. Dirigé par Laure Lucchesi depuis 2016, elle rassemble une équipe pluridisciplinaire d'une trentaine de personnes."
---
# camembert-base-squadFR-fquad-piaf
@@ -30,7 +39,7 @@ python run_squad.py \
```
## Evaluation results
-### Fquad v1.0 Evaluation
+### FQuAD v1.0 Evaluation
```shell
{"f1": 79.81, "exact_match": 55.14}
```
@@ -74,7 +83,7 @@ nlp({
```
-### Fquad
+### FQuAD
```
@article{dHoffschmidt2020FQuADFQ,
title={FQuAD: French Question Answering Dataset},
@@ -87,7 +96,7 @@ nlp({
### SQuAD-FR
```
- @MISC{maldives,
+ @MISC{kabbadj2018,
author = "Kabbadj, Ali",
title = "Something new in French Text Mining and Information Extraction (Universal Chatbot): Largest Q&A French training dataset (110 000+) ",
editor = "linkedin.com",
From f0c70085c2c3115bf02acd9bb56255c6f113edf6 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Tue, 28 Jul 2020 05:34:58 -0700
Subject: [PATCH 034/127] link to README.md (#6068)
* add a link to README.md
* Update README.md
---
examples/seq2seq/README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/examples/seq2seq/README.md b/examples/seq2seq/README.md
index 1e12242cbb2d..033d200c0994 100644
--- a/examples/seq2seq/README.md
+++ b/examples/seq2seq/README.md
@@ -3,7 +3,7 @@
This directory contains examples for finetuning and evaluating transformers on summarization and translation tasks.
Summarization support is more mature than translation support.
Please tag @sshleifer with any issues/unexpected behaviors, or send a PR!
-For `bertabs` instructions, see `bertabs/README.md`.
+For `bertabs` instructions, see [`bertabs/README.md`](bertabs/README.md).
### Data
From 31a5486e42a50bd19cba5c2300f54aa159fab2ed Mon Sep 17 00:00:00 2001
From: Sam Shleifer
Date: Tue, 28 Jul 2020 08:41:27 -0400
Subject: [PATCH 035/127] github issue template suggests who to tag (#5790)
Co-authored-by: Julien Chaumond
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Lysandre Debut
Co-authored-by: Teven
---
.github/ISSUE_TEMPLATE/bug-report.md | 55 ++++++++++++++++++++--------
1 file changed, 40 insertions(+), 15 deletions(-)
diff --git a/.github/ISSUE_TEMPLATE/bug-report.md b/.github/ISSUE_TEMPLATE/bug-report.md
index 754089eaa29b..b9515d69004d 100644
--- a/.github/ISSUE_TEMPLATE/bug-report.md
+++ b/.github/ISSUE_TEMPLATE/bug-report.md
@@ -7,14 +7,51 @@ assignees: ''
---
-# 🛠Bug
+
+## Environment info
+
+
+- `transformers` version:
+- Platform:
+- Python version:
+- PyTorch version (GPU?):
+- Tensorflow version (GPU?):
+- Using GPU in script?:
+- Using distributed or parallel set-up in script?:
+
+### Who can help
+
## Information
Model I am using (Bert, XLNet ...):
-Language I am using the model on (English, Chinese ...):
-
The problem arises when using:
* [ ] the official example scripts: (give details below)
* [ ] my own modified scripts: (give details below)
@@ -38,15 +75,3 @@ Steps to reproduce the behavior:
## Expected behavior
-
-## Environment info
-
-
-- `transformers` version:
-- Platform:
-- Python version:
-- PyTorch version (GPU?):
-- Tensorflow version (GPU?):
-- Using GPU in script?:
-- Using distributed or parallel set-up in script?:
From 0206efb4cfcffd9d1cf349b098892cc49c9a3efc Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Date: Tue, 28 Jul 2020 09:08:20 -0400
Subject: [PATCH 036/127] Make all data collators accept dict (#6065)
* Make all data collators accept dict
* Style
---
src/transformers/data/data_collator.py | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/src/transformers/data/data_collator.py b/src/transformers/data/data_collator.py
index b4d9f205b95b..29d7bf43a2d7 100644
--- a/src/transformers/data/data_collator.py
+++ b/src/transformers/data/data_collator.py
@@ -1,5 +1,5 @@
from dataclasses import dataclass
-from typing import Any, Callable, Dict, List, NewType, Tuple
+from typing import Any, Callable, Dict, List, NewType, Tuple, Union
import torch
from torch.nn.utils.rnn import pad_sequence
@@ -77,7 +77,9 @@ class DataCollatorForLanguageModeling:
mlm: bool = True
mlm_probability: float = 0.15
- def __call__(self, examples: List[torch.Tensor]) -> Dict[str, torch.Tensor]:
+ def __call__(self, examples: List[Union[torch.Tensor, Dict[str, torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+ if isinstance(examples[0], dict):
+ examples = [e["input_ids"] for e in examples]
batch = self._tensorize_batch(examples)
if self.mlm:
inputs, labels = self.mask_tokens(batch)
@@ -148,7 +150,9 @@ class DataCollatorForPermutationLanguageModeling:
plm_probability: float = 1 / 6
max_span_length: int = 5 # maximum length of a span of masked tokens
- def __call__(self, examples: List[torch.Tensor]) -> Dict[str, torch.Tensor]:
+ def __call__(self, examples: List[Union[torch.Tensor, Dict[str, torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+ if isinstance(examples[0], dict):
+ examples = [e["input_ids"] for e in examples]
batch = self._tensorize_batch(examples)
inputs, perm_mask, target_mapping, labels = self.mask_tokens(batch)
return {"input_ids": inputs, "perm_mask": perm_mask, "target_mapping": target_mapping, "labels": labels}
From 54f49af4aef2b19aaf00ffa400ff6c1e4292e9dd Mon Sep 17 00:00:00 2001
From: Clement
Date: Tue, 28 Jul 2020 09:14:00 -0400
Subject: [PATCH 037/127] Add inference widget examples (#5825)
---
README.md | 15 +++++++++++----
1 file changed, 11 insertions(+), 4 deletions(-)
diff --git a/README.md b/README.md
index 8e6c1a4e6a3e..38a3af573aa9 100644
--- a/README.md
+++ b/README.md
@@ -174,12 +174,19 @@ These implementations have been tested on several datasets (see the example scri
## Online demo
-**[Write With Transformer](https://transformer.huggingface.co)**, built by the Hugging Face team at transformer.huggingface.co, is the official demo of this repo’s text generation capabilities.
-You can use it to experiment with completions generated by `GPT2Model`, `TransfoXLModel`, and `XLNetModel`.
+You can test our inference API on most model pages from the model hub: https://huggingface.co/models
+
+For example:
+- [Masked word completion with BERT](https://huggingface.co/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France)
+- [NER with Electra](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city)
+- [Text generation with GPT-2](https://huggingface.co/gpt2?text=A+long+time+ago%2C+)
+- [NLI with RoBERTa](https://huggingface.co/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal)
+- [Summarization with BART](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct)
+- [Question answering with DistilBERT](https://huggingface.co/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species)
+- [Translation with T5](https://huggingface.co/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin)
-> “🦄 Write with transformer is to writing what calculators are to calculus.”
-
+**[Write With Transformer](https://transformer.huggingface.co)**, built by the Hugging Face team at transformer.huggingface.co, is the official demo of this repo’s text generation capabilities.
## Quick tour
From 5e97c82940a51ee95324b23f66b66692fa419182 Mon Sep 17 00:00:00 2001
From: Manuel Romero
Date: Tue, 28 Jul 2020 15:36:00 +0200
Subject: [PATCH 038/127] Create README.md (#6076)
---
.../mrm8488/gpt2-finetuned-recipes-cooking_v2/README.md | 6 ++++++
1 file changed, 6 insertions(+)
create mode 100644 model_cards/mrm8488/gpt2-finetuned-recipes-cooking_v2/README.md
diff --git a/model_cards/mrm8488/gpt2-finetuned-recipes-cooking_v2/README.md b/model_cards/mrm8488/gpt2-finetuned-recipes-cooking_v2/README.md
new file mode 100644
index 000000000000..1e8029f30425
--- /dev/null
+++ b/model_cards/mrm8488/gpt2-finetuned-recipes-cooking_v2/README.md
@@ -0,0 +1,6 @@
+---
+language: en
+thumbnail:
+widget:
+- text: "HuggingFace Cake:"
+---
From 28931f81b73272f39a453a9fb0497f564328d758 Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Date: Tue, 28 Jul 2020 09:48:39 -0400
Subject: [PATCH 039/127] Fix #6092 (#6093)
* Fix #6092
* Format
---
src/transformers/data/data_collator.py | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/src/transformers/data/data_collator.py b/src/transformers/data/data_collator.py
index 29d7bf43a2d7..085f7a68a8aa 100644
--- a/src/transformers/data/data_collator.py
+++ b/src/transformers/data/data_collator.py
@@ -5,6 +5,7 @@
from torch.nn.utils.rnn import pad_sequence
from ..tokenization_utils import PreTrainedTokenizer
+from ..tokenization_utils_base import BatchEncoding
InputDataClass = NewType("InputDataClass", Any)
@@ -33,7 +34,7 @@ def default_data_collator(features: List[InputDataClass]) -> Dict[str, torch.Ten
# have the same attributes.
# So we will look at the first element as a proxy for what attributes exist
# on the whole batch.
- if not isinstance(features[0], dict):
+ if not isinstance(features[0], (dict, BatchEncoding)):
features = [vars(f) for f in features]
first = features[0]
@@ -78,7 +79,7 @@ class DataCollatorForLanguageModeling:
mlm_probability: float = 0.15
def __call__(self, examples: List[Union[torch.Tensor, Dict[str, torch.Tensor]]]) -> Dict[str, torch.Tensor]:
- if isinstance(examples[0], dict):
+ if isinstance(examples[0], (dict, BatchEncoding)):
examples = [e["input_ids"] for e in examples]
batch = self._tensorize_batch(examples)
if self.mlm:
@@ -151,7 +152,7 @@ class DataCollatorForPermutationLanguageModeling:
max_span_length: int = 5 # maximum length of a span of masked tokens
def __call__(self, examples: List[Union[torch.Tensor, Dict[str, torch.Tensor]]]) -> Dict[str, torch.Tensor]:
- if isinstance(examples[0], dict):
+ if isinstance(examples[0], (dict, BatchEncoding)):
examples = [e["input_ids"] for e in examples]
batch = self._tensorize_batch(examples)
inputs, perm_mask, target_mapping, labels = self.mask_tokens(batch)
From dc4755c6d59238ffea4843d06610a29c522257fb Mon Sep 17 00:00:00 2001
From: Tanmay Thakur
Date: Tue, 28 Jul 2020 19:30:23 +0530
Subject: [PATCH 040/127] create model-card for lordtt13/emo-mobilebert (#6030)
---
model_cards/lordtt13/emo-mobilebert/README.md | 44 +++++++++++++++++++
1 file changed, 44 insertions(+)
create mode 100644 model_cards/lordtt13/emo-mobilebert/README.md
diff --git a/model_cards/lordtt13/emo-mobilebert/README.md b/model_cards/lordtt13/emo-mobilebert/README.md
new file mode 100644
index 000000000000..18d6b496a900
--- /dev/null
+++ b/model_cards/lordtt13/emo-mobilebert/README.md
@@ -0,0 +1,44 @@
+---
+language: en
+datasets:
+- emo
+---
+
+## Emo-MobileBERT: a thin version of BERT LARGE, trained on the EmoContext Dataset from scratch
+
+### Details of MobileBERT
+
+The **MobileBERT** model was presented in [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by *Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, Denny Zhou* and here is the abstract:
+
+Natural Language Processing (NLP) has recently achieved great success by using huge pre-trained models with hundreds of millions of parameters. However, these models suffer from heavy model sizes and high latency such that they cannot be deployed to resource-limited mobile devices. In this paper, we propose MobileBERT for compressing and accelerating the popular BERT model. Like the original BERT, MobileBERT is task-agnostic, that is, it can be generically applied to various downstream NLP tasks via simple fine-tuning. Basically, MobileBERT is a thin version of BERT_LARGE, while equipped with bottleneck structures and a carefully designed balance between self-attentions and feed-forward networks. To train MobileBERT, we first train a specially designed teacher model, an inverted-bottleneck incorporated BERT_LARGE model. Then, we conduct knowledge transfer from this teacher to MobileBERT. Empirical studies show that MobileBERT is 4.3x smaller and 5.5x faster than BERT_BASE while achieving competitive results on well-known benchmarks. On the natural language inference tasks of GLUE, MobileBERT achieves a GLUE score of 77.7 (0.6 lower than BERT_BASE), and 62 ms latency on a Pixel 4 phone. On the SQuAD v1.1/v2.0 question answering task, MobileBERT achieves a dev F1 score of 90.0/79.2 (1.5/2.1 higher than BERT_BASE).
+
+### Details of the downstream task (Emotion Recognition) - Dataset 📚
+
+SemEval-2019 Task 3: EmoContext Contextual Emotion Detection in Text
+
+In this dataset, given a textual dialogue i.e. an utterance along with two previous turns of context, the goal was to infer the underlying emotion of the utterance by choosing from four emotion classes:
+
+ - sad 😢
+ - happy 😃
+ - angry 😡
+ - others
+
+### Model training
+
+The training script is present [here](https://github.com/lordtt13/transformers-experiments/blob/master/Custom%20Tasks/emo-mobilebert.ipynb).
+
+### Pipelining the Model
+
+```python
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
+
+tokenizer = AutoTokenizer.from_pretrained("lordtt13/emo-mobilebert")
+
+model = AutoModelForSequenceClassification.from_pretrained("lordtt13/emo-mobilebert")
+
+nlp_sentence_classif = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
+nlp_sentence_classif("I've never had such a bad day in my life")
+# Output: [{'label': 'sad', 'score': 0.93153977394104}]
+```
+
+> Created by [Tanmay Thakur](https://github.com/lordtt13) | [LinkedIn](https://www.linkedin.com/in/tanmay-thakur-6bb5a9154/)
From dafa296c952c08fca3686f1cf8f3a8f8eb116744 Mon Sep 17 00:00:00 2001
From: Sam Shleifer
Date: Tue, 28 Jul 2020 11:24:23 -0400
Subject: [PATCH 041/127] [s2s] Delete useless method, log tokens_per_batch
(#6081)
---
examples/seq2seq/finetune.py | 23 ++++++++++++++---------
examples/seq2seq/utils.py | 6 ------
2 files changed, 14 insertions(+), 15 deletions(-)
diff --git a/examples/seq2seq/finetune.py b/examples/seq2seq/finetune.py
index e2e9ecffa26b..c71382954604 100644
--- a/examples/seq2seq/finetune.py
+++ b/examples/seq2seq/finetune.py
@@ -160,9 +160,16 @@ def _step(self, batch: dict) -> Tuple:
)
return (loss,)
+ @property
+ def pad(self) -> int:
+ return self.tokenizer.pad_token_id
+
def training_step(self, batch, batch_idx) -> Dict:
loss_tensors = self._step(batch)
+
logs = {name: loss for name, loss in zip(self.loss_names, loss_tensors)}
+ # tokens per batch
+ logs["tpb"] = batch["input_ids"].ne(self.pad).sum() + batch["decoder_input_ids"].ne(self.pad).sum()
return {"loss": loss_tensors[0], "log": logs}
def validation_step(self, batch, batch_idx) -> Dict:
@@ -172,7 +179,7 @@ def validation_epoch_end(self, outputs, prefix="val") -> Dict:
self.step_count += 1
losses = {k: torch.stack([x[k] for x in outputs]).mean() for k in self.loss_names}
loss = losses["loss"]
- rouges = {k: np.array([x[k] for x in outputs]).mean() for k in self.metric_names + ["gen_time", "summ_len"]}
+ rouges = {k: np.array([x[k] for x in outputs]).mean() for k in self.metric_names + ["gen_time", "gen_len"]}
rouge_tensor: torch.FloatTensor = torch.tensor(rouges[self.val_metric]).type_as(loss)
rouges.update({k: v.item() for k, v in losses.items()})
losses.update(rouges)
@@ -190,23 +197,21 @@ def calc_generative_metrics(self, preds, target) -> Dict:
return calculate_rouge(preds, target)
def _generative_step(self, batch: dict) -> dict:
- pad_token_id = self.tokenizer.pad_token_id
- source_ids, source_mask, y = Seq2SeqDataset.trim_seq2seq_batch(batch, pad_token_id)
t0 = time.time()
generated_ids = self.model.generate(
- input_ids=source_ids,
- attention_mask=source_mask,
+ batch["input_ids"],
+ attention_mask=batch["attention_mask"],
use_cache=True,
decoder_start_token_id=self.decoder_start_token_id,
)
- gen_time = (time.time() - t0) / source_ids.shape[0]
- preds = self.ids_to_clean_text(generated_ids)
- target = self.ids_to_clean_text(y)
+ gen_time = (time.time() - t0) / batch["input_ids"].shape[0]
+ preds: List[str] = self.ids_to_clean_text(generated_ids)
+ target: List[str] = self.ids_to_clean_text(batch["decoder_input_ids"])
loss_tensors = self._step(batch)
base_metrics = {name: loss for name, loss in zip(self.loss_names, loss_tensors)}
rouge: Dict = self.calc_generative_metrics(preds, target)
summ_len = np.mean(lmap(len, generated_ids))
- base_metrics.update(gen_time=gen_time, summ_len=summ_len, preds=preds, target=target, **rouge)
+ base_metrics.update(gen_time=gen_time, gen_len=summ_len, preds=preds, target=target, **rouge)
return base_metrics
def test_step(self, batch, batch_idx):
diff --git a/examples/seq2seq/utils.py b/examples/seq2seq/utils.py
index 49910ab62162..7d9288333c9d 100644
--- a/examples/seq2seq/utils.py
+++ b/examples/seq2seq/utils.py
@@ -128,12 +128,6 @@ def __getitem__(self, index) -> Dict[str, torch.Tensor]:
def get_char_lens(data_file):
return [len(x) for x in Path(data_file).open().readlines()]
- @staticmethod
- def trim_seq2seq_batch(batch, pad_token_id) -> tuple:
- y = trim_batch(batch["decoder_input_ids"], pad_token_id)
- source_ids, source_mask = trim_batch(batch["input_ids"], pad_token_id, attention_mask=batch["attention_mask"])
- return source_ids, source_mask, y
-
def collate_fn(self, batch) -> Dict[str, torch.Tensor]:
input_ids = torch.stack([x["input_ids"] for x in batch])
masks = torch.stack([x["attention_mask"] for x in batch])
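The `tpb` value logged above counts the non-padding tokens on both the encoder and decoder side of a batch. A self-contained illustration of the same computation with toy tensors (pad id chosen arbitrarily):

```python
import torch

pad_token_id = 1
input_ids = torch.tensor([[5, 6, 7, 1, 1], [8, 9, 1, 1, 1]])
decoder_input_ids = torch.tensor([[5, 6, 1], [7, 1, 1]])

# tokens per batch: every entry that is not the pad token, summed over both sides
tpb = input_ids.ne(pad_token_id).sum() + decoder_input_ids.ne(pad_token_id).sum()
print(tpb.item())  # 5 + 3 = 8
```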
From 06834bc33255f5fb8fabb72c9ff114764b3c7ce5 Mon Sep 17 00:00:00 2001
From: Lysandre Debut
Date: Tue, 28 Jul 2020 12:44:25 -0400
Subject: [PATCH 042/127] Logs should not be hidden behind a logger.info
(#6097)
---
src/transformers/trainer.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 06d467a354e2..2df9113e1e27 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -646,7 +646,7 @@ def log(self, logs: Dict[str, float], iterator: Optional[tqdm] = None) -> None:
if iterator is not None:
iterator.write(output)
else:
- logger.info(output)
+ print(output)
def _prepare_inputs(
self, inputs: Dict[str, Union[torch.Tensor, Any]], model: nn.Module
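The rationale for switching from `logger.info` to `print` is that, with Python's default logging configuration, INFO records are simply dropped, so training metrics never reach the console unless the user configures logging. A short illustration:

```python
import logging

logger = logging.getLogger("transformers.trainer")

logger.info({"loss": 0.42, "learning_rate": 3e-05})  # silent: effective level is WARNING by default
print({"loss": 0.42, "learning_rate": 3e-05})        # always visible
```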
From b1c8b76907ad605c7b25bb12580cb46d70207b7a Mon Sep 17 00:00:00 2001
From: Joe Davison
Date: Tue, 28 Jul 2020 14:46:03 -0400
Subject: [PATCH 043/127] Fix zero-shot pipeline single seq output shape
(#6104)
---
src/transformers/pipelines.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py
index bc1d885cb324..2e6b1f4917a7 100755
--- a/src/transformers/pipelines.py
+++ b/src/transformers/pipelines.py
@@ -959,7 +959,7 @@ def __call__(self, sequences, candidate_labels, hypothesis_template="This exampl
top_inds = list(reversed(scores[iseq].argsort()))
result.append(
{
- "sequence": sequences if num_sequences == 1 else sequences[iseq],
+ "sequence": sequences if isinstance(sequences, str) else sequences[iseq],
"labels": [candidate_labels[i] for i in top_inds],
"scores": scores[iseq][top_inds].tolist(),
}
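A usage sketch of the fixed behaviour (model name is illustrative): the `sequence` field of each result is now taken directly from the raw string when a single string is passed, and indexed per item only when a list of sequences is passed.

```python
from transformers import pipeline

classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
output = classifier("Who are you voting for in 2020?", candidate_labels=["politics", "sports"])
print(output["sequence"])  # the original input string
```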
From 5abe50381afc3a02cb5776f990bf443f83430ef4 Mon Sep 17 00:00:00 2001
From: Sam Shleifer
Date: Tue, 28 Jul 2020 18:27:58 -0400
Subject: [PATCH 044/127] Fix #6096: MBartTokenizer's mask token (#6098)
---
src/transformers/tokenization_bart.py | 1 +
tests/test_modeling_mbart.py | 12 ++++++++++++
tests/test_tokenization_mbart.py | 11 +++++++++++
3 files changed, 24 insertions(+)
diff --git a/src/transformers/tokenization_bart.py b/src/transformers/tokenization_bart.py
index c83ad0d333e2..499895e0bda6 100644
--- a/src/transformers/tokenization_bart.py
+++ b/src/transformers/tokenization_bart.py
@@ -122,6 +122,7 @@ def __init__(self, *args, **kwargs):
}
self.id_to_lang_code = {v: k for k, v in self.lang_code_to_id.items()}
self.cur_lang_code = self.lang_code_to_id["en_XX"]
+        self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset
self.fairseq_tokens_to_ids.update(self.lang_code_to_id)
self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
diff --git a/tests/test_modeling_mbart.py b/tests/test_modeling_mbart.py
index 0d10a8e406fb..159fc42976b6 100644
--- a/tests/test_modeling_mbart.py
+++ b/tests/test_modeling_mbart.py
@@ -123,6 +123,7 @@ def test_mbart_fast_forward(self):
self.assertEqual(logits.shape, expected_shape)
+@require_torch
class MBartCC25IntegrationTest(AbstractMBartIntegrationTest):
checkpoint_name = "facebook/mbart-large-cc25"
src_text = [
@@ -140,3 +141,14 @@ def test_cc25_generate(self):
)
decoded = self.tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
self.assertEqual(self.tgt_text[0], decoded[0])
+
+ @slow
+ def test_fill_mask(self):
+        inputs = self.tokenizer.prepare_translation_batch(["One of the best <mask> I ever read!"]).to(torch_device)
+ outputs = self.model.generate(
+ inputs["input_ids"], decoder_start_token_id=self.tokenizer.lang_code_to_id["en_XX"], num_beams=1
+ )
+ prediction: str = self.tokenizer.batch_decode(
+ outputs, clean_up_tokenization_spaces=True, skip_special_tokens=True
+ )[0]
+ self.assertEqual(prediction, "of the best books I ever read!")
diff --git a/tests/test_tokenization_mbart.py b/tests/test_tokenization_mbart.py
index 14566ac97598..74bfd5b5bf45 100644
--- a/tests/test_tokenization_mbart.py
+++ b/tests/test_tokenization_mbart.py
@@ -1,3 +1,4 @@
+import tempfile
import unittest
from transformers import AutoTokenizer, BatchEncoding, MBartTokenizer
@@ -171,3 +172,13 @@ def test_enro_tokenizer_truncation(self):
self.assertEqual(ids[-2], 2)
self.assertEqual(ids[-1], EN_CODE)
self.assertEqual(len(ids), desired_max_length)
+
+ def test_mask_token(self):
+        self.assertListEqual(self.tokenizer.convert_tokens_to_ids(["<mask>", "ar_AR"]), [250026, 250001])
+
+ def test_special_tokens_unaffacted_by_save_load(self):
+ tmpdirname = tempfile.mkdtemp()
+ original_special_tokens = self.tokenizer.fairseq_tokens_to_ids
+ self.tokenizer.save_pretrained(tmpdirname)
+ new_tok = MBartTokenizer.from_pretrained(tmpdirname)
+ self.assertDictEqual(new_tok.fairseq_tokens_to_ids, original_special_tokens)
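A quick way to see the fix, mirroring the new `test_mask_token` above (the ids shown are the ones asserted in the test):

```python
from transformers import MBartTokenizer

tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-cc25")
# <mask> now resolves to its own id, placed after the language codes.
print(tokenizer.convert_tokens_to_ids(["<mask>", "ar_AR"]))  # [250026, 250001]
```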
From 40796c5801ed2589f94d674b7b7e71780afc9575 Mon Sep 17 00:00:00 2001
From: Sam Shleifer
Date: Tue, 28 Jul 2020 18:29:18 -0400
Subject: [PATCH 045/127] [fix] add bart to LM_MAPPING (#6099)
---
src/transformers/modeling_auto.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/src/transformers/modeling_auto.py b/src/transformers/modeling_auto.py
index d59f8d6028a5..b3dc19fc1c10 100644
--- a/src/transformers/modeling_auto.py
+++ b/src/transformers/modeling_auto.py
@@ -264,6 +264,7 @@
[
(DistilBertConfig, DistilBertForMaskedLM),
(AlbertConfig, AlbertForMaskedLM),
+ (BartConfig, BartForConditionalGeneration),
(CamembertConfig, CamembertForMaskedLM),
(XLMRobertaConfig, XLMRobertaForMaskedLM),
(LongformerConfig, LongformerForMaskedLM),
From c49cd927f7b59fc1308ff8073bde31ddeb15eed2 Mon Sep 17 00:00:00 2001
From: Sam Shleifer
Date: Tue, 28 Jul 2020 18:29:35 -0400
Subject: [PATCH 046/127] [Fix] position_ids tests again (#6100)
---
src/transformers/modeling_bert.py | 3 +--
tests/test_modeling_auto.py | 6 ++++--
2 files changed, 5 insertions(+), 4 deletions(-)
diff --git a/src/transformers/modeling_bert.py b/src/transformers/modeling_bert.py
index 757eb7c9c7d1..850cae298469 100644
--- a/src/transformers/modeling_bert.py
+++ b/src/transformers/modeling_bert.py
@@ -568,6 +568,7 @@ class BertPreTrainedModel(PreTrainedModel):
config_class = BertConfig
load_tf_weights = load_tf_weights_in_bert
base_model_prefix = "bert"
+ authorized_missing_keys = [r"position_ids"]
def _init_weights(self, module):
""" Initialize the weights """
@@ -699,8 +700,6 @@ class BertModel(BertPreTrainedModel):
"""
- authorized_missing_keys = [r"position_ids"]
-
def __init__(self, config):
super().__init__(config)
self.config = config
diff --git a/tests/test_modeling_auto.py b/tests/test_modeling_auto.py
index cbbf857bc07c..b86506a014eb 100644
--- a/tests/test_modeling_auto.py
+++ b/tests/test_modeling_auto.py
@@ -88,9 +88,11 @@ def test_model_for_pretraining_from_pretrained(self):
model, loading_info = AutoModelForPreTraining.from_pretrained(model_name, output_loading_info=True)
self.assertIsNotNone(model)
self.assertIsInstance(model, BertForPreTraining)
+ # Only one value should not be initialized and in the missing keys.
+ missing_keys = loading_info.pop("missing_keys")
+ self.assertListEqual(["cls.predictions.decoder.bias"], missing_keys)
for key, value in loading_info.items():
- # Only one value should not be initialized and in the missing keys.
- self.assertEqual(len(value), 1 if key == "missing_keys" else 0)
+ self.assertEqual(len(value), 0)
@slow
def test_lmhead_model_from_pretrained(self):
From 92f8ce2ed65f23f91795ce6eafb8cce1e226cd38 Mon Sep 17 00:00:00 2001
From: Sam Shleifer
Date: Tue, 28 Jul 2020 18:30:16 -0400
Subject: [PATCH 047/127] Fix deebert tests (#6102)
---
examples/deebert/test_glue_deebert.py | 27 +++++++++++++--------------
1 file changed, 13 insertions(+), 14 deletions(-)
diff --git a/examples/deebert/test_glue_deebert.py b/examples/deebert/test_glue_deebert.py
index 06a728916ae6..59f7f58024f4 100644
--- a/examples/deebert/test_glue_deebert.py
+++ b/examples/deebert/test_glue_deebert.py
@@ -21,11 +21,13 @@ def get_setup_file():
class DeeBertTests(unittest.TestCase):
- @slow
- def test_glue_deebert(self):
+    def setUp(self) -> None:
stream_handler = logging.StreamHandler(sys.stdout)
logger.addHandler(stream_handler)
+ @slow
+ def test_glue_deebert_train(self):
+
train_args = """
run_glue_deebert.py
--model_type roberta
@@ -48,6 +50,10 @@ def test_glue_deebert(self):
--overwrite_cache
--eval_after_first_stage
""".split()
+ with patch.object(sys, "argv", train_args):
+ result = run_glue_deebert.main()
+ for value in result.values():
+ self.assertGreaterEqual(value, 0.666)
eval_args = """
run_glue_deebert.py
@@ -65,6 +71,10 @@ def test_glue_deebert(self):
--overwrite_cache
--per_gpu_eval_batch_size=1
""".split()
+ with patch.object(sys, "argv", eval_args):
+ result = run_glue_deebert.main()
+ for value in result.values():
+ self.assertGreaterEqual(value, 0.666)
entropy_eval_args = """
run_glue_deebert.py
@@ -82,18 +92,7 @@ def test_glue_deebert(self):
--overwrite_cache
--per_gpu_eval_batch_size=1
""".split()
-
- with patch.object(sys, "argv", train_args):
- result = run_glue_deebert.main()
- for value in result.values():
- self.assertGreaterEqual(value, 0.75)
-
- with patch.object(sys, "argv", eval_args):
- result = run_glue_deebert.main()
- for value in result.values():
- self.assertGreaterEqual(value, 0.75)
-
with patch.object(sys, "argv", entropy_eval_args):
result = run_glue_deebert.main()
for value in result.values():
- self.assertGreaterEqual(value, 0.75)
+ self.assertGreaterEqual(value, 0.666)
From 640550fc7a1e311915ead1bcca6dacea0c503faf Mon Sep 17 00:00:00 2001
From: Funtowicz Morgan
Date: Wed, 29 Jul 2020 11:02:35 +0200
Subject: [PATCH 048/127] ONNX documentation (#5992)
* Move torchscript and add ONNX documentation under modle_export
Signed-off-by: Morgan Funtowicz
* Let's follow guidelines by the gurus: Renamed torchscript.rst to serialization.rst
Signed-off-by: Morgan Funtowicz
* Remove previously introduced tree element
Signed-off-by: Morgan Funtowicz
* WIP doc
Signed-off-by: Morgan Funtowicz
* ONNX documentation
Signed-off-by: Morgan Funtowicz
* Fix invalid link
Signed-off-by: Morgan Funtowicz
* Improve spelling
Signed-off-by: Morgan Funtowicz
* Final wording pass
Signed-off-by: Morgan Funtowicz
---
docs/source/index.rst | 2 +-
.../{torchscript.rst => serialization.rst} | 51 ++++++++++++++++---
2 files changed, 46 insertions(+), 7 deletions(-)
rename docs/source/{torchscript.rst => serialization.rst} (69%)
diff --git a/docs/source/index.rst b/docs/source/index.rst
index c5eb3283b015..a9e27953ca28 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -157,8 +157,8 @@ conversion utilities for the following models:
notebooks
converting_tensorflow_models
migration
- torchscript
contributing
+ serialization
.. toctree::
:maxdepth: 2
diff --git a/docs/source/torchscript.rst b/docs/source/serialization.rst
similarity index 69%
rename from docs/source/torchscript.rst
rename to docs/source/serialization.rst
index a735b531d119..82180def7750 100644
--- a/docs/source/torchscript.rst
+++ b/docs/source/serialization.rst
@@ -1,5 +1,44 @@
+**********************************************
+Exporting transformers models
+**********************************************
+
+ONNX / ONNXRuntime
+==============================================
+
+Projects ONNX (Open Neural Network eXchange) and ONNXRuntime (ORT) are part of an effort from leading industries in the AI field
+to provide a unified and community-driven format to store and, by extension, efficiently execute neural networks, leveraging a variety
+of hardware and dedicated optimizations.
+
+Starting from transformers v2.10.0 we partnered with ONNX Runtime to provide an easy export of transformers models to
+the ONNX format. You can read more about this effort in our joint blog post `Accelerate your NLP pipelines using
+Hugging Face Transformers and ONNX Runtime `_.
+
+Exporting a model is done through the script `convert_graph_to_onnx.py` at the root of the transformers sources.
+The following command shows how easy it is to export a BERT model from the library; simply run:
+
+.. code-block:: bash
+
+   python convert_graph_to_onnx.py --framework <pt, tf> --model bert-base-cased bert-base-cased.onnx
+
+The conversion tool works for both PyTorch and Tensorflow models and ensures:
+ * The model and its weights are correctly initialized from the Hugging Face model hub or a local checkpoint.
+ * The inputs and outputs are correctly generated to their ONNX counterpart.
+ * The generated model can be correctly loaded through onnxruntime.
+
+.. note::
+ Currently, inputs and outputs are always exported with dynamic sequence axes preventing some optimizations
+ on the ONNX Runtime. If you would like to see such support for fixed-length inputs/outputs, please
+ open up an issue on transformers.
+
+
+Also, the conversion tool supports different options which let you tune the behavior of the generated model:
+ * Change the target opset version of the generated model: More recent opsets generally support more operators and enable faster inference.
+ * Export pipeline-specific prediction heads: Allows exporting the model along with its task-specific prediction head(s).
+ * Use the external data format (PyTorch only): Lets you export models whose size is above 2GB (`More info `_).
+
+
TorchScript
-================================================
+=======================================
.. note::
This is the very beginning of our experiments with TorchScript and we are still exploring its capabilities
@@ -25,7 +64,7 @@ These necessities imply several things developers should be careful about. These
Implications
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+------------------------------------------------
TorchScript flag and tied weights
------------------------------------------------
@@ -62,12 +101,12 @@ It is recommended to be careful of the total number of operations done on each i
when exporting varying sequence-length models.
Using TorchScript in Python
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+-------------------------------------------------
Below are examples of using the Python to save, load models as well as how to use the trace for inference.
Saving a model
-------------------------------------------------
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
This snippet shows how to use TorchScript to export a ``BertModel``. Here the ``BertModel`` is instantiated
according to a ``BertConfig`` class and then saved to disk under the filename ``traced_bert.pt``
@@ -113,7 +152,7 @@ according to a ``BertConfig`` class and then saved to disk under the filename ``
torch.jit.save(traced_model, "traced_bert.pt")
Loading a model
-------------------------------------------------
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
This snippet shows how to load the ``BertModel`` that was previously saved to disk under the name ``traced_bert.pt``.
We are re-using the previously initialised ``dummy_input``.
@@ -126,7 +165,7 @@ We are re-using the previously initialised ``dummy_input``.
all_encoder_layers, pooled_output = loaded_model(dummy_input)
Using a traced model for inference
-------------------------------------------------
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Using the traced model for inference is as simple as using its ``__call__`` dunder method:
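Returning to the ONNX part of the documentation added in this patch: the claim that the generated model "can be correctly loaded through onnxruntime" can be checked with a short sketch like the following (it assumes `onnxruntime` is installed and that `bert-base-cased.onnx` was produced by the example command; the exact input names depend on the exported model):

```python
import numpy as np
from onnxruntime import InferenceSession
from transformers import BertTokenizer

session = InferenceSession("bert-base-cased.onnx")
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

encoded = tokenizer.encode_plus("Checking the exported graph")
# ONNX Runtime expects int64 numpy arrays keyed by the graph's input names.
onnx_inputs = {name: np.array([values], dtype=np.int64) for name, values in encoded.items()}
outputs = session.run(None, onnx_inputs)
print([output.shape for output in outputs])
```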
From 25de74ccfe51ea31a878cc9e98df37e286768c28 Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Date: Wed, 29 Jul 2020 05:20:53 -0400
Subject: [PATCH 049/127] Use FutureWarning to deprecate (#6111)
---
src/transformers/modeling_albert.py | 2 +-
src/transformers/tokenization_utils_base.py | 4 ++--
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/src/transformers/modeling_albert.py b/src/transformers/modeling_albert.py
index f50ce673f276..2f52d1f49854 100644
--- a/src/transformers/modeling_albert.py
+++ b/src/transformers/modeling_albert.py
@@ -823,7 +823,7 @@ def forward(
if "masked_lm_labels" in kwargs:
warnings.warn(
"The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
- DeprecationWarning,
+ FutureWarning,
)
labels = kwargs.pop("masked_lm_labels")
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index 63d1351b0808..6b424606f7ff 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -1466,7 +1466,7 @@ def _get_padding_truncation_strategies(
"use `padding='max_length'` to pad to a max length. In this case, you can give a specific "
"length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the "
"maximal input size of the model (e.g. 512 for Bert).",
- DeprecationWarning,
+ FutureWarning,
)
if max_length is None:
padding_strategy = PaddingStrategy.LONGEST
@@ -1492,7 +1492,7 @@ def _get_padding_truncation_strategies(
"`truncation='only_first'` (will only truncate the first sentence in the pairs) "
"`truncation='only_second'` (will only truncate the second sentence in the pairs) "
"or `truncation='longest_first'` (will iteratively remove tokens from the longest sentence in the pairs).",
- DeprecationWarning,
+ FutureWarning,
)
truncation_strategy = TruncationStrategy(old_truncation_strategy)
elif truncation is not False:
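The reason for preferring `FutureWarning` here is that Python filters `DeprecationWarning` out by default for code that is not in `__main__`, so library users would never see the notice; `FutureWarning` is shown by default. A minimal illustration:

```python
import warnings

warnings.warn(
    "The `masked_lm_labels` argument is deprecated and will be removed in a future "
    "version, use `labels` instead.",
    FutureWarning,  # displayed to end users by default, unlike DeprecationWarning
)
```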
From 6c002853a68906a5b1c2dd2ebb416770f1fc322b Mon Sep 17 00:00:00 2001
From: Funtowicz Morgan
Date: Wed, 29 Jul 2020 13:21:29 +0200
Subject: [PATCH 050/127] Added capability to quantize a model while exporting
through ONNX. (#6089)
* Added capability to quantize a model while exporting through ONNX.
Signed-off-by: Morgan Funtowicz
We do not support multiple extensions
Signed-off-by: Morgan Funtowicz
* Reformat files
Signed-off-by: Morgan Funtowicz
* More quality
Signed-off-by: Morgan Funtowicz
* Ensure test_generate_identified_name compares the same object types
Signed-off-by: Morgan Funtowicz
* Added documentation everywhere on ONNX exporter
Signed-off-by: Morgan Funtowicz
* Use pathlib.Path instead of plain-old string
Signed-off-by: Morgan Funtowicz
* Use f-string everywhere
Signed-off-by: Morgan Funtowicz
* Use the correct parameters for black formatting
Signed-off-by: Morgan Funtowicz
* Use Python 3 super() style.
Signed-off-by: Morgan Funtowicz
* Use packaging.version to ensure installed onnxruntime version match requirements
Signed-off-by: Morgan Funtowicz
* Fixing imports sorting order.
Signed-off-by: Morgan Funtowicz
* Missing raise(s)
Signed-off-by: Morgan Funtowicz
* Added quantization documentation
Signed-off-by: Morgan Funtowicz
* Fix some spelling.
Signed-off-by: Morgan Funtowicz
* Fix bad list header format
Signed-off-by: Morgan Funtowicz
---
docs/source/serialization.rst | 61 +++++-
src/transformers/convert_graph_to_onnx.py | 230 ++++++++++++++++++----
tests/test_onnx.py | 43 +++-
3 files changed, 288 insertions(+), 46 deletions(-)
diff --git a/docs/source/serialization.rst b/docs/source/serialization.rst
index 82180def7750..15a1f3771ec5 100644
--- a/docs/source/serialization.rst
+++ b/docs/source/serialization.rst
@@ -21,9 +21,10 @@ The following command shows how easy it is to export a BERT model from the libra
python convert_graph_to_onnx.py --framework <pt, tf> --model bert-base-cased bert-base-cased.onnx
The conversion tool works for both PyTorch and Tensorflow models and ensures:
- * The model and its weights are correctly initialized from the Hugging Face model hub or a local checkpoint.
- * The inputs and outputs are correctly generated to their ONNX counterpart.
- * The generated model can be correctly loaded through onnxruntime.
+
+* The model and its weights are correctly initialized from the Hugging Face model hub or a local checkpoint.
+* The inputs and outputs are correctly generated to their ONNX counterpart.
+* The generated model can be correctly loaded through onnxruntime.
.. note::
Currently, inputs and outputs are always exported with dynamic sequence axes preventing some optimizations
@@ -32,9 +33,57 @@ The conversion tool works for both PyTorch and Tensorflow models and ensures:
Also, the conversion tool supports different options which let you tune the behavior of the generated model:
- * Change the target opset version of the generated model: More recent opset generally supports more operator and enables faster inference.
- * Export pipeline specific prediction heads: Allow to export model along with its task-specific prediction head(s).
- * Use the external data format (PyTorch only): Lets you export model which size is above 2Gb (`More info `_).
+
+* Change the target opset version of the generated model: A more recent opset generally supports more operators and enables faster inference.
+* Export pipeline-specific prediction heads: Allows exporting the model along with its task-specific prediction head(s).
+* Use the external data format (PyTorch only): Lets you export models whose size is above 2GB (`More info `_).
+
+Quantization
+------------------------------------------------
+
+The ONNX exporter supports generating a quantized version of the model to allow efficient inference.
+
+Quantization works by converting the memory representation of the parameters in the neural network
+to a compact integer format. By default, the weights of a neural network are stored as single-precision floats (`float32`),
+which can express a wide range of floating-point numbers with decent precision.
+These properties are especially interesting during training, where you want a fine-grained representation.
+
+On the other hand, after the training phase, it has been shown that one can greatly reduce the range and the precision of `float32` numbers
+without changing the performance of the neural network.
+
+More technically, `float32` parameters are converted to a type requiring fewer bits to represent each number, thus reducing
+the overall size of the model. Here, we are enabling `float32` mapping to `int8` values (a non-floating, single-byte number representation)
+according to the following formula:
+
+.. math::
+ y_{float32} = scale * x_{int8} - zero\_point
+
+.. note::
+ The quantization process will infer the parameters `scale` and `zero_point` from the neural network parameters.
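
To make the mapping concrete, here is a small NumPy sketch (the weight values are made up, and the min/max calibration shown is only one possible way to pick `scale` and `zero_point`; ONNX Runtime's own calibration may differ) that applies the formula above:

.. code-block:: python

    import numpy as np

    # Made-up float32 parameters to quantize.
    w = np.array([-0.8, -0.1, 0.0, 0.4, 1.2], dtype=np.float32)

    # Pick scale and zero_point so that [w.min(), w.max()] maps onto the int8 range [-128, 127],
    # following y_float32 = scale * x_int8 - zero_point.
    qmin, qmax = -128, 127
    scale = (w.max() - w.min()) / (qmax - qmin)
    zero_point = scale * qmin - w.min()

    # Quantize: x_int8 = round((y_float32 + zero_point) / scale)
    w_int8 = np.clip(np.round((w + zero_point) / scale), qmin, qmax).astype(np.int8)

    # Dequantize with the documented formula; the result approximates the original weights.
    w_restored = scale * w_int8.astype(np.float32) - zero_point
    print(w_int8, w_restored)
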
+
+Leveraging tiny-integers has numerous advantages when it comes to inference:
+
+* Storing fewer bits (instead of the 32 bits used by `float32`) reduces the size of the model and makes it load faster.
+* Integer operations execute an order of magnitude faster on modern hardware.
+* Integer operations require less power to do the computations.
+
+In order to convert a transformers model to ONNX IR with quantized weights, you just need to specify ``--quantize``
+when using ``convert_graph_to_onnx.py``. You can also have a look at the ``quantize()`` utility method in the
+same script file.
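
The same export and quantization can also be driven programmatically. The sketch below (output path chosen arbitrarily) uses the ``convert`` and ``quantize`` helpers from ``convert_graph_to_onnx.py``:

.. code-block:: python

    from pathlib import Path
    from transformers.convert_graph_to_onnx import convert, quantize

    # Export a PyTorch-backed feature-extraction pipeline to ONNX.
    # The parent folder must be empty or not yet exist, otherwise the conversion aborts.
    output = Path("onnx/bert-base-cased.onnx")
    convert(framework="pt", model="bert-base-cased", output=output, opset=11)

    # Quantize the exported graph; the result is written next to it with a "-quantized" suffix.
    quantized_output = quantize(output)
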
+
+Example of quantized BERT model export:
+
+.. code-block:: bash
+
+ python convert_graph_to_onnx.py --framework <pt, tf> --model bert-base-cased --quantize bert-base-cased.onnx
+
+.. note::
+ Quantization support requires ONNX Runtime >= 1.4.0
+
+.. note::
+ When exporting a quantized model, you will end up with two different ONNX files. The one specified at the end of the
+ above command will contain the original ONNX model storing `float32` weights.
+ The second one, with the ``-quantized`` suffix, will hold the quantized parameters.
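
As a quick sanity check, the quantized graph can be loaded and run with ONNX Runtime. The snippet below is a sketch (the file name follows from the export above, and the input text is assumed) rather than part of the converter itself:

.. code-block:: python

    from onnxruntime import InferenceSession, SessionOptions
    from transformers import BertTokenizerFast

    # Tokenize a sample sentence into NumPy arrays matching the exported input names.
    tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")
    inputs = tokenizer("Using quantized BERT with ONNX Runtime", return_tensors="np")

    # Load the quantized graph and run it on CPU.
    session = InferenceSession("bert-base-cased-quantized.onnx", SessionOptions(), providers=["CPUExecutionProvider"])
    outputs = session.run(None, dict(inputs))
    print(outputs[0].shape)  # (batch, sequence, hidden_size) for the feature-extraction export
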
TorchScript
diff --git a/src/transformers/convert_graph_to_onnx.py b/src/transformers/convert_graph_to_onnx.py
index 8c85b76e4f65..72082ab0b5fb 100644
--- a/src/transformers/convert_graph_to_onnx.py
+++ b/src/transformers/convert_graph_to_onnx.py
@@ -1,14 +1,21 @@
from argparse import ArgumentParser
from os import listdir, makedirs
-from os.path import abspath, dirname, exists
+from pathlib import Path
from typing import Dict, List, Optional, Tuple
+from packaging.version import parse
+
from transformers import is_tf_available, is_torch_available
from transformers.file_utils import ModelOutput
from transformers.pipelines import Pipeline, pipeline
from transformers.tokenization_utils import BatchEncoding
+# This is the minimal required version to
+# support some ONNX Runtime features
+ORT_QUANTIZE_MINIMUM_VERSION = parse("1.4.0")
+
+
SUPPORTED_PIPELINES = [
"feature-extraction",
"ner",
@@ -28,18 +35,71 @@ class OnnxConverterArgumentParser(ArgumentParser):
"""
def __init__(self):
- super(OnnxConverterArgumentParser, self).__init__("ONNX Converter")
+ super().__init__("ONNX Converter")
- self.add_argument("--pipeline", type=str, choices=SUPPORTED_PIPELINES, default="feature-extraction")
- self.add_argument("--model", type=str, required=True, help="Model's id or path (ex: bert-base-cased)")
+ self.add_argument(
+ "--pipeline", type=str, choices=SUPPORTED_PIPELINES, default="feature-extraction",
+ )
+ self.add_argument(
+ "--model", type=str, required=True, help="Model's id or path (ex: bert-base-cased)",
+ )
self.add_argument("--tokenizer", type=str, help="Tokenizer's id or path (ex: bert-base-cased)")
- self.add_argument("--framework", type=str, choices=["pt", "tf"], help="Framework for loading the model")
+ self.add_argument(
+ "--framework", type=str, choices=["pt", "tf"], help="Framework for loading the model",
+ )
self.add_argument("--opset", type=int, default=11, help="ONNX opset to use")
- self.add_argument("--check-loading", action="store_true", help="Check ONNX is able to load the model")
- self.add_argument("--use-external-format", action="store_true", help="Allow exporting model >= than 2Gb")
+ self.add_argument(
+ "--check-loading", action="store_true", help="Check ONNX is able to load the model",
+ )
+ self.add_argument(
+ "--use-external-format", action="store_true", help="Allow exporting model >= than 2Gb",
+ )
+ self.add_argument(
+ "--quantize", action="store_true", help="Quantize the neural network to be run with int8",
+ )
self.add_argument("output")
+def generate_identified_filename(filename: Path, identifier: str) -> Path:
+ """
+ Append a string identifier at the end (before the extension, if any) of the provided filepath.
+ Args:
+ filename: The actual pathlib.Path object to which the identifier suffix should be added
+ identifier: The suffix to add
+
+ Returns: Path with the identifier appended to the file stem, keeping the original extension
+ """
+ return filename.parent.joinpath(filename.stem + identifier).with_suffix(filename.suffix)
+
+
+def ensure_onnxruntime_installed():
+ """
+ Check that onnxruntime is installed and that the installed version is recent enough.
+ Raises:
+ ImportError: If onnxruntime is not installed or the installed version is too old
+ """
+ try:
+ import onnxruntime
+
+ # Parse the version of the installed onnxruntime
+ ort_version = parse(onnxruntime.__version__)
+
+ # We require 1.4.0 minimum
+ if ort_version < ORT_QUANTIZE_MINIMUM_VERSION:
+ raise ImportError(
+ f"We found an older version of onnxruntime ({onnxruntime.__version__}) "
+ f"but we require onnxruntime to be >= 1.4.0 to enable all the conversions options.\n"
+ f"Please update onnxruntime by running `pip install --upgrade onnxruntime`"
+ )
+
+ except ImportError:
+ raise ImportError(
+ "onnxruntime doesn't seem to be currently installed. "
+ "Please install the onnxruntime by running `pip install onnxruntime`"
+ " and relaunch the conversion."
+ )
+
+
def ensure_valid_input(model, tokens, input_names):
"""
Ensure input are presented in the correct order, without any None
@@ -60,7 +120,7 @@ def ensure_valid_input(model, tokens, input_names):
ordered_input_names.append(arg_name)
model_args.append(tokens[arg_name])
else:
- print("{} is not present in the generated input list.".format(arg_name))
+ print(f"{arg_name} is not present in the generated input list.")
break
print("Generated inputs order: {}".format(ordered_input_names))
@@ -68,6 +128,19 @@ def ensure_valid_input(model, tokens, input_names):
def infer_shapes(nlp: Pipeline, framework: str) -> Tuple[List[str], List[str], Dict, BatchEncoding]:
+ """
+ Attempt to infer the static vs. dynamic axes for each input and output tensor of a specific model.
+ Args:
+ nlp: The pipeline object holding the model to be exported
+ framework: The framework identifier to dispatch to the correct inference scheme (pt/tf)
+
+ Returns:
+ - List of the inferred input variable names
+ - List of the inferred output variable names
+ - Dictionary with input/output variables names as key and shape tensor as value
+ - a BatchEncoding reference which was used to infer all the above information
+ """
+
def build_shape_dict(name: str, tensor, is_input: bool, seq_len: int):
if isinstance(tensor, (tuple, list)):
return [build_shape_dict(name, t, is_input, seq_len) for t in tensor]
@@ -79,12 +152,12 @@ def build_shape_dict(name: str, tensor, is_input: bool, seq_len: int):
if len(tensor.shape) == 2:
axes[1] = "sequence"
else:
- raise ValueError("Unable to infer tensor axes ({})".format(len(tensor.shape)))
+ raise ValueError(f"Unable to infer tensor axes ({len(tensor.shape)})")
else:
seq_axes = [dim for dim, shape in enumerate(tensor.shape) if shape == seq_len]
axes.update({dim: "sequence" for dim in seq_axes})
- print("Found {} {} with shape: {}".format("input" if is_input else "output", name, axes))
+ print(f"Found {'input' if is_input else 'output'} {name} with shape: {axes}")
return axes
tokens = nlp.tokenizer("This is a sample output", return_tensors=framework)
@@ -108,7 +181,7 @@ def build_shape_dict(name: str, tensor, is_input: bool, seq_len: int):
outputs_flat.append(output)
# Generate output names & axes
- output_names = ["output_{}".format(i) for i in range(len(outputs_flat))]
+ output_names = [f"output_{i}" for i in range(len(outputs_flat))]
output_dynamic_axes = {k: build_shape_dict(k, v, False, seq_len) for k, v in zip(output_names, outputs_flat)}
# Create the aggregated axes representation
@@ -117,6 +190,17 @@ def build_shape_dict(name: str, tensor, is_input: bool, seq_len: int):
def load_graph_from_args(pipeline_name: str, framework: str, model: str, tokenizer: Optional[str] = None) -> Pipeline:
+ """
+ Convert the set of arguments provided through the CLI to an actual pipeline reference (tokenizer + model)
+ Args:
+ pipeline_name: The kind of pipeline to use (ner, question-answering, etc.)
+ framework: The framework to convert the pipeline from ("pt" or "tf")
+ model: The model name which will be loaded by the pipeline
+ tokenizer: The tokenizer name which will be loaded by the pipeline, defaults to the model's value
+
+ Returns: Pipeline object
+
+ """
# If no tokenizer provided
if tokenizer is None:
tokenizer = model
@@ -127,20 +211,31 @@ def load_graph_from_args(pipeline_name: str, framework: str, model: str, tokeniz
if framework == "tf" and not is_tf_available():
raise Exception("Cannot convert because TF is not installed. Please install tensorflow first.")
- print("Loading pipeline (model: {}, tokenizer: {})".format(model, tokenizer))
+ print(f"Loading pipeline (model: {model}, tokenizer: {tokenizer})")
# Allocate tokenizer and model
return pipeline(pipeline_name, model=model, tokenizer=tokenizer, framework=framework)
-def convert_pytorch(nlp: Pipeline, opset: int, output: str, use_external_format: bool):
+def convert_pytorch(nlp: Pipeline, opset: int, output: Path, use_external_format: bool):
+ """
+ Export a PyTorch backed pipeline to ONNX Intermediate Representation (IR)
+ Args:
+ nlp: The pipeline to be exported
+ opset: The actual version of the ONNX operator set to use
+ output: Path where the generated ONNX model will be stored
+ use_external_format: Split the model definition from its parameters to allow models bigger than 2GB
+
+ Returns:
+
+ """
if not is_torch_available():
raise Exception("Cannot convert because PyTorch is not installed. Please install torch first.")
import torch
from torch.onnx import export
- print("Using framework PyTorch: {}".format(torch.__version__))
+ print(f"Using framework PyTorch: {torch.__version__}")
with torch.no_grad():
input_names, output_names, dynamic_axes, tokens = infer_shapes(nlp, "pt")
@@ -149,7 +244,7 @@ def convert_pytorch(nlp: Pipeline, opset: int, output: str, use_external_format:
export(
nlp.model,
model_args,
- f=output,
+ f=output.as_posix(),
input_names=ordered_input_names,
output_names=output_names,
dynamic_axes=dynamic_axes,
@@ -160,7 +255,17 @@ def convert_pytorch(nlp: Pipeline, opset: int, output: str, use_external_format:
)
-def convert_tensorflow(nlp: Pipeline, opset: int, output: str):
+def convert_tensorflow(nlp: Pipeline, opset: int, output: Path):
+ """
+ Export a TensorFlow backed pipeline to ONNX Intermediate Representation (IR)
+ Args:
+ nlp: The pipeline to be exported
+ opset: The actual version of the ONNX operator set to use
+ output: Path where the generated ONNX model will be stored
+
+ Notes: TensorFlow cannot export models bigger than 2GB due to an internal constraint in TensorFlow
+
+ """
if not is_tf_available():
raise Exception("Cannot convert because TF is not installed. Please install tensorflow first.")
@@ -170,7 +275,7 @@ def convert_tensorflow(nlp: Pipeline, opset: int, output: str):
import tensorflow as tf
from keras2onnx import convert_keras, save_model, __version__ as k2ov
- print("Using framework TensorFlow: {}, keras2onnx: {}".format(tf.version.VERSION, k2ov))
+ print(f"Using framework TensorFlow: {tf.version.VERSION}, keras2onnx: {k2ov}")
# Build
input_names, output_names, dynamic_axes, tokens = infer_shapes(nlp, "tf")
@@ -178,34 +283,45 @@ def convert_tensorflow(nlp: Pipeline, opset: int, output: str):
# Forward
nlp.model.predict(tokens.data)
onnx_model = convert_keras(nlp.model, nlp.model.name, target_opset=opset)
- save_model(onnx_model, output)
+ save_model(onnx_model, output.as_posix())
except ImportError as e:
- raise Exception(
- "Cannot import {} required to convert TF model to ONNX. Please install {} first.".format(e.name, e.name)
- )
+ raise Exception(f"Cannot import {e.name} required to convert TF model to ONNX. Please install {e.name} first.")
def convert(
framework: str,
model: str,
- output: str,
+ output: Path,
opset: int,
tokenizer: Optional[str] = None,
use_external_format: bool = False,
pipeline_name: str = "feature-extraction",
):
- print("ONNX opset version set to: {}".format(opset))
+ """
+ Convert the pipeline object to the ONNX Intermediate Representation (IR) format.
+ Args:
+ framework: The framework the pipeline is backed by ("pt" or "tf")
+ model: The name of the model to load for the pipeline
+ output: The path where the ONNX graph will be stored
+ opset: The actual version of the ONNX operator set to use
+ tokenizer: The name of the tokenizer to load for the pipeline, defaults to the model's name if not provided
+ use_external_format: Split the model definition from its parameters to allow models bigger than 2GB (PyTorch only)
+ pipeline_name: The kind of pipeline to instantiate (ner, question-answering, etc.)
+
+ Returns:
+
+ """
+ print(f"ONNX opset version set to: {opset}")
# Load the pipeline
nlp = load_graph_from_args(pipeline_name, framework, model, tokenizer)
- parent = dirname(output)
- if not exists(parent):
- print("Creating folder {}".format(parent))
- makedirs(parent)
- elif len(listdir(parent)) > 0:
- raise Exception("Folder {} is not empty, aborting conversion".format(parent))
+ if not output.parent.exists():
+ print(f"Creating folder {output.parent}")
+ makedirs(output.parent.as_posix())
+ elif len(listdir(output.parent.as_posix())) > 0:
+ raise Exception(f"Folder {output.parent.as_posix()} is not empty, aborting conversion")
# Export the graph
if framework == "pt":
@@ -214,17 +330,52 @@ def convert(
convert_tensorflow(nlp, opset, output)
-def verify(path: str):
+def quantize(onnx_model_path: Path) -> Path:
+ """
+ Quantize the weights of the model from float32 to int8 to allow very efficient inference on modern CPUs.
+ Args:
+ onnx_model_path: Path to the location where the exported ONNX model is stored
+
+ Returns: The Path generated for the quantized model
+ """
+
+ try:
+ ensure_onnxruntime_installed()
+ import onnx
+ from onnxruntime import __version__ as ort_version
+ from onnxruntime.quantization import quantize, QuantizationMode
+
+ print(f"Found ONNX: {onnx.__version__}")
+ print(f"Found ONNXRuntime: {ort_version}")
+
+ onnx_model = onnx.load(onnx_model_path.as_posix())
+ quantized_model = quantize(
+ model=onnx_model, quantization_mode=QuantizationMode.IntegerOps, force_fusions=True, symmetric_weight=True,
+ )
+
+ # Append "-quantized" at the end of the model's name
+ quantized_model_path = generate_identified_filename(onnx_model_path, "-quantized")
+
+ # Save model
+ print(f"Storing quantized model at {quantized_model_path}")
+ onnx.save(quantized_model, quantized_model_path.as_posix())
+
+ return quantized_model_path
+ except ImportError as ie:
+ print(f"Error while quantizing the model:\n{str(ie)}")
+
+
+def verify(path: Path):
from onnxruntime import InferenceSession, SessionOptions
from onnxruntime.capi.onnxruntime_pybind11_state import RuntimeException
- print("Checking ONNX model loading from: {}".format(path))
+ print(f"Checking ONNX model loading from: {path}")
try:
onnx_options = SessionOptions()
- _ = InferenceSession(path, onnx_options, providers=["CPUExecutionProvider"])
- print("Model correctly loaded")
+ _ = InferenceSession(path.as_posix(), onnx_options, providers=["CPUExecutionProvider"])
+ print(f"Model {path} correctly loaded: \N{heavy check mark}")
except RuntimeException as re:
- print("Error while loading the model: {}".format(re))
+ print(f"Error while loading the model {re}: \N{heavy ballot x}")
if __name__ == "__main__":
@@ -232,7 +383,7 @@ def verify(path: str):
args = parser.parse_args()
# Make sure output is absolute path
- args.output = abspath(args.output)
+ args.output = Path(args.output).absolute()
try:
# Convert
@@ -246,9 +397,16 @@ def verify(path: str):
args.pipeline,
)
+ if args.quantize:
+ args.quantized_output = quantize(args.output)
+
# And verify
if args.check_loading:
verify(args.output)
+
+ if hasattr(args, "quantized_output"):
+ verify(args.quantized_output)
+
except Exception as e:
- print("Error while converting the model: {}".format(e))
+ print(f"Error while converting the model: {e}")
exit(1)
diff --git a/tests/test_onnx.py b/tests/test_onnx.py
index e9c0c6b48a59..d397a149c4b6 100644
--- a/tests/test_onnx.py
+++ b/tests/test_onnx.py
@@ -1,10 +1,17 @@
import unittest
from os.path import dirname, exists
+from pathlib import Path
from shutil import rmtree
from tempfile import NamedTemporaryFile, TemporaryDirectory
from transformers import BertConfig, BertTokenizerFast, FeatureExtractionPipeline
-from transformers.convert_graph_to_onnx import convert, ensure_valid_input, infer_shapes
+from transformers.convert_graph_to_onnx import (
+ convert,
+ ensure_valid_input,
+ generate_identified_filename,
+ infer_shapes,
+ quantize,
+)
from transformers.testing_utils import require_tf, require_torch, slow
@@ -25,13 +32,13 @@ class OnnxExportTestCase(unittest.TestCase):
@slow
def test_export_tensorflow(self):
for model in OnnxExportTestCase.MODEL_TO_TEST:
- self._test_export(model, "tf", 11)
+ self._test_export(model, "tf", 12)
@require_torch
@slow
def test_export_pytorch(self):
for model in OnnxExportTestCase.MODEL_TO_TEST:
- self._test_export(model, "pt", 11)
+ self._test_export(model, "pt", 12)
@require_torch
@slow
@@ -47,7 +54,29 @@ def test_export_custom_bert_model(self):
with TemporaryDirectory() as bert_save_dir:
model = BertModel(BertConfig(vocab_size=len(vocab)))
model.save_pretrained(bert_save_dir)
- self._test_export(bert_save_dir, "pt", 11, tokenizer)
+ self._test_export(bert_save_dir, "pt", 12, tokenizer)
+
+ @require_tf
+ @slow
+ def test_quantize_tf(self):
+ for model in OnnxExportTestCase.MODEL_TO_TEST:
+ path = self._test_export(model, "tf", 12)
+ quantized_path = quantize(Path(path))
+
+ # Ensure the actual quantized model is not bigger than the original one
+ if quantized_path.stat().st_size >= Path(path).stat().st_size:
+ self.fail("Quantized model is bigger than initial ONNX model")
+
+ @require_torch
+ @slow
+ def test_quantize_pytorch(self):
+ for model in OnnxExportTestCase.MODEL_TO_TEST:
+ path = self._test_export(model, "pt", 12)
+ quantized_path = quantize(Path(path))
+
+ # Ensure the actual quantized model is not bigger than the original one
+ if quantized_path.stat().st_size >= Path(path).stat().st_size:
+ self.fail("Quantized model is bigger than initial ONNX model")
def _test_export(self, model, framework, opset, tokenizer=None):
try:
@@ -61,6 +90,8 @@ def _test_export(self, model, framework, opset, tokenizer=None):
# Export
convert(framework, model, path, opset, tokenizer)
+
+ return path
except Exception as e:
self.fail(e)
@@ -138,3 +169,7 @@ def test_ensure_valid_input(self):
# Should have only "input_ids"
self.assertEqual(inputs_args[0], tokens["input_ids"])
self.assertEqual(ordered_input_names[0], "input_ids")
+
+ def test_generate_identified_name(self):
+ generated = generate_identified_filename(Path("/home/something/my_fake_model.onnx"), "-test")
+ self.assertEqual("/home/something/my_fake_model-test.onnx", generated.as_posix())
From 8d157c930bb535190e0a5d322ec85b9124265478 Mon Sep 17 00:00:00 2001
From: Timo Moeller
Date: Wed, 29 Jul 2020 17:34:16 +0200
Subject: [PATCH 051/127] add deepset/xlm-roberta-large-squad2 model card
(#6128)
* Add xlm-r QA model card
* Add tags
---
.../xlm-roberta-large-squad2/README.md | 133 ++++++++++++++++++
1 file changed, 133 insertions(+)
create mode 100644 model_cards/deepset/xlm-roberta-large-squad2/README.md
diff --git a/model_cards/deepset/xlm-roberta-large-squad2/README.md b/model_cards/deepset/xlm-roberta-large-squad2/README.md
new file mode 100644
index 000000000000..db75ef4b5879
--- /dev/null
+++ b/model_cards/deepset/xlm-roberta-large-squad2/README.md
@@ -0,0 +1,133 @@
+---
+language: multilingual
+tags:
+- question-answering
+datasets:
+- squad_v2
+---
+
+# Multilingual XLM-RoBERTa large for QA on various languages
+
+## Overview
+**Language model:** xlm-roberta-large
+**Language:** Multilingual
+**Downstream-task:** Extractive QA
+**Training data:** SQuAD 2.0
+**Eval data:** SQuAD dev set - German MLQA - German XQuAD
+**Training run:** [MLFlow link](https://public-mlflow.deepset.ai/#/experiments/124/runs/3a540e3f3ecf4dd98eae8fc6d457ff20)
+**Infrastructure**: 4x Tesla v100
+
+## Hyperparameters
+
+```
+batch_size = 32
+n_epochs = 3
+base_LM_model = "xlm-roberta-large"
+max_seq_len = 256
+learning_rate = 1e-5
+lr_schedule = LinearWarmup
+warmup_proportion = 0.2
+doc_stride=128
+max_query_length=64
+```
+
+## Performance
+Evaluated on the SQuAD 2.0 English dev set with the [official eval script](https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/).
+```
+ "exact": 79.45759285774446,
+ "f1": 83.79259828925511,
+ "total": 11873,
+ "HasAns_exact": 71.96356275303644,
+ "HasAns_f1": 80.6460053117963,
+ "HasAns_total": 5928,
+ "NoAns_exact": 86.93019343986543,
+ "NoAns_f1": 86.93019343986543,
+ "NoAns_total": 5945
+```
+
+Evaluated on German [MLQA: test-context-de-question-de.json](https://github.com/facebookresearch/MLQA)
+```
+"exact": 49.34691166703564,
+"f1": 66.15582561674236,
+"total": 4517,
+```
+
+Evaluated on German [XQuAD: xquad.de.json](https://github.com/deepmind/xquad)
+```
+"exact": 61.51260504201681,
+"f1": 78.80206098332569,
+"total": 1190,
+```
+
+## Usage
+
+### In Transformers
+```python
+from transformers.pipelines import pipeline
+from transformers.modeling_auto import AutoModelForQuestionAnswering
+from transformers.tokenization_auto import AutoTokenizer
+
+model_name = "deepset/xlm-roberta-large-squad2"
+
+# a) Get predictions
+nlp = pipeline('question-answering', model=model_name, tokenizer=model_name)
+QA_input = {
+ 'question': 'Why is model conversion important?',
+ 'context': 'The option to convert models between FARM and transformers gives freedom to the user and let people easily switch between frameworks.'
+}
+res = nlp(QA_input)
+
+# b) Load model & tokenizer
+model = AutoModelForQuestionAnswering.from_pretrained(model_name)
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+```
+
+### In FARM
+
+```python
+from farm.modeling.adaptive_model import AdaptiveModel
+from farm.modeling.tokenization import Tokenizer
+from farm.infer import QAInferencer
+
+model_name = "deepset/xlm-roberta-large-squad2"
+
+# a) Get predictions
+nlp = QAInferencer.load(model_name)
+QA_input = [{"questions": ["Why is model conversion important?"],
+ "text": "The option to convert models between FARM and transformers gives freedom to the user and let people easily switch between frameworks."}]
+res = nlp.inference_from_dicts(dicts=QA_input, rest_api_schema=True)
+
+# b) Load model & tokenizer
+model = AdaptiveModel.convert_from_transformers(model_name, device="cpu", task_type="question_answering")
+tokenizer = Tokenizer.load(model_name)
+```
+
+### In haystack
+For doing QA at scale (i.e. many docs instead of a single paragraph), you can also load the model in [haystack](https://github.com/deepset-ai/haystack/):
+```python
+reader = FARMReader(model_name_or_path="deepset/xlm-roberta-large-squad2")
+# or
+reader = TransformersReader(model="deepset/xlm-roberta-large-squad2",tokenizer="deepset/xlm-roberta-large-squad2")
+```
+
+
+## Authors
+Branden Chan: `branden.chan [at] deepset.ai`
+Timo Möller: `timo.moeller [at] deepset.ai`
+Malte Pietsch: `malte.pietsch [at] deepset.ai`
+Tanay Soni: `tanay.soni [at] deepset.ai`
+
+## About us
+
+
+We bring NLP to the industry via open source!
+Our focus: Industry specific language models & large scale QA systems.
+
+Some of our work:
+- [German BERT (aka "bert-base-german-cased")](https://deepset.ai/german-bert)
+- [FARM](https://github.com/deepset-ai/FARM)
+- [Haystack](https://github.com/deepset-ai/haystack/)
+
+Get in touch:
+[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Website](https://deepset.ai)
+
From 641b873c1341f553b40fd82c990b80884b585f0b Mon Sep 17 00:00:00 2001
From: Lysandre Debut
Date: Wed, 29 Jul 2020 11:38:15 -0400
Subject: [PATCH 052/127] XLNet PLM Readme (#6121)
---
examples/language-modeling/README.md | 24 ++++++++++++++++++++++++
1 file changed, 24 insertions(+)
diff --git a/examples/language-modeling/README.md b/examples/language-modeling/README.md
index 514c51f8d11f..a66215351a7d 100644
--- a/examples/language-modeling/README.md
+++ b/examples/language-modeling/README.md
@@ -60,3 +60,27 @@ python run_language_modeling.py \
--mlm
```
+### XLNet and permutation language modeling
+
+XLNet uses a different training objective, which is permutation language modeling. It is an autoregressive method
+to learn bidirectional contexts by maximizing the expected likelihood over all permutations of the input
+sequence factorization order.
+
+We use the `--plm_probability` flag to define the ratio of the length of a span of masked tokens to the surrounding
+context length for permutation language modeling.
+
+The `--max_span_length` flag may also be used to limit the length of a span of masked tokens used
+for permutation language modeling.
+
+```bash
+export TRAIN_FILE=/path/to/dataset/wiki.train.raw
+export TEST_FILE=/path/to/dataset/wiki.test.raw
+
+python run_language_modeling.py \
+ --output_dir=output \
+ --model_name_or_path=xlnet-base-cased \
+ --do_train \
+ --train_data_file=$TRAIN_FILE \
+ --do_eval \
+ --eval_data_file=$TEST_FILE
+```
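
To see how these two flags are consumed under the hood, here is a small sketch (the checkpoint name, example sentence, and the even-length padding step are illustrative assumptions) of the `DataCollatorForPermutationLanguageModeling` that the script uses for XLNet:

```python
import torch
from transformers import XLNetTokenizer, DataCollatorForPermutationLanguageModeling

tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")

# --plm_probability and --max_span_length map onto the collator's arguments of the same name.
collator = DataCollatorForPermutationLanguageModeling(
    tokenizer=tokenizer,
    plm_probability=1 / 6,  # ratio of masked-span length to surrounding context length
    max_span_length=5,      # upper bound on the length of each masked span
)

# The collator expects token-id tensors with an even sequence length.
ids = tokenizer.encode("A tiny example sentence for permutation language modeling.")
if len(ids) % 2:
    ids.append(tokenizer.pad_token_id)

batch = collator([torch.tensor(ids)])
print(batch.keys())  # typically: input_ids, perm_mask, target_mapping, labels
```
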
From fc64559c4583db4e38ce50a976c8d935b124cf67 Mon Sep 17 00:00:00 2001
From: Julien Plu
Date: Wed, 29 Jul 2020 18:20:00 +0200
Subject: [PATCH 053/127] Fix TF CTRL model naming (#6134)
---
setup.py | 8 ++++----
src/transformers/modeling_tf_ctrl.py | 19 +++++++++++++------
2 files changed, 17 insertions(+), 10 deletions(-)
diff --git a/setup.py b/setup.py
index 2ea0954b425f..c4fc91ab3170 100644
--- a/setup.py
+++ b/setup.py
@@ -70,14 +70,14 @@
# keras2onnx and onnxconverter-common version is specific through a commit until 1.7.0 lands on pypi
extras["tf"] = [
- "tensorflow<=2.2",
+ "tensorflow",
# "onnxconverter-common",
# "keras2onnx"
"onnxconverter-common @ git+git://github.com/microsoft/onnxconverter-common.git@f64ca15989b6dc95a1f3507ff6e4c395ba12dff5#egg=onnxconverter-common",
"keras2onnx @ git+git://github.com/onnx/keras-onnx.git@cbdc75cb950b16db7f0a67be96a278f8d2953b48#egg=keras2onnx"
]
extras["tf-cpu"] = [
- "tensorflow-cpu<=2.2",
+ "tensorflow-cpu",
# "onnxconverter-common",
# "keras2onnx"
"onnxconverter-common @ git+git://github.com/microsoft/onnxconverter-common.git@f64ca15989b6dc95a1f3507ff6e4c395ba12dff5#egg=onnxconverter-common",
@@ -86,7 +86,7 @@
extras["torch"] = ["torch"]
extras["serving"] = ["pydantic", "uvicorn", "fastapi", "starlette"]
-extras["all"] = extras["serving"] + ["tensorflow<=2.2", "torch"]
+extras["all"] = extras["serving"] + ["tensorflow", "torch"]
extras["testing"] = ["pytest", "pytest-xdist", "timeout-decorator", "psutil"]
# sphinx-rtd-theme==0.5.0 introduced big changes in the style.
@@ -97,7 +97,7 @@
"isort @ git+git://github.com/timothycrosley/isort.git@e63ae06ec7d70b06df9e528357650281a3d3ec22#egg=isort",
"flake8",
]
-extras["dev"] = extras["testing"] + extras["quality"] + ["mecab-python3<1", "scikit-learn", "tensorflow<=2.2", "torch"]
+extras["dev"] = extras["testing"] + extras["quality"] + ["mecab-python3<1", "scikit-learn", "tensorflow", "torch"]
setup(
name="transformers",
diff --git a/src/transformers/modeling_tf_ctrl.py b/src/transformers/modeling_tf_ctrl.py
index dc20cf74ba34..033e100d7233 100644
--- a/src/transformers/modeling_tf_ctrl.py
+++ b/src/transformers/modeling_tf_ctrl.py
@@ -141,11 +141,18 @@ def call(self, inputs, training=False):
return outputs
-def point_wise_feed_forward_network(d_model_size, dff, name=""):
- return tf.keras.Sequential(
- [tf.keras.layers.Dense(dff, activation="relu", name="0"), tf.keras.layers.Dense(d_model_size, name="2")],
- name="ffn",
- )
+class TFPointWiseFeedForwardLayer(tf.keras.layers.Layer):
+ def __init__(self, d_model_size, dff, **kwargs):
+ super().__init__(**kwargs)
+
+ self.dense_0 = tf.keras.layers.Dense(dff, activation="relu", name="0")
+ self.dense_2 = tf.keras.layers.Dense(d_model_size, name="2")
+
+ def call(self, inputs, trainable=False):
+ dense_0_output = self.dense_0(inputs)
+ dense_2_output = self.dense_2(dense_0_output)
+
+ return dense_2_output
class TFEncoderLayer(tf.keras.layers.Layer):
@@ -153,7 +160,7 @@ def __init__(self, d_model_size, num_heads, dff, rate=0.1, layer_norm_epsilon=1e
super().__init__(**kwargs)
self.multi_head_attention = TFMultiHeadAttention(d_model_size, num_heads, name="multi_head_attention")
- self.ffn = point_wise_feed_forward_network(d_model_size, dff, name="ffn")
+ self.ffn = TFPointWiseFeedForwardLayer(d_model_size, dff, name="ffn")
self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layernorm1")
self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layernorm2")
From 8a8ae27617e3c4dafb34bcbbaadf4ceee28583bd Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Date: Wed, 29 Jul 2020 12:28:12 -0400
Subject: [PATCH 054/127] Use google style to document properties (#6130)
* Use google style to document properties
* Update src/transformers/configuration_utils.py
Co-authored-by: Lysandre Debut
Co-authored-by: Lysandre Debut
---
src/transformers/configuration_utils.py | 8 +++++++-
src/transformers/modeling_tf_utils.py | 5 +----
src/transformers/modeling_utils.py | 22 +++++++++-------------
3 files changed, 17 insertions(+), 18 deletions(-)
diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py
index 40efef2b3ab3..c8dd3572aeba 100644
--- a/src/transformers/configuration_utils.py
+++ b/src/transformers/configuration_utils.py
@@ -194,12 +194,18 @@ def __init__(self, **kwargs):
raise err
@property
- def use_return_tuple(self):
+ def use_return_tuple(self) -> bool:
+ """
+ :obj:`bool`: Whether or not the model should return a tuple.
+ """
# If torchscript is set, force return_tuple to avoid jit errors
return self.return_tuple or self.torchscript
@property
def num_labels(self) -> int:
+ """
+ :obj:`int`: The number of labels for classification models.
+ """
return len(self.id2label)
@num_labels.setter
diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py
index b2ff04741d90..10b355f22a9d 100644
--- a/src/transformers/modeling_tf_utils.py
+++ b/src/transformers/modeling_tf_utils.py
@@ -238,10 +238,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin):
@property
def dummy_inputs(self) -> Dict[str, tf.Tensor]:
"""
- Dummy inputs to build the network.
-
- Returns:
- :obj:`Dict[str, tf.Tensor]`: The dummy inputs.
+ :obj:`Dict[str, tf.Tensor]`: Dummy inputs to build the network.
"""
return {"input_ids": tf.constant(DUMMY_INPUTS)}
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index bd33f7a7a357..7296ba4ac4d7 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -157,10 +157,8 @@ def reset_memory_hooks_state(self):
@property
def device(self) -> device:
"""
- The device on which the module is (assuming that all the module parameters are on the same device).
-
- Returns:
- :obj:`torch.device` The device of the module.
+ :obj:`torch.device`: The device on which the module is (assuming that all the module parameters are on the same
+ device).
"""
try:
return next(self.parameters()).device
@@ -178,10 +176,7 @@ def find_tensor_attributes(module: nn.Module) -> List[Tuple[str, Tensor]]:
@property
def dtype(self) -> dtype:
"""
- The dtype of the module (assuming that all the module parameters have the same dtype).
-
- Returns:
- :obj:`torch.dtype` The dtype of the module.
+ :obj:`torch.dtype`: The dtype of the module (assuming that all the module parameters have the same dtype).
"""
try:
return next(self.parameters()).dtype
@@ -350,10 +345,8 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
@property
def dummy_inputs(self) -> Dict[str, torch.Tensor]:
- """ Dummy inputs to do a forward pass in the network.
-
- Returns:
- :obj:`Dict[str, torch.Tensor]`: The dummy inputs.
+ """
+ :obj:`Dict[str, torch.Tensor]`: Dummy inputs to do a forward pass in the network.
"""
return {"input_ids": torch.tensor(DUMMY_INPUTS)}
@@ -371,7 +364,10 @@ def __init__(self, config: PretrainedConfig, *inputs, **kwargs):
self.config = config
@property
- def base_model(self):
+ def base_model(self) -> nn.Module:
+ """
+ :obj:`torch.nn.Module`: The main body of the model.
+ """
return getattr(self, self.base_model_prefix, self)
def get_input_embeddings(self) -> nn.Module:
From 3f94170a1048bbcff77b222a708470e482fdaff8 Mon Sep 17 00:00:00 2001
From: Lysandre Debut
Date: Wed, 29 Jul 2020 14:26:26 -0400
Subject: [PATCH 055/127] =?UTF-8?q?[WIP]=20Test=20TF=20Flaubert=20+=20Add?=
=?UTF-8?q?=20{XLM,=20Flaubert}{TokenClassification,=20MultipleC=E2=80=A6?=
=?UTF-8?q?=20(#5614)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
* Test TF Flaubert + Add {XLM, Flaubert}{TokenClassification, MultipleChoice} models and tests
* AutoModels
Tiny tweaks
* Style
* Final changes before merge
* Re-order for simpler review
* Final fixes
* Addressing @sgugger's comments
* Test MultipleChoice
---
src/transformers/__init__.py | 3 +
src/transformers/modeling_auto.py | 5 +
src/transformers/modeling_flaubert.py | 20 ++
src/transformers/modeling_tf_flaubert.py | 36 ++-
src/transformers/modeling_tf_xlm.py | 30 +-
src/transformers/modeling_xlm.py | 104 +++++++
tests/test_modeling_common.py | 2 +-
tests/test_modeling_flaubert.py | 47 ++++
tests/test_modeling_tf_common.py | 4 +-
tests/test_modeling_tf_flaubert.py | 331 ++++++++++++++++++++++-
tests/test_modeling_tf_xlm.py | 41 ++-
tests/test_modeling_xlm.py | 50 +++-
12 files changed, 652 insertions(+), 21 deletions(-)
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index e17b97240da7..a0fc396e5114 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -278,6 +278,7 @@
XLMForTokenClassification,
XLMForQuestionAnswering,
XLMForQuestionAnsweringSimple,
+ XLMForMultipleChoice,
XLM_PRETRAINED_MODEL_ARCHIVE_LIST,
)
from .modeling_bart import (
@@ -356,6 +357,8 @@
FlaubertForTokenClassification,
FlaubertForQuestionAnswering,
FlaubertForQuestionAnsweringSimple,
+ FlaubertForTokenClassification,
+ FlaubertForMultipleChoice,
FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
)
diff --git a/src/transformers/modeling_auto.py b/src/transformers/modeling_auto.py
index b3dc19fc1c10..5f6ad671edbe 100644
--- a/src/transformers/modeling_auto.py
+++ b/src/transformers/modeling_auto.py
@@ -98,6 +98,7 @@
)
from .modeling_encoder_decoder import EncoderDecoderModel
from .modeling_flaubert import (
+ FlaubertForMultipleChoice,
FlaubertForQuestionAnsweringSimple,
FlaubertForSequenceClassification,
FlaubertForTokenClassification,
@@ -142,6 +143,7 @@
from .modeling_t5 import T5ForConditionalGeneration, T5Model
from .modeling_transfo_xl import TransfoXLLMHeadModel, TransfoXLModel
from .modeling_xlm import (
+ XLMForMultipleChoice,
XLMForQuestionAnsweringSimple,
XLMForSequenceClassification,
XLMForTokenClassification,
@@ -338,6 +340,7 @@
(XLNetConfig, XLNetForTokenClassification),
(AlbertConfig, AlbertForTokenClassification),
(ElectraConfig, ElectraForTokenClassification),
+ (FlaubertConfig, FlaubertForTokenClassification),
]
)
@@ -353,6 +356,8 @@
(MobileBertConfig, MobileBertForMultipleChoice),
(XLNetConfig, XLNetForMultipleChoice),
(AlbertConfig, AlbertForMultipleChoice),
+ (XLMConfig, XLMForMultipleChoice),
+ (FlaubertConfig, FlaubertForMultipleChoice),
]
)
diff --git a/src/transformers/modeling_flaubert.py b/src/transformers/modeling_flaubert.py
index aeda892f7ff6..5d0ebf27fc3f 100644
--- a/src/transformers/modeling_flaubert.py
+++ b/src/transformers/modeling_flaubert.py
@@ -25,6 +25,7 @@
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_outputs import BaseModelOutput
from .modeling_xlm import (
+ XLMForMultipleChoice,
XLMForQuestionAnswering,
XLMForQuestionAnsweringSimple,
XLMForSequenceClassification,
@@ -382,3 +383,22 @@ def __init__(self, config):
super().__init__(config)
self.transformer = FlaubertModel(config)
self.init_weights()
+
+
+@add_start_docstrings(
+ """Flaubert Model with a multiple choice classification head on top (a linear layer on top of
+ the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
+ FLAUBERT_START_DOCSTRING,
+)
+class FlaubertForMultipleChoice(XLMForMultipleChoice):
+ """
+ This class overrides :class:`~transformers.XLMForMultipleChoice`. Please check the
+ superclass for the appropriate documentation alongside usage examples.
+ """
+
+ config_class = FlaubertConfig
+
+ def __init__(self, config):
+ super().__init__(config)
+ self.transformer = FlaubertModel(config)
+ self.init_weights()
diff --git a/src/transformers/modeling_tf_flaubert.py b/src/transformers/modeling_tf_flaubert.py
index d10324de088e..cf721be25ccd 100644
--- a/src/transformers/modeling_tf_flaubert.py
+++ b/src/transformers/modeling_tf_flaubert.py
@@ -22,7 +22,7 @@
from .configuration_flaubert import FlaubertConfig
from .file_utils import add_start_docstrings
-from .modeling_tf_utils import keras_serializable, shape_list
+from .modeling_tf_utils import cast_bool_to_primitive, keras_serializable, shape_list
from .modeling_tf_xlm import (
TFXLMForMultipleChoice,
TFXLMForQuestionAnsweringSimple,
@@ -30,6 +30,7 @@
TFXLMForTokenClassification,
TFXLMMainLayer,
TFXLMModel,
+ TFXLMPredLayer,
TFXLMWithLMHeadModel,
get_masks,
)
@@ -123,6 +124,8 @@ def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.layerdrop = getattr(config, "layerdrop", 0.0)
self.pre_norm = getattr(config, "pre_norm", False)
+ self.output_attentions = config.output_attentions
+ self.output_hidden_states = config.output_hidden_states
def call(
self,
@@ -135,9 +138,9 @@ def call(
cache=None,
head_mask=None,
inputs_embeds=None,
+ output_attentions=None,
+ output_hidden_states=None,
training=False,
- output_attentions=False,
- output_hidden_states=False,
):
# removed: src_enc=None, src_len=None
if isinstance(inputs, (tuple, list)):
@@ -150,7 +153,9 @@ def call(
cache = inputs[6] if len(inputs) > 6 else cache
head_mask = inputs[7] if len(inputs) > 7 else head_mask
inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds
- assert len(inputs) <= 9, "Too many inputs."
+ output_attentions = inputs[9] if len(inputs) > 9 else output_attentions
+ output_hidden_states = inputs[10] if len(inputs) > 10 else output_hidden_states
+ assert len(inputs) <= 11, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask)
@@ -161,10 +166,15 @@ def call(
cache = inputs.get("cache", cache)
head_mask = inputs.get("head_mask", head_mask)
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
- assert len(inputs) <= 9, "Too many inputs."
+ output_attentions = inputs.get("output_attentions", output_attentions)
+ output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
+ assert len(inputs) <= 11, "Too many inputs."
else:
input_ids = inputs
+ output_attentions = output_attentions if output_attentions is not None else self.output_attentions
+ output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states
+
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
@@ -257,9 +267,12 @@ def call(
# self attention
if not self.pre_norm:
- attn_outputs = self.attentions[i]([tensor, attn_mask, None, cache, head_mask[i]], training=training)
+ attn_outputs = self.attentions[i](
+ [tensor, attn_mask, None, cache, head_mask[i], output_attentions], training=training
+ )
attn = attn_outputs[0]
- attentions = attentions + (attn_outputs[1],)
+ if cast_bool_to_primitive(output_attentions, self.output_attentions) is True:
+ attentions = attentions + (attn_outputs[1],)
attn = self.dropout(attn, training=training)
tensor = tensor + attn
tensor = self.layer_norm1[i](tensor)
@@ -269,7 +282,7 @@ def call(
[tensor_normalized, attn_mask, None, cache, head_mask[i]], training=training
)
attn = attn_outputs[0]
- if output_attentions:
+ if cast_bool_to_primitive(output_attentions, self.output_attentions) is True:
attentions = attentions + (attn_outputs[1],)
attn = self.dropout(attn, training=training)
tensor = tensor + attn
@@ -292,7 +305,7 @@ def call(
tensor = tensor * mask[..., tf.newaxis]
# Add last hidden state
- if output_hidden_states:
+ if cast_bool_to_primitive(output_hidden_states, self.output_hidden_states) is True:
hidden_states = hidden_states + (tensor,)
# update cache length
@@ -303,9 +316,9 @@ def call(
# tensor = tensor.transpose(0, 1)
outputs = (tensor,)
- if output_hidden_states:
+ if cast_bool_to_primitive(output_hidden_states, self.output_hidden_states) is True:
outputs = outputs + (hidden_states,)
- if output_attentions:
+ if cast_bool_to_primitive(output_attentions, self.output_attentions) is True:
outputs = outputs + (attentions,)
return outputs # outputs, (hidden_states), (attentions)
@@ -321,6 +334,7 @@ class TFFlaubertWithLMHeadModel(TFXLMWithLMHeadModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = TFFlaubertMainLayer(config, name="transformer")
+ self.pred_layer = TFXLMPredLayer(config, self.transformer.embeddings, name="pred_layer_._proj")
@add_start_docstrings(
diff --git a/src/transformers/modeling_tf_xlm.py b/src/transformers/modeling_tf_xlm.py
index e912891c212d..7a5f029e56cf 100644
--- a/src/transformers/modeling_tf_xlm.py
+++ b/src/transformers/modeling_tf_xlm.py
@@ -19,6 +19,7 @@
import itertools
import logging
import math
+import warnings
import numpy as np
import tensorflow as tf
@@ -827,6 +828,9 @@ def __init__(self, config, *inputs, **kwargs):
self.transformer = TFXLMMainLayer(config, name="transformer")
self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name="sequence_summary")
+ self.logits_proj = tf.keras.layers.Dense(
+ 1, kernel_initializer=get_initializer(config.initializer_range), name="logits_proj"
+ )
@property
def dummy_inputs(self):
@@ -835,7 +839,10 @@ def dummy_inputs(self):
Returns:
tf.Tensor with dummy inputs
"""
- return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
+ return {
+ "input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS),
+ "langs": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS),
+ }
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
@@ -892,7 +899,7 @@ def call(
output_attentions = inputs[9] if len(inputs) > 9 else output_attentions
output_hidden_states = inputs[10] if len(inputs) > 10 else output_hidden_states
labels = inputs[11] if len(inputs) > 11 else labels
- assert len(inputs) <= 11, "Too many inputs."
+ assert len(inputs) <= 12, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask)
@@ -921,17 +928,31 @@ def call(
flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
+ flat_langs = tf.reshape(langs, (-1, seq_length)) if langs is not None else None
+ flat_inputs_embeds = (
+ tf.reshape(inputs_embeds, (-1, inputs_embeds.shape[-2], inputs_embeds.shape[-1]))
+ if inputs_embeds is not None
+ else None
+ )
+
+ if lengths is not None:
+ warnings.warn(
+ "The `lengths` parameter cannot be used with the XLM multiple choice models. Please use the "
+ "attention mask instead.",
+ FutureWarning,
+ )
+ lengths = None
flat_inputs = [
flat_input_ids,
flat_attention_mask,
- langs,
+ flat_langs,
flat_token_type_ids,
flat_position_ids,
lengths,
cache,
head_mask,
- inputs_embeds,
+ flat_inputs_embeds,
output_attentions,
output_hidden_states,
]
@@ -939,6 +960,7 @@ def call(
transformer_outputs = self.transformer(flat_inputs, training=training)
output = transformer_outputs[0]
logits = self.sequence_summary(output)
+ logits = self.logits_proj(logits)
reshaped_logits = tf.reshape(logits, (-1, num_choices))
outputs = (reshaped_logits,) + transformer_outputs[1:] # add hidden states and attention if they are here
diff --git a/src/transformers/modeling_xlm.py b/src/transformers/modeling_xlm.py
index e7396df689e7..9a366cee6bdd 100644
--- a/src/transformers/modeling_xlm.py
+++ b/src/transformers/modeling_xlm.py
@@ -19,6 +19,7 @@
import itertools
import logging
import math
+import warnings
from dataclasses import dataclass
from typing import Optional, Tuple
@@ -40,6 +41,7 @@
from .modeling_outputs import (
BaseModelOutput,
MaskedLMOutput,
+ MultipleChoiceModelOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
@@ -1122,3 +1124,105 @@ def forward(
return TokenClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
)
+
+
+@add_start_docstrings(
+ """XLM Model with a multiple choice classification head on top (a linear layer on top of
+ the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
+ XLM_START_DOCSTRING,
+)
+class XLMForMultipleChoice(XLMPreTrainedModel):
+ def __init__(self, config, *inputs, **kwargs):
+ super().__init__(config, *inputs, **kwargs)
+
+ self.transformer = XLMModel(config)
+ self.sequence_summary = SequenceSummary(config)
+ self.logits_proj = nn.Linear(config.num_labels, 1)
+
+ self.init_weights()
+
+ @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ tokenizer_class=_TOKENIZER_FOR_DOC,
+ checkpoint="xlm-mlm-en-2048",
+ output_type=MultipleChoiceModelOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids=None,
+ attention_mask=None,
+ langs=None,
+ token_type_ids=None,
+ position_ids=None,
+ lengths=None,
+ cache=None,
+ head_mask=None,
+ inputs_embeds=None,
+ labels=None,
+ output_attentions=None,
+ output_hidden_states=None,
+ return_tuple=None,
+ ):
+ r"""
+ labels (:obj:`torch.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+ Labels for computing the multiple choice classification loss.
+ Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
+ of the input tensors. (see `input_ids` above)
+ """
+ return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
+
+ input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
+ attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
+ token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
+ position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
+ langs = langs.view(-1, langs.size(-1)) if langs is not None else None
+ inputs_embeds = (
+ inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
+ if inputs_embeds is not None
+ else None
+ )
+
+ if lengths is not None:
+ warnings.warn(
+ "The `lengths` parameter cannot be used with the XLM multiple choice models. Please use the "
+ "attention mask instead.",
+ FutureWarning,
+ )
+ lengths = None
+
+ transformer_outputs = self.transformer(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ langs=langs,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ lengths=lengths,
+ cache=cache,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_tuple=return_tuple,
+ )
+ output = transformer_outputs[0]
+ logits = self.sequence_summary(output)
+ logits = self.logits_proj(logits)
+ reshaped_logits = logits.view(-1, num_choices)
+
+ loss = None
+ if labels is not None:
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(reshaped_logits, labels)
+
+ if return_tuple:
+ output = (reshaped_logits,) + transformer_outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return MultipleChoiceModelOutput(
+ loss=loss,
+ logits=reshaped_logits,
+ hidden_states=transformer_outputs.hidden_states,
+ attentions=transformer_outputs.attentions,
+ )
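
# A minimal usage sketch for the new XLMForMultipleChoice head; the checkpoint name,
# example texts, and torch.no_grad() are illustrative assumptions, not part of this patch.
import torch
from transformers import XLMTokenizer, XLMForMultipleChoice

tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
model = XLMForMultipleChoice.from_pretrained("xlm-mlm-en-2048")

prompt = "The capital of France is"
choices = ["Paris.", "a relational database."]

# Encode each (prompt, choice) pair, then add the num_choices dimension so tensors have
# shape (batch_size=1, num_choices=2, seq_len), which forward() flattens internally.
encoding = tokenizer([prompt] * len(choices), choices, padding=True, return_tensors="pt")
inputs = {k: v.unsqueeze(0) for k, v in encoding.items()}

with torch.no_grad():
    outputs = model(**inputs, labels=torch.tensor([0]))  # label 0 -> the first choice is correct
# outputs holds the multiple choice classification loss and logits of shape (1, 2).
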
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 097c387543cc..f6841cb84465 100644
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -66,7 +66,7 @@ def _prepare_for_class(self, inputs_dict, model_class):
if model_class in MODEL_FOR_MULTIPLE_CHOICE_MAPPING.values():
return {
k: v.unsqueeze(1).expand(-1, self.model_tester.num_choices, -1).contiguous()
- if isinstance(v, torch.Tensor) and v.ndim != 0
+ if isinstance(v, torch.Tensor) and v.ndim > 1
else v
for k, v in inputs_dict.items()
}
diff --git a/tests/test_modeling_flaubert.py b/tests/test_modeling_flaubert.py
index af2918cb947e..d4342e21843f 100644
--- a/tests/test_modeling_flaubert.py
+++ b/tests/test_modeling_flaubert.py
@@ -32,6 +32,7 @@
FlaubertForQuestionAnsweringSimple,
FlaubertForSequenceClassification,
FlaubertForTokenClassification,
+ FlaubertForMultipleChoice,
)
from transformers.modeling_flaubert import FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST
@@ -90,6 +91,7 @@ def prepare_config_and_inputs(self):
sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
is_impossible_labels = ids_tensor([self.batch_size], 2).float()
+ choice_labels = ids_tensor([self.batch_size], self.num_choices)
config = FlaubertConfig(
vocab_size=self.vocab_size,
@@ -118,6 +120,7 @@ def prepare_config_and_inputs(self):
sequence_labels,
token_labels,
is_impossible_labels,
+ choice_labels,
input_mask,
)
@@ -133,6 +136,7 @@ def create_and_check_flaubert_model(
sequence_labels,
token_labels,
is_impossible_labels,
+ choice_labels,
input_mask,
):
model = FlaubertModel(config=config)
@@ -158,6 +162,7 @@ def create_and_check_flaubert_lm_head(
sequence_labels,
token_labels,
is_impossible_labels,
+ choice_labels,
input_mask,
):
model = FlaubertWithLMHeadModel(config)
@@ -183,6 +188,7 @@ def create_and_check_flaubert_simple_qa(
sequence_labels,
token_labels,
is_impossible_labels,
+ choice_labels,
input_mask,
):
model = FlaubertForQuestionAnsweringSimple(config)
@@ -212,6 +218,7 @@ def create_and_check_flaubert_qa(
sequence_labels,
token_labels,
is_impossible_labels,
+ choice_labels,
input_mask,
):
model = FlaubertForQuestionAnswering(config)
@@ -278,6 +285,7 @@ def create_and_check_flaubert_sequence_classif(
sequence_labels,
token_labels,
is_impossible_labels,
+ choice_labels,
input_mask,
):
model = FlaubertForSequenceClassification(config)
@@ -304,6 +312,7 @@ def create_and_check_flaubert_token_classif(
sequence_labels,
token_labels,
is_impossible_labels,
+ choice_labels,
input_mask,
):
config.num_labels = self.num_labels
@@ -319,6 +328,38 @@ def create_and_check_flaubert_token_classif(
self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels])
self.check_loss_output(result)
+ def create_and_check_flaubert_multiple_choice(
+ self,
+ config,
+ input_ids,
+ token_type_ids,
+ input_lengths,
+ sequence_labels,
+ token_labels,
+ is_impossible_labels,
+ choice_labels,
+ input_mask,
+ ):
+ config.num_choices = self.num_choices
+ model = FlaubertForMultipleChoice(config=config)
+ model.to(torch_device)
+ model.eval()
+ multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+ multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+ multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+ loss, logits = model(
+ multiple_choice_inputs_ids,
+ attention_mask=multiple_choice_input_mask,
+ token_type_ids=multiple_choice_token_type_ids,
+ labels=choice_labels,
+ )
+ result = {
+ "loss": loss,
+ "logits": logits,
+ }
+ self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_choices])
+ self.check_loss_output(result)
+
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(
@@ -329,6 +370,7 @@ def prepare_config_and_inputs_for_common(self):
sequence_labels,
token_labels,
is_impossible_labels,
+ choice_labels,
input_mask,
) = config_and_inputs
inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "lengths": input_lengths}
@@ -346,6 +388,7 @@ class FlaubertModelTest(ModelTesterMixin, unittest.TestCase):
FlaubertForQuestionAnsweringSimple,
FlaubertForSequenceClassification,
FlaubertForTokenClassification,
+ FlaubertForMultipleChoice,
)
if is_torch_available()
else ()
@@ -382,6 +425,10 @@ def test_flaubert_token_classif(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_flaubert_token_classif(*config_and_inputs)
+ def test_flaubert_multiple_choice(self):
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
+ self.model_tester.create_and_check_flaubert_multiple_choice(*config_and_inputs)
+
@slow
def test_model_from_pretrained(self):
for model_name in FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py
index 839c064209d5..88bfaa63cdc1 100644
--- a/tests/test_modeling_tf_common.py
+++ b/tests/test_modeling_tf_common.py
@@ -80,8 +80,8 @@ class TFModelTesterMixin:
def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
if model_class in TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING.values():
inputs_dict = {
- k: tf.tile(tf.expand_dims(v, 1), (1, self.model_tester.num_choices, 1))
- if isinstance(v, tf.Tensor) and v.ndim != 0
+ k: tf.tile(tf.expand_dims(v, 1), (1, self.model_tester.num_choices) + (1,) * (v.ndim - 1))
+ if isinstance(v, tf.Tensor) and v.ndim > 0
else v
for k, v in inputs_dict.items()
}
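The TF helper above has to cope with tensors of different ranks, so the tile multiplier is built from the rank instead of being hard-coded. A small sketch of the same idea, again with illustrative names and shapes:

```python
import tensorflow as tf

batch_size, num_choices, seq_len = 2, 4, 7
input_ids = tf.zeros((batch_size, seq_len), dtype=tf.int32)
lengths = tf.zeros((batch_size,), dtype=tf.int32)  # rank-1 input

def tile_for_choices(v):
    # Insert a choices axis, then repeat along it; any remaining dims are left untouched.
    return tf.tile(tf.expand_dims(v, 1), (1, num_choices) + (1,) * (v.ndim - 1))

assert tile_for_choices(input_ids).shape == (batch_size, num_choices, seq_len)
assert tile_for_choices(lengths).shape == (batch_size, num_choices)
```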
diff --git a/tests/test_modeling_tf_flaubert.py b/tests/test_modeling_tf_flaubert.py
index 1b3e6d882397..399c78ca53da 100644
--- a/tests/test_modeling_tf_flaubert.py
+++ b/tests/test_modeling_tf_flaubert.py
@@ -18,11 +18,340 @@
from transformers import is_tf_available
from transformers.testing_utils import require_tf, slow
+from .test_configuration_common import ConfigTester
+from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor
+
if is_tf_available():
import tensorflow as tf
import numpy as np
- from transformers import TFFlaubertModel
+
+ from transformers import (
+ FlaubertConfig,
+ TFFlaubertModel,
+ TFFlaubertWithLMHeadModel,
+ TFFlaubertForSequenceClassification,
+ TFFlaubertForQuestionAnsweringSimple,
+ TFFlaubertForTokenClassification,
+ TFFlaubertForMultipleChoice,
+ TF_FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
+ )
+
+
+class TFFlaubertModelTester:
+ def __init__(
+ self, parent,
+ ):
+ self.parent = parent
+ self.batch_size = 13
+ self.seq_length = 7
+ self.is_training = True
+ self.use_input_lengths = True
+ self.use_token_type_ids = True
+ self.use_labels = True
+ self.gelu_activation = True
+ self.sinusoidal_embeddings = False
+ self.causal = False
+ self.asm = False
+ self.n_langs = 2
+ self.vocab_size = 99
+ self.n_special = 0
+ self.hidden_size = 32
+ self.num_hidden_layers = 5
+ self.num_attention_heads = 4
+ self.hidden_dropout_prob = 0.1
+ self.attention_probs_dropout_prob = 0.1
+ self.max_position_embeddings = 512
+ self.type_vocab_size = 16
+ self.type_sequence_label_size = 2
+ self.initializer_range = 0.02
+ self.num_labels = 3
+ self.num_choices = 4
+ self.summary_type = "last"
+ self.use_proj = True
+ self.scope = None
+ self.bos_token_id = 0
+
+ def prepare_config_and_inputs(self):
+ input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+ input_mask = ids_tensor([self.batch_size, self.seq_length], 2, dtype=tf.float32)
+
+ input_lengths = None
+ if self.use_input_lengths:
+ input_lengths = (
+ ids_tensor([self.batch_size], vocab_size=2) + self.seq_length - 2
+ ) # small variation of seq_length
+
+ token_type_ids = None
+ if self.use_token_type_ids:
+ token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.n_langs)
+
+ sequence_labels = None
+ token_labels = None
+ is_impossible_labels = None
+ if self.use_labels:
+ sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+ token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
+ is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32)
+ choice_labels = ids_tensor([self.batch_size], self.num_choices)
+
+ config = FlaubertConfig(
+ vocab_size=self.vocab_size,
+ n_special=self.n_special,
+ emb_dim=self.hidden_size,
+ n_layers=self.num_hidden_layers,
+ n_heads=self.num_attention_heads,
+ dropout=self.hidden_dropout_prob,
+ attention_dropout=self.attention_probs_dropout_prob,
+ gelu_activation=self.gelu_activation,
+ sinusoidal_embeddings=self.sinusoidal_embeddings,
+ asm=self.asm,
+ causal=self.causal,
+ n_langs=self.n_langs,
+ max_position_embeddings=self.max_position_embeddings,
+ initializer_range=self.initializer_range,
+ summary_type=self.summary_type,
+ use_proj=self.use_proj,
+ bos_token_id=self.bos_token_id,
+ )
+
+ return (
+ config,
+ input_ids,
+ token_type_ids,
+ input_lengths,
+ sequence_labels,
+ token_labels,
+ is_impossible_labels,
+ choice_labels,
+ input_mask,
+ )
+
+ def create_and_check_flaubert_model(
+ self,
+ config,
+ input_ids,
+ token_type_ids,
+ input_lengths,
+ sequence_labels,
+ token_labels,
+ is_impossible_labels,
+ choice_labels,
+ input_mask,
+ ):
+ model = TFFlaubertModel(config=config)
+ inputs = {"input_ids": input_ids, "lengths": input_lengths, "langs": token_type_ids}
+ outputs = model(inputs)
+
+ inputs = [input_ids, input_mask]
+ outputs = model(inputs)
+ sequence_output = outputs[0]
+ result = {
+ "sequence_output": sequence_output.numpy(),
+ }
+ self.parent.assertListEqual(
+ list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
+ )
+
+ def create_and_check_flaubert_lm_head(
+ self,
+ config,
+ input_ids,
+ token_type_ids,
+ input_lengths,
+ sequence_labels,
+ token_labels,
+ is_impossible_labels,
+ choice_labels,
+ input_mask,
+ ):
+ model = TFFlaubertWithLMHeadModel(config)
+
+ inputs = {"input_ids": input_ids, "lengths": input_lengths, "langs": token_type_ids}
+ outputs = model(inputs)
+
+ logits = outputs[0]
+
+ result = {
+ "logits": logits.numpy(),
+ }
+
+ self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.vocab_size])
+
+ def create_and_check_flaubert_qa(
+ self,
+ config,
+ input_ids,
+ token_type_ids,
+ input_lengths,
+ sequence_labels,
+ token_labels,
+ is_impossible_labels,
+ choice_labels,
+ input_mask,
+ ):
+ model = TFFlaubertForQuestionAnsweringSimple(config)
+
+ inputs = {"input_ids": input_ids, "lengths": input_lengths}
+
+ start_logits, end_logits = model(inputs)
+
+ result = {
+ "start_logits": start_logits.numpy(),
+ "end_logits": end_logits.numpy(),
+ }
+
+ self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
+ self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])
+
+ def create_and_check_flaubert_sequence_classif(
+ self,
+ config,
+ input_ids,
+ token_type_ids,
+ input_lengths,
+ sequence_labels,
+ token_labels,
+ is_impossible_labels,
+ choice_labels,
+ input_mask,
+ ):
+ model = TFFlaubertForSequenceClassification(config)
+
+ inputs = {"input_ids": input_ids, "lengths": input_lengths}
+
+ (logits,) = model(inputs)
+
+ result = {
+ "logits": logits.numpy(),
+ }
+
+ self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.type_sequence_label_size])
+
+ def create_and_check_flaubert_for_token_classification(
+ self,
+ config,
+ input_ids,
+ token_type_ids,
+ input_lengths,
+ sequence_labels,
+ token_labels,
+ is_impossible_labels,
+ choice_labels,
+ input_mask,
+ ):
+ config.num_labels = self.num_labels
+ model = TFFlaubertForTokenClassification(config=config)
+ inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
+ (logits,) = model(inputs)
+ result = {
+ "logits": logits.numpy(),
+ }
+ self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels])
+
+ def create_and_check_flaubert_for_multiple_choice(
+ self,
+ config,
+ input_ids,
+ token_type_ids,
+ input_lengths,
+ sequence_labels,
+ token_labels,
+ is_impossible_labels,
+ choice_labels,
+ input_mask,
+ ):
+ config.num_choices = self.num_choices
+ model = TFFlaubertForMultipleChoice(config=config)
+ multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
+ multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
+ multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))
+ inputs = {
+ "input_ids": multiple_choice_inputs_ids,
+ "attention_mask": multiple_choice_input_mask,
+ "token_type_ids": multiple_choice_token_type_ids,
+ }
+ (logits,) = model(inputs)
+ result = {"logits": logits.numpy()}
+ self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices])
+
+ def prepare_config_and_inputs_for_common(self):
+ config_and_inputs = self.prepare_config_and_inputs()
+ (
+ config,
+ input_ids,
+ token_type_ids,
+ input_lengths,
+ sequence_labels,
+ token_labels,
+ is_impossible_labels,
+ choice_labels,
+ input_mask,
+ ) = config_and_inputs
+ inputs_dict = {
+ "input_ids": input_ids,
+ "token_type_ids": token_type_ids,
+ "langs": token_type_ids,
+ "lengths": input_lengths,
+ }
+ return config, inputs_dict
+
+
+@require_tf
+class TFFlaubertModelTest(TFModelTesterMixin, unittest.TestCase):
+
+ all_model_classes = (
+ (
+ TFFlaubertModel,
+ TFFlaubertWithLMHeadModel,
+ TFFlaubertForSequenceClassification,
+ TFFlaubertForQuestionAnsweringSimple,
+ TFFlaubertForTokenClassification,
+ TFFlaubertForMultipleChoice,
+ )
+ if is_tf_available()
+ else ()
+ )
+ all_generative_model_classes = (
+ (TFFlaubertWithLMHeadModel,) if is_tf_available() else ()
+ ) # TODO (PVP): Check other models whether language generation is also applicable
+
+ def setUp(self):
+ self.model_tester = TFFlaubertModelTester(self)
+ self.config_tester = ConfigTester(self, config_class=FlaubertConfig, emb_dim=37)
+
+ def test_config(self):
+ self.config_tester.run_common_tests()
+
+ def test_flaubert_model(self):
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
+ self.model_tester.create_and_check_flaubert_model(*config_and_inputs)
+
+ def test_flaubert_lm_head(self):
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
+ self.model_tester.create_and_check_flaubert_lm_head(*config_and_inputs)
+
+ def test_flaubert_qa(self):
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
+ self.model_tester.create_and_check_flaubert_qa(*config_and_inputs)
+
+ def test_flaubert_sequence_classif(self):
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
+ self.model_tester.create_and_check_flaubert_sequence_classif(*config_and_inputs)
+
+ def test_for_token_classification(self):
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
+ self.model_tester.create_and_check_flaubert_for_token_classification(*config_and_inputs)
+
+ def test_for_multiple_choice(self):
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
+ self.model_tester.create_and_check_flaubert_for_multiple_choice(*config_and_inputs)
+
+ @slow
+ def test_model_from_pretrained(self):
+ for model_name in TF_FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
+ model = TFFlaubertModel.from_pretrained(model_name)
+ self.assertIsNotNone(model)
@require_tf
diff --git a/tests/test_modeling_tf_xlm.py b/tests/test_modeling_tf_xlm.py
index 26cdb0a39c6f..1903f4a8dfb4 100644
--- a/tests/test_modeling_tf_xlm.py
+++ b/tests/test_modeling_tf_xlm.py
@@ -32,6 +32,7 @@
TFXLMForSequenceClassification,
TFXLMForQuestionAnsweringSimple,
TFXLMForTokenClassification,
+ TFXLMForMultipleChoice,
TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST,
)
@@ -91,6 +92,7 @@ def prepare_config_and_inputs(self):
sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32)
+ choice_labels = ids_tensor([self.batch_size], self.num_choices)
config = XLMConfig(
vocab_size=self.vocab_size,
@@ -120,6 +122,7 @@ def prepare_config_and_inputs(self):
sequence_labels,
token_labels,
is_impossible_labels,
+ choice_labels,
input_mask,
)
@@ -132,6 +135,7 @@ def create_and_check_xlm_model(
sequence_labels,
token_labels,
is_impossible_labels,
+ choice_labels,
input_mask,
):
model = TFXLMModel(config=config)
@@ -157,6 +161,7 @@ def create_and_check_xlm_lm_head(
sequence_labels,
token_labels,
is_impossible_labels,
+ choice_labels,
input_mask,
):
model = TFXLMWithLMHeadModel(config)
@@ -181,6 +186,7 @@ def create_and_check_xlm_qa(
sequence_labels,
token_labels,
is_impossible_labels,
+ choice_labels,
input_mask,
):
model = TFXLMForQuestionAnsweringSimple(config)
@@ -206,6 +212,7 @@ def create_and_check_xlm_sequence_classif(
sequence_labels,
token_labels,
is_impossible_labels,
+ choice_labels,
input_mask,
):
model = TFXLMForSequenceClassification(config)
@@ -229,6 +236,7 @@ def create_and_check_xlm_for_token_classification(
sequence_labels,
token_labels,
is_impossible_labels,
+ choice_labels,
input_mask,
):
config.num_labels = self.num_labels
@@ -240,6 +248,32 @@ def create_and_check_xlm_for_token_classification(
}
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels])
+ def create_and_check_xlm_for_multiple_choice(
+ self,
+ config,
+ input_ids,
+ token_type_ids,
+ input_lengths,
+ sequence_labels,
+ token_labels,
+ is_impossible_labels,
+ choice_labels,
+ input_mask,
+ ):
+ config.num_choices = self.num_choices
+ model = TFXLMForMultipleChoice(config=config)
+ multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
+ multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
+ multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))
+ inputs = {
+ "input_ids": multiple_choice_inputs_ids,
+ "attention_mask": multiple_choice_input_mask,
+ "token_type_ids": multiple_choice_token_type_ids,
+ }
+ (logits,) = model(inputs)
+ result = {"logits": logits.numpy()}
+ self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices])
+
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(
@@ -250,6 +284,7 @@ def prepare_config_and_inputs_for_common(self):
sequence_labels,
token_labels,
is_impossible_labels,
+ choice_labels,
input_mask,
) = config_and_inputs
inputs_dict = {
@@ -265,13 +300,13 @@ def prepare_config_and_inputs_for_common(self):
class TFXLMModelTest(TFModelTesterMixin, unittest.TestCase):
all_model_classes = (
- # TODO The multiple choice model is missing and should be added.
(
TFXLMModel,
TFXLMWithLMHeadModel,
TFXLMForSequenceClassification,
TFXLMForQuestionAnsweringSimple,
TFXLMForTokenClassification,
+ TFXLMForMultipleChoice,
)
if is_tf_available()
else ()
@@ -307,6 +342,10 @@ def test_for_token_classification(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_xlm_for_token_classification(*config_and_inputs)
+ def test_for_multiple_choice(self):
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
+ self.model_tester.create_and_check_xlm_for_multiple_choice(*config_and_inputs)
+
@slow
def test_model_from_pretrained(self):
for model_name in TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
diff --git a/tests/test_modeling_xlm.py b/tests/test_modeling_xlm.py
index 2a5cd4096ae2..efa9346cee51 100644
--- a/tests/test_modeling_xlm.py
+++ b/tests/test_modeling_xlm.py
@@ -33,6 +33,7 @@
XLMForQuestionAnswering,
XLMForSequenceClassification,
XLMForQuestionAnsweringSimple,
+ XLMForMultipleChoice,
)
from transformers.modeling_xlm import XLM_PRETRAINED_MODEL_ARCHIVE_LIST
@@ -63,7 +64,7 @@ def __init__(
self.max_position_embeddings = 512
self.type_sequence_label_size = 2
self.initializer_range = 0.02
- self.num_labels = 3
+ self.num_labels = 2
self.num_choices = 4
self.summary_type = "last"
self.use_proj = True
@@ -91,6 +92,7 @@ def prepare_config_and_inputs(self):
sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
is_impossible_labels = ids_tensor([self.batch_size], 2).float()
+ choice_labels = ids_tensor([self.batch_size], self.num_choices)
config = XLMConfig(
vocab_size=self.vocab_size,
@@ -109,6 +111,7 @@ def prepare_config_and_inputs(self):
initializer_range=self.initializer_range,
summary_type=self.summary_type,
use_proj=self.use_proj,
+ num_labels=self.num_labels,
bos_token_id=self.bos_token_id,
)
@@ -120,6 +123,7 @@ def prepare_config_and_inputs(self):
sequence_labels,
token_labels,
is_impossible_labels,
+ choice_labels,
input_mask,
)
@@ -135,6 +139,7 @@ def create_and_check_xlm_model(
sequence_labels,
token_labels,
is_impossible_labels,
+ choice_labels,
input_mask,
):
model = XLMModel(config=config)
@@ -160,6 +165,7 @@ def create_and_check_xlm_lm_head(
sequence_labels,
token_labels,
is_impossible_labels,
+ choice_labels,
input_mask,
):
model = XLMWithLMHeadModel(config)
@@ -185,6 +191,7 @@ def create_and_check_xlm_simple_qa(
sequence_labels,
token_labels,
is_impossible_labels,
+ choice_labels,
input_mask,
):
model = XLMForQuestionAnsweringSimple(config)
@@ -214,6 +221,7 @@ def create_and_check_xlm_qa(
sequence_labels,
token_labels,
is_impossible_labels,
+ choice_labels,
input_mask,
):
model = XLMForQuestionAnswering(config)
@@ -280,6 +288,7 @@ def create_and_check_xlm_sequence_classif(
sequence_labels,
token_labels,
is_impossible_labels,
+ choice_labels,
input_mask,
):
model = XLMForSequenceClassification(config)
@@ -306,6 +315,7 @@ def create_and_check_xlm_token_classif(
sequence_labels,
token_labels,
is_impossible_labels,
+ choice_labels,
input_mask,
):
config.num_labels = self.num_labels
@@ -321,6 +331,38 @@ def create_and_check_xlm_token_classif(
self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels])
self.check_loss_output(result)
+ def create_and_check_xlm_for_multiple_choice(
+ self,
+ config,
+ input_ids,
+ token_type_ids,
+ input_lengths,
+ sequence_labels,
+ token_labels,
+ is_impossible_labels,
+ choice_labels,
+ input_mask,
+ ):
+ config.num_choices = self.num_choices
+ model = XLMForMultipleChoice(config=config)
+ model.to(torch_device)
+ model.eval()
+ multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+ multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+ multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+ loss, logits = model(
+ multiple_choice_inputs_ids,
+ attention_mask=multiple_choice_input_mask,
+ token_type_ids=multiple_choice_token_type_ids,
+ labels=choice_labels,
+ )
+ result = {
+ "loss": loss,
+ "logits": logits,
+ }
+ self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_choices])
+ self.check_loss_output(result)
+
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(
@@ -331,6 +373,7 @@ def prepare_config_and_inputs_for_common(self):
sequence_labels,
token_labels,
is_impossible_labels,
+ choice_labels,
input_mask,
) = config_and_inputs
inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "lengths": input_lengths}
@@ -348,6 +391,7 @@ class XLMModelTest(ModelTesterMixin, unittest.TestCase):
XLMForSequenceClassification,
XLMForQuestionAnsweringSimple,
XLMForTokenClassification,
+ XLMForMultipleChoice,
)
if is_torch_available()
else ()
@@ -387,6 +431,10 @@ def test_xlm_token_classif(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_xlm_token_classif(*config_and_inputs)
+ def test_xlm_for_multiple_choice(self):
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
+ self.model_tester.create_and_check_xlm_for_multiple_choice(*config_and_inputs)
+
@slow
def test_model_from_pretrained(self):
for model_name in XLM_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
From 54f9fbeff822ec0547fd23d0338654456925f6b7 Mon Sep 17 00:00:00 2001
From: Julien Plu
Date: Wed, 29 Jul 2020 20:32:01 +0200
Subject: [PATCH 056/127] Rework TF trainer (#6038)
* Fully rework training/prediction loops
* fix method name
* Fix variable name
* Fix property name
* Fix scope
* Fix method name
* Fix tuple index
* Fix tuple index
* Fix indentation
* Fix variable name
* fix eval before log
* Add drop remainder for test dataset
* Fix step number + fix logging datetime
* fix eval loss value
* use global step instead of step + fix logging at step 0
* Fix logging datetime
* Fix global_step usage
* Fix breaking loop + logging datetime
* Fix step in prediction loop
* Fix step breaking
* Fix train/test loops
* Force TF at least 2.2 for the trainer
* Use assert_cardinality to facilitate the dataset size computation
* Log steps per epoch
* Make tfds compliant with TPU
* Make tfds compliant with TPU
* Use TF dataset enumerate instead of the Python one
* revert previous commit
* Fix data_dir
* Apply style
* rebase on master
* Address Sylvain's comments
* Address Sylvain's and Lysandre comments
* Trigger CI
* Remove unused import
---
examples/README.md | 2 +-
.../multiple-choice/utils_multiple_choice.py | 2 +
examples/question-answering/run_tf_squad.py | 11 +-
examples/text-classification/run_tf_glue.py | 27 +-
examples/token-classification/run_tf_ner.py | 6 -
examples/token-classification/utils_ner.py | 4 +-
src/transformers/modeling_tf_utils.py | 7 +-
src/transformers/trainer_tf.py | 400 +++++++++---------
src/transformers/training_args_tf.py | 4 +-
9 files changed, 248 insertions(+), 215 deletions(-)
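Several hunks in this patch replace ad-hoc dataset counting with `tf.data.experimental.assert_cardinality`, which is why TF 2.2 becomes the minimum version. A minimal sketch of that mechanism, using an illustrative generator-backed dataset rather than the real feature pipelines:

```python
import tensorflow as tf

# Illustrative stand-in for the feature list the example scripts build.
features = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]

def gen():
    for f in features:
        yield {"input_ids": f}

ds = tf.data.Dataset.from_generator(gen, output_types={"input_ids": tf.int32})

# Generator-backed datasets report an unknown cardinality, which the reworked trainer
# rejects; asserting the size up front lets it derive steps per epoch without iterating.
ds = ds.apply(tf.data.experimental.assert_cardinality(len(features)))
assert tf.data.experimental.cardinality(ds).numpy() == len(features)
```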
diff --git a/examples/README.md b/examples/README.md
index 7a330a74eff8..a298ea4ea3e6 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -1,7 +1,7 @@
# Examples
Version 2.9 of 🤗 Transformers introduces a new [`Trainer`](https://github.com/huggingface/transformers/blob/master/src/transformers/trainer.py) class for PyTorch, and its equivalent [`TFTrainer`](https://github.com/huggingface/transformers/blob/master/src/transformers/trainer_tf.py) for TF 2.
-Running the examples requires PyTorch 1.3.1+ or TensorFlow 2.1+.
+Running the examples requires PyTorch 1.3.1+ or TensorFlow 2.2+.
Here is the list of all our examples:
- **grouped by task** (all official examples work for multiple models)
diff --git a/examples/multiple-choice/utils_multiple_choice.py b/examples/multiple-choice/utils_multiple_choice.py
index 2f6dd040dce0..d17dd902f681 100644
--- a/examples/multiple-choice/utils_multiple_choice.py
+++ b/examples/multiple-choice/utils_multiple_choice.py
@@ -204,6 +204,8 @@ def gen():
)
def get_dataset(self):
+ self.dataset = self.dataset.apply(tf.data.experimental.assert_cardinality(len(self.features)))
+
return self.dataset
def __len__(self):
diff --git a/examples/question-answering/run_tf_squad.py b/examples/question-answering/run_tf_squad.py
index 1c654c32bfa5..7e90416bf5d7 100644
--- a/examples/question-answering/run_tf_squad.py
+++ b/examples/question-answering/run_tf_squad.py
@@ -21,6 +21,8 @@
from dataclasses import dataclass, field
from typing import Optional
+import tensorflow as tf
+
from transformers import (
AutoConfig,
AutoTokenizer,
@@ -68,6 +70,7 @@ class DataTrainingArguments:
data_dir: Optional[str] = field(
default=None, metadata={"help": "The input data dir. Should contain the .json files for the SQuAD task."}
)
+ use_tfds: Optional[bool] = field(default=True, metadata={"help": "If TFDS should be used or not."})
max_seq_length: int = field(
default=128,
metadata={
@@ -170,7 +173,7 @@ def main():
)
# Get datasets
- if not data_args.data_dir:
+ if data_args.use_tfds:
if data_args.version_2_with_negative:
logger.warn("tensorflow_datasets does not handle version 2 of SQuAD. Switch to version 1 automatically")
@@ -179,7 +182,7 @@ def main():
except ImportError:
raise ImportError("If not data_dir is specified, tensorflow_datasets needs to be installed.")
- tfds_examples = tfds.load("squad")
+ tfds_examples = tfds.load("squad", data_dir=data_args.data_dir)
train_examples = (
SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=False)
if training_args.do_train
@@ -209,6 +212,8 @@ def main():
else None
)
+ train_dataset = train_dataset.apply(tf.data.experimental.assert_cardinality(len(train_examples)))
+
eval_dataset = (
squad_convert_examples_to_features(
examples=eval_examples,
@@ -223,6 +228,8 @@ def main():
else None
)
+ eval_dataset = eval_dataset.apply(tf.data.experimental.assert_cardinality(len(eval_examples)))
+
# Initialize our Trainer
trainer = TFTrainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset,)
diff --git a/examples/text-classification/run_tf_glue.py b/examples/text-classification/run_tf_glue.py
index a1e4f7a90ae4..5477447040d6 100644
--- a/examples/text-classification/run_tf_glue.py
+++ b/examples/text-classification/run_tf_glue.py
@@ -9,6 +9,7 @@
from typing import Dict, Optional
import numpy as np
+import tensorflow as tf
import tensorflow_datasets as tfds
from transformers import (
@@ -35,7 +36,11 @@ class Split(Enum):
def get_tfds(
- task_name: str, tokenizer: PreTrainedTokenizer, max_seq_length: Optional[int] = None, mode: Split = Split.train
+ task_name: str,
+ tokenizer: PreTrainedTokenizer,
+ max_seq_length: Optional[int] = None,
+ mode: Split = Split.train,
+ data_dir: str = None,
):
if task_name == "mnli-mm" and mode == Split.dev:
tfds_name = "mnli_mismatched"
@@ -50,9 +55,11 @@ def get_tfds(
else:
tfds_name = task_name
- ds = tfds.load("glue/" + tfds_name, split=mode.value)
+ ds, info = tfds.load("glue/" + tfds_name, split=mode.value, with_info=True, data_dir=data_dir)
+ ds = glue_convert_examples_to_features(ds, tokenizer, max_seq_length, task_name)
+ ds = ds.apply(tf.data.experimental.assert_cardinality(info.splits[mode.value].num_examples))
- return glue_convert_examples_to_features(ds, tokenizer, max_seq_length, task_name)
+ return ds
logger = logging.getLogger(__name__)
@@ -69,6 +76,7 @@ class GlueDataTrainingArguments:
"""
task_name: str = field(metadata={"help": "The name of the task to train on: " + ", ".join(glue_processors.keys())})
+ data_dir: Optional[str] = field(default=None, metadata={"help": "The input/output data dir for TFDS."})
max_seq_length: int = field(
default=128,
metadata={
@@ -171,13 +179,22 @@ def main():
# Get datasets
train_dataset = (
- get_tfds(task_name=data_args.task_name, tokenizer=tokenizer, max_seq_length=data_args.max_seq_length)
+ get_tfds(
+ task_name=data_args.task_name,
+ tokenizer=tokenizer,
+ max_seq_length=data_args.max_seq_length,
+ data_dir=data_args.data_dir,
+ )
if training_args.do_train
else None
)
eval_dataset = (
get_tfds(
- task_name=data_args.task_name, tokenizer=tokenizer, max_seq_length=data_args.max_seq_length, mode=Split.dev
+ task_name=data_args.task_name,
+ tokenizer=tokenizer,
+ max_seq_length=data_args.max_seq_length,
+ mode=Split.dev,
+ data_dir=data_args.data_dir,
)
if training_args.do_eval
else None
diff --git a/examples/token-classification/run_tf_ner.py b/examples/token-classification/run_tf_ner.py
index 068f0617371c..5f38d5f981af 100644
--- a/examples/token-classification/run_tf_ner.py
+++ b/examples/token-classification/run_tf_ner.py
@@ -17,7 +17,6 @@
import logging
import os
-import warnings
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Tuple
@@ -185,11 +184,6 @@ def align_predictions(predictions: np.ndarray, label_ids: np.ndarray) -> Tuple[L
for i in range(batch_size):
for j in range(seq_len):
- if label_ids[i, j] == -1:
- label_ids[i, j] = -100
- warnings.warn(
- "Using `-1` to mask the loss for the token is depreciated. Please use `-100` instead."
- )
if label_ids[i, j] != -100:
out_label_list[i].append(label_map[label_ids[i][j]])
preds_list[i].append(label_map[preds[i][j]])
diff --git a/examples/token-classification/utils_ner.py b/examples/token-classification/utils_ner.py
index 42e07f642a8e..af9680d26c9e 100644
--- a/examples/token-classification/utils_ner.py
+++ b/examples/token-classification/utils_ner.py
@@ -146,7 +146,7 @@ class TFNerDataset:
"""
features: List[InputFeatures]
- pad_token_label_id: int = -1
+ pad_token_label_id: int = -100
# Use cross entropy ignore_index as padding label id so that only
# real label ids contribute to the loss later.
@@ -221,6 +221,8 @@ def gen():
)
def get_dataset(self):
+ self.dataset = self.dataset.apply(tf.data.experimental.assert_cardinality(len(self.features)))
+
return self.dataset
def __len__(self):
diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py
index 10b355f22a9d..f241fc8dcad2 100644
--- a/src/transformers/modeling_tf_utils.py
+++ b/src/transformers/modeling_tf_utils.py
@@ -17,7 +17,6 @@
import functools
import logging
import os
-import warnings
from typing import Dict, List, Optional, Union
import h5py
@@ -174,11 +173,7 @@ def compute_loss(self, labels, logits):
)
# make sure only labels that are not equal to -100
# are taken into account as loss
- if tf.math.reduce_any(labels == -1).numpy() is True:
- warnings.warn("Using `-1` to mask the loss for the token is deprecated. Please use `-100` instead.")
- active_loss = tf.reshape(labels, (-1,)) != -1
- else:
- active_loss = tf.reshape(labels, (-1,)) != -100
+ active_loss = tf.reshape(labels, (-1,)) != -100
reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, shape_list(logits)[2])), active_loss)
labels = tf.boolean_mask(tf.reshape(labels, (-1,)), active_loss)
diff --git a/src/transformers/trainer_tf.py b/src/transformers/trainer_tf.py
index 9a2c8181ebe4..c9c06edfbf57 100644
--- a/src/transformers/trainer_tf.py
+++ b/src/transformers/trainer_tf.py
@@ -1,12 +1,15 @@
"""Tensorflow trainer class."""
+import datetime
import logging
import math
import os
+import sys
from typing import Callable, Dict, Optional, Tuple
import numpy as np
import tensorflow as tf
+from packaging.version import parse
from .modeling_tf_utils import TFPreTrainedModel
from .optimization_tf import GradientAccumulator, create_optimizer
@@ -21,6 +24,15 @@
logger = logging.getLogger(__name__)
+if parse(tf.__version__).release < (2, 2, 0):
+ logger.info(
+ "You need to run the TensorFlow trainer with at least the version 2.2.0, your version is {}".format(
+ tf.__version__
+ )
+ )
+ sys.exit(1)
+
+
class TFTrainer:
"""
TFTrainer is a simple but feature-complete training and eval loop for TensorFlow,
@@ -57,7 +69,7 @@ class TFTrainer:
compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None
prediction_loss_only: bool
tb_writer: Optional[tf.summary.SummaryWriter] = None
- optimizers: Tuple[tf.keras.optimizers.Optimizer, tf.keras.optimizers.schedules.LearningRateSchedule] = None
+ optimizers: Tuple[tf.keras.optimizers.Optimizer, tf.keras.optimizers.schedules.LearningRateSchedule] = (None, None)
global_step: Optional[int] = None
epoch_logging: Optional[float] = None
@@ -70,7 +82,10 @@ def __init__(
compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
prediction_loss_only=False,
tb_writer: Optional[tf.summary.SummaryWriter] = None,
- optimizers: Tuple[tf.keras.optimizers.Optimizer, tf.keras.optimizers.schedules.LearningRateSchedule] = None,
+ optimizers: Tuple[tf.keras.optimizers.Optimizer, tf.keras.optimizers.schedules.LearningRateSchedule] = (
+ None,
+ None,
+ ),
):
self.model = model
self.args = args
@@ -78,7 +93,7 @@ def __init__(
self.eval_dataset = eval_dataset
self.compute_metrics = compute_metrics
self.prediction_loss_only = prediction_loss_only
- self.optimizers = optimizers
+ self.optimizer, self.lr_scheduler = optimizers
self.gradient_accumulator = GradientAccumulator()
self.global_step = 0
self.epoch_logging = 0
@@ -105,23 +120,19 @@ def get_train_tfdataset(self) -> tf.data.Dataset:
if self.train_dataset is None:
raise ValueError("Trainer: training requires a train_dataset.")
- self.num_train_examples = self.train_dataset.reduce(tf.constant(0), lambda x, _: x + 1).numpy()
+ self.total_train_batch_size = self.args.train_batch_size * self.args.gradient_accumulation_steps
+ self.num_train_examples = tf.data.experimental.cardinality(self.train_dataset).numpy()
- if self.args.max_steps > 0:
- self.train_steps = self.args.max_steps
- else:
- self.train_steps: int = math.ceil(self.num_train_examples / self.args.train_batch_size)
+ if self.num_train_examples < 0:
+ raise ValueError("The training dataset must have an asserted cardinality")
ds = (
- self.train_dataset.cache()
- .shuffle(self.num_train_examples)
- .batch(self.args.train_batch_size, drop_remainder=self.args.dataloader_drop_last)
+ self.train_dataset.repeat()
+ .shuffle(self.num_train_examples, seed=self.args.seed)
+ .batch(self.total_train_batch_size, drop_remainder=self.args.dataloader_drop_last)
.prefetch(tf.data.experimental.AUTOTUNE)
)
- if self.args.max_steps > 0:
- self.train_dataset = self.train_dataset.repeat(-1)
-
return self.args.strategy.experimental_distribute_dataset(ds)
def get_eval_tfdataset(self, eval_dataset: Optional[tf.data.Dataset] = None) -> tf.data.Dataset:
@@ -136,13 +147,20 @@ def get_eval_tfdataset(self, eval_dataset: Optional[tf.data.Dataset] = None) ->
raise ValueError("Trainer: evaluation requires an eval_dataset.")
eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset
+ num_examples = tf.data.experimental.cardinality(eval_dataset).numpy()
+
+ if num_examples < 0:
+ raise ValueError("The training dataset must have an asserted cardinality")
+
+ approx = math.floor if self.args.dataloader_drop_last else math.ceil
+ steps = approx(num_examples / self.args.eval_batch_size)
ds = (
- eval_dataset.cache()
+ eval_dataset.repeat()
.batch(self.args.eval_batch_size, drop_remainder=self.args.dataloader_drop_last)
.prefetch(tf.data.experimental.AUTOTUNE)
)
- return self.args.strategy.experimental_distribute_dataset(ds)
+ return self.args.strategy.experimental_distribute_dataset(ds), steps, num_examples
def get_test_tfdataset(self, test_dataset: tf.data.Dataset) -> tf.data.Dataset:
"""
@@ -151,11 +169,23 @@ def get_test_tfdataset(self, test_dataset: tf.data.Dataset) -> tf.data.Dataset:
Args:
test_dataset (:class:`~tf.data.Dataset`): The dataset to use.
"""
- ds = test_dataset.batch(self.args.eval_batch_size, drop_remainder=self.args.dataloader_drop_last)
- return self.args.strategy.experimental_distribute_dataset(ds)
+ num_examples = tf.data.experimental.cardinality(test_dataset).numpy()
- def get_optimizers(
+ if num_examples < 0:
+ raise ValueError("The training dataset must have an asserted cardinality")
+
+ approx = math.floor if self.args.dataloader_drop_last else math.ceil
+ steps = approx(num_examples / self.args.eval_batch_size)
+ ds = (
+ test_dataset.repeat()
+ .batch(self.args.eval_batch_size, drop_remainder=self.args.dataloader_drop_last)
+ .prefetch(tf.data.experimental.AUTOTUNE)
+ )
+
+ return self.args.strategy.experimental_distribute_dataset(ds), steps, num_examples
+
+ def create_optimizer_and_scheduler(
self, num_training_steps: int,
) -> Tuple[tf.keras.optimizers.Optimizer, tf.keras.optimizers.schedules.LearningRateSchedule]:
"""
@@ -164,20 +194,16 @@ def get_optimizers(
We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
TFTrainer's init through :obj:`optimizers`, or override this method in a subclass.
"""
- if self.optimizers is not None:
- return self.optimizers
-
- optimizer, scheduler = create_optimizer(
- self.args.learning_rate,
- num_training_steps,
- self.args.warmup_steps,
- adam_beta1=self.args.adam_beta1,
- adam_beta2=self.args.adam_beta2,
- adam_epsilon=self.args.adam_epsilon,
- weight_decay_rate=self.args.weight_decay,
- )
-
- return optimizer, scheduler
+ if not self.optimizer and not self.lr_scheduler:
+ self.optimizer, self.lr_scheduler = create_optimizer(
+ self.args.learning_rate,
+ num_training_steps,
+ self.args.warmup_steps,
+ adam_beta1=self.args.adam_beta1,
+ adam_beta2=self.args.adam_beta2,
+ adam_epsilon=self.args.adam_epsilon,
+ weight_decay_rate=self.args.weight_decay,
+ )
def _setup_wandb(self):
"""
@@ -195,29 +221,13 @@ def _setup_wandb(self):
logger.info('Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"')
wandb.init(project=os.getenv("WANDB_PROJECT", "huggingface"), config=vars(self.args))
- @tf.function
- def _evaluate_steps(self, per_replica_features, per_replica_labels):
- """
- One step evaluation across replica.
- Args:
- per_replica_features: the batched features.
- per_replica_labels: the batched labels.
- Returns:
- The loss corresponding to the given batch.
- """
- per_replica_loss, per_replica_logits = self.args.strategy.experimental_run_v2(
- self._run_model, args=(per_replica_features, per_replica_labels, False)
- )
-
- try:
- reduced_loss = self.args.strategy.reduce(tf.distribute.ReduceOp.MEAN, per_replica_loss, axis=0)
- except ValueError:
- reduced_loss = self.args.strategy.reduce(tf.distribute.ReduceOp.MEAN, per_replica_loss, None)
-
- return reduced_loss, per_replica_logits
-
def _prediction_loop(
- self, dataset: tf.data.Dataset, description: str, prediction_loss_only: Optional[bool] = None
+ self,
+ dataset: tf.data.Dataset,
+ steps: int,
+ num_examples: int,
+ description: str,
+ prediction_loss_only: Optional[bool] = None,
) -> PredictionOutput:
"""
Prediction/evaluation loop, shared by `evaluate()` and `predict()`.
@@ -228,21 +238,20 @@ def _prediction_loop(
prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else self.prediction_loss_only
logger.info("***** Running %s *****", description)
+ logger.info(" Num examples = %d", num_examples)
logger.info(" Batch size = %d", self.args.eval_batch_size)
label_ids: np.ndarray = None
preds: np.ndarray = None
-
- step: int = 1
+ self.eval_loss = tf.keras.metrics.Sum()
# Reset the past mems state at the beginning of the evaluation if necessary.
if self.args.past_index >= 0:
self._past = None
- for features, labels in dataset:
- step = tf.convert_to_tensor(step, dtype=tf.int64)
- loss, logits = self._evaluate_steps(features, labels)
- loss = tf.reduce_mean(loss)
+ for step, batch in enumerate(dataset):
+ logits = self.distributed_test_steps(batch)
+ _, labels = batch
if not prediction_loss_only:
if isinstance(logits, tuple):
@@ -274,14 +283,15 @@ def _prediction_loop(
else:
label_ids = np.append(label_ids, labels.numpy(), axis=0)
- step += 1
+ if step == steps:
+ break
if self.compute_metrics is not None and preds is not None and label_ids is not None:
metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids))
else:
metrics = {}
- metrics["eval_loss"] = loss.numpy()
+ metrics["eval_loss"] = self.eval_loss.result().numpy() / (steps * self.args.eval_batch_size)
for key in list(metrics.keys()):
if not key.startswith("eval_"):
@@ -322,9 +332,9 @@ def evaluate(self, eval_dataset: Optional[tf.data.Dataset] = None) -> Dict[str,
Returns:
A dictionary containing the evaluation loss and the potential metrics computed from the predictions.
"""
- eval_ds = self.get_eval_tfdataset(eval_dataset)
+ eval_ds, steps, num_examples = self.get_eval_tfdataset(eval_dataset)
- output = self._prediction_loop(eval_ds, description="Evaluation")
+ output = self._prediction_loop(eval_ds, steps, num_examples, description="Evaluation")
logs = {**output.metrics}
logs["epoch"] = self.epoch_logging
@@ -333,6 +343,19 @@ def evaluate(self, eval_dataset: Optional[tf.data.Dataset] = None) -> Dict[str,
return output.metrics
+ def test_step(self, features, labels):
+ per_example_loss, logits = self._run_model(features, labels, False)
+
+ self.eval_loss.update_state(per_example_loss)
+
+ return logits
+
+ @tf.function
+ def distributed_test_steps(self, batch):
+ logits = self.args.strategy.run(self.test_step, batch)
+
+ return logits
+
def train(self) -> None:
"""
Train method to train the model.
@@ -346,24 +369,18 @@ def train(self) -> None:
if self.args.max_steps > 0:
t_total = self.args.max_steps
- steps_per_epoch = self.args.max_steps
+ self.steps_per_epoch = self.args.max_steps
else:
- if self.args.dataloader_drop_last:
- approx = math.floor
- else:
- approx = math.ceil
-
- steps_per_epoch = approx(
- self.num_train_examples / (self.args.train_batch_size * self.args.gradient_accumulation_steps)
- )
- t_total = steps_per_epoch * self.args.num_train_epochs
+ approx = math.floor if self.args.dataloader_drop_last else math.ceil
+ self.steps_per_epoch = approx(self.num_train_examples / self.total_train_batch_size)
+ t_total = self.steps_per_epoch * self.args.num_train_epochs
with self.args.strategy.scope():
- optimizer, lr_scheduler = self.get_optimizers(num_training_steps=t_total)
- iterations = optimizer.iterations
+ self.create_optimizer_and_scheduler(num_training_steps=t_total)
+ iterations = self.optimizer.iterations
self.global_step = iterations.numpy()
folder = os.path.join(self.args.output_dir, PREFIX_CHECKPOINT_DIR)
- ckpt = tf.train.Checkpoint(optimizer=optimizer, model=self.model)
+ ckpt = tf.train.Checkpoint(optimizer=self.optimizer, model=self.model)
self.model.ckpt_manager = tf.train.CheckpointManager(ckpt, folder, max_to_keep=self.args.save_total_limit)
if self.model.ckpt_manager.latest_checkpoint:
@@ -384,141 +401,138 @@ def train(self) -> None:
else:
epochs_trained = 1
- tf.summary.experimental.set_step(iterations)
+ tf.summary.experimental.set_step(iterations)
- epochs = 1 if self.args.max_steps > 0 else self.args.num_train_epochs
+ epochs = 1 if self.args.max_steps > 0 else self.args.num_train_epochs
- if self.args.fp16:
- policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16")
- tf.keras.mixed_precision.experimental.set_policy(policy)
+ if self.args.fp16:
+ policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16")
+ tf.keras.mixed_precision.experimental.set_policy(policy)
- with self.tb_writer.as_default():
- tf.summary.text("args", self.args.to_json_string())
+ with self.tb_writer.as_default():
+ tf.summary.text("args", self.args.to_json_string())
- self.tb_writer.flush()
+ self.tb_writer.flush()
- logger.info("***** Running training *****")
- logger.info(" Num examples = %d", self.num_train_examples)
- logger.info(" Num Epochs = %d", epochs)
- logger.info(" Instantaneous batch size per device = %d", self.args.per_device_train_batch_size)
- logger.info(
- " Total train batch size (w. parallel, distributed & accumulation) = %d", self.args.train_batch_size
- )
- logger.info(" Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps)
- logger.info(" Total optimization steps = %d", t_total)
-
- for epoch_iter in range(epochs_trained, int(epochs + 1)):
- # Reset the past mems state at the beginning of each epoch if necessary.
- if self.args.past_index >= 0:
- self._past = None
- for step, training_loss in enumerate(self._training_steps(train_ds, optimizer)):
- self.global_step = iterations.numpy()
- self.epoch_logging = epoch_iter - 1 + (step + 1) / steps_per_epoch
-
- if self.args.debug:
- logs = {}
- logs["loss"] = training_loss.numpy()
- logs["epoch"] = self.epoch_logging
-
- self._log(logs)
-
- if self.global_step == 1 and self.args.debug:
- with self.tb_writer.as_default():
- tf.summary.trace_export(
- name="training", step=self.global_step, profiler_outdir=self.args.logging_dir
- )
-
- if self.args.evaluate_during_training and self.global_step % self.args.eval_steps == 0:
- self.evaluate()
-
- if (
- self.global_step % self.args.logging_steps == 0
- or self.global_step == 1
- and self.args.logging_first_step
- ):
- logs = {}
- logs["loss"] = training_loss.numpy()
- logs["learning_rate"] = lr_scheduler(self.global_step).numpy()
- logs["epoch"] = self.epoch_logging
-
- self._log(logs)
-
- if self.global_step % self.args.save_steps == 0:
- ckpt_save_path = self.model.ckpt_manager.save()
- logger.info("Saving checkpoint for step {} at {}".format(self.global_step, ckpt_save_path))
-
- if self.args.max_steps > 0 and self.global_step % self.args.max_steps == 0:
- break
+ logger.info("***** Running training *****")
+ logger.info(" Num examples = %d", self.num_train_examples)
+ logger.info(" Num Epochs = %d", epochs)
+ logger.info(" Instantaneous batch size per device = %d", self.args.per_device_train_batch_size)
+ logger.info(
+ " Total train batch size (w. parallel, distributed & accumulation) = %d", self.total_train_batch_size
+ )
+ logger.info(" Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps)
+ logger.info(" Steps per epoch = %d", self.steps_per_epoch)
+ logger.info(" Total optimization steps = %d", t_total)
- if self.args.past_index and hasattr(self, "_past"):
- # Clean the state at the end of training
- delattr(self, "_past")
+ self.train_loss = tf.keras.metrics.Sum()
+ start_time = datetime.datetime.now()
- def _training_steps(self, ds, optimizer):
- """
- Returns a generator over training steps (i.e. parameters update).
- """
- for i, loss in enumerate(self._accumulate_next_gradients(ds)):
- if i % self.args.gradient_accumulation_steps == 0:
- self._apply_gradients(optimizer)
- yield loss
+ for epoch_iter in range(epochs_trained, int(epochs + 1)):
+ # Reset the past mems state at the beginning of each epoch if necessary.
+ if self.args.past_index >= 0:
+ self._past = None
- @tf.function
- def _apply_gradients(self, optimizer):
- """Applies the gradients (cross-replica)."""
- self.args.strategy.experimental_run_v2(self._step, args=(optimizer,))
+ for step, batch in enumerate(train_ds):
+ self.global_step = iterations.numpy()
+ self.epoch_logging = epoch_iter - 1 + (step + 1) / self.steps_per_epoch
- def _step(self, optimizer):
- """Applies gradients and resets accumulation."""
- gradient_scale = self.gradient_accumulator.step * self.args.strategy.num_replicas_in_sync
- gradients = [
- gradient / tf.cast(gradient_scale, gradient.dtype) for gradient in self.gradient_accumulator.gradients
- ]
- gradients = [(tf.clip_by_value(grad, -self.args.max_grad_norm, self.args.max_grad_norm)) for grad in gradients]
+ self.distributed_training_steps(batch)
- optimizer.apply_gradients(list(zip(gradients, self.model.trainable_variables)))
- self.gradient_accumulator.reset()
+ training_loss = self.train_loss.result() / ((step + 1) * self.total_train_batch_size)
- def _accumulate_next_gradients(self, ds):
- """Accumulates the gradients from the next element in dataset."""
- iterator = iter(ds)
+ if self.args.debug:
+ logs = {}
+ logs["loss"] = training_loss.numpy()
+ logs["epoch"] = self.epoch_logging
- @tf.function
- def _accumulate_next():
- per_replica_features, per_replica_labels = next(iterator)
+ self._log(logs)
- return self._accumulate_gradients(per_replica_features, per_replica_labels)
+ if self.global_step == 1 and self.args.debug:
+ with self.tb_writer.as_default():
+ tf.summary.trace_export(
+ name="training", step=self.global_step, profiler_outdir=self.args.logging_dir
+ )
- while True:
- try:
- yield _accumulate_next()
- except tf.errors.OutOfRangeError:
- break
+ if (
+ self.global_step > 0
+ and self.args.evaluate_during_training
+ and self.global_step % self.args.eval_steps == 0
+ ):
+ self.evaluate()
- def _accumulate_gradients(self, per_replica_features, per_replica_labels):
- """Accumulates the gradients across all the replica."""
- per_replica_loss = self.args.strategy.experimental_run_v2(
- self._forward, args=(per_replica_features, per_replica_labels)
- )
+ if (self.global_step > 0 and self.global_step % self.args.logging_steps == 0) or (
+ self.global_step == 1 and self.args.logging_first_step
+ ):
+ logs = {}
+ logs["loss"] = training_loss.numpy()
+ logs["learning_rate"] = self.lr_scheduler(self.global_step).numpy()
+ logs["epoch"] = self.epoch_logging
+
+ self._log(logs)
+
+ if self.global_step > 0 and self.global_step % self.args.save_steps == 0:
+ ckpt_save_path = self.model.ckpt_manager.save()
+
+ logger.info("Saving checkpoint for step {} at {}".format(self.global_step, ckpt_save_path))
+
+ if self.global_step > 0 and self.global_step % self.steps_per_epoch == 0:
+ break
- try:
- reduced_loss = self.args.strategy.reduce(tf.distribute.ReduceOp.MEAN, per_replica_loss, axis=0)
- except ValueError:
- reduced_loss = self.args.strategy.reduce(tf.distribute.ReduceOp.MEAN, per_replica_loss, None)
+ self.train_loss.reset_states()
- return reduced_loss
+ end_time = datetime.datetime.now()
- def _forward(self, features, labels):
- """Forwards a training example and accumulates the gradients."""
+ logger.info("Training took: {}".format(str(end_time - start_time)))
+
+ if self.args.past_index and hasattr(self, "_past"):
+ # Clean the state at the end of training
+ delattr(self, "_past")
+
+ def training_step(self, features, labels):
per_example_loss, _ = self._run_model(features, labels, True)
- gradients = tf.gradients(per_example_loss, self.model.trainable_variables)
+ scaled_loss = per_example_loss / self.total_train_batch_size
+ gradients = tf.gradients(scaled_loss, self.model.trainable_variables)
gradients = [
g if g is not None else tf.zeros_like(v) for g, v in zip(gradients, self.model.trainable_variables)
]
- self.gradient_accumulator(gradients)
+ if self.args.gradient_accumulation_steps > 1:
+ self.gradient_accumulator(gradients)
+
+ self.train_loss.update_state(per_example_loss)
+
+ if self.args.gradient_accumulation_steps == 1:
+ return gradients
+
+ def apply_gradients(self, features, labels):
+ if self.args.gradient_accumulation_steps == 1:
+ gradients = self.training_step(features, labels)
+
+ self.optimizer.apply_gradients(list(zip(gradients, self.model.trainable_variables)))
+ else:
+ for _ in tf.range(self.args.gradient_accumulation_steps):
+ reduced_features = features[: self.args.train_batch_size / self.args.n_replicas]
+ reduced_labels = labels[: self.args.train_batch_size / self.args.n_replicas]
+
+ self.training_step(reduced_features, reduced_labels)
+
+ features = tf.concat(
+ [features[self.args.train_batch_size / self.args.n_replicas :], reduced_features], axis=0
+ )
+
+ gradients = self.gradient_accumulator.gradients
+ gradients = [
+ (tf.clip_by_value(grad, -self.args.max_grad_norm, self.args.max_grad_norm)) for grad in gradients
+ ]
+
+ self.optimizer.apply_gradients(list(zip(gradients, self.model.trainable_variables)))
+ self.gradient_accumulator.reset()
- return per_example_loss
+ @tf.function
+ def distributed_training_steps(self, batch):
+ with self.args.strategy.scope():
+ self.args.strategy.run(self.apply_gradients, batch)
def _run_model(self, features, labels, training):
"""
@@ -530,14 +544,16 @@ def _run_model(self, features, labels, training):
"""
if self.args.past_index >= 0 and getattr(self, "_past", None) is not None:
features["mems"] = self._past
+
if isinstance(labels, (dict)):
outputs = self.model(features, training=training, **labels)[:2]
else:
outputs = self.model(features, labels=labels, training=training)[:2]
+
loss, logits = outputs[:2]
+
if self.args.past_index >= 0:
self._past = outputs[self.args.past_index]
- loss += sum(self.model.losses) * (1.0 / self.args.n_replicas)
return loss, logits
@@ -560,9 +576,9 @@ def predict(self, test_dataset: tf.data.Dataset) -> PredictionOutput:
metrics (:obj:`Dict[str, float]`, `optional`):
The potential dictionary of metrics (if the dataset contained labels).
"""
- test_ds = self.get_test_tfdataset(test_dataset)
+ test_ds, steps, num_examples = self.get_test_tfdataset(test_dataset)
- return self._prediction_loop(test_ds, description="Prediction")
+ return self._prediction_loop(test_ds, steps, num_examples, description="Prediction")
def save_model(self, output_dir: Optional[str] = None):
"""
diff --git a/src/transformers/training_args_tf.py b/src/transformers/training_args_tf.py
index 1ec50caffc9f..0adf34464507 100644
--- a/src/transformers/training_args_tf.py
+++ b/src/transformers/training_args_tf.py
@@ -162,7 +162,7 @@ def train_batch_size(self) -> int:
"version. Using `--per_device_train_batch_size` is preferred."
)
per_device_batch_size = self.per_gpu_train_batch_size or self.per_device_train_batch_size
- return per_device_batch_size * max(1, self.n_replicas)
+ return per_device_batch_size * self.n_replicas
@property
def eval_batch_size(self) -> int:
@@ -175,7 +175,7 @@ def eval_batch_size(self) -> int:
"version. Using `--per_device_eval_batch_size` is preferred."
)
per_device_batch_size = self.per_gpu_eval_batch_size or self.per_device_eval_batch_size
- return per_device_batch_size * max(1, self.n_replicas)
+ return per_device_batch_size * self.n_replicas
@property
@tf_required
From 3212b8850d20f5adf6242074193854cf1f808c86 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Wed, 29 Jul 2020 22:09:46 -0700
Subject: [PATCH 057/127] [s2s] add support for overriding config params
(#6149)
---
examples/lightning_base.py | 23 +++++++++++
examples/seq2seq/README.md | 47 ++++++++++++++--------
examples/seq2seq/finetune.sh | 4 ++
examples/seq2seq/test_seq2seq_examples.py | 49 +++++++++++++++++++++++
4 files changed, 106 insertions(+), 17 deletions(-)
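The core of this patch is a small hparams-to-config copy loop in `lightning_base.py` (see the diff below). A self-contained sketch of the idea, using a stand-in config object rather than a real `PretrainedConfig`:

```python
import argparse
from types import SimpleNamespace

parser = argparse.ArgumentParser()
parser.add_argument("--dropout", type=float)
parser.add_argument("--attention_dropout", type=float)
hparams = parser.parse_args(["--dropout", "0.1"])

config = SimpleNamespace(dropout=0.0, attention_dropout=0.0)  # stand-in for the model config

# Copy only the params the user actually passed; fail loudly if the config lacks the field.
# (Note: the truthiness check also skips an explicit 0.0, mirroring the patch.)
for p in ("dropout", "attention_dropout"):
    if getattr(hparams, p, None):
        assert hasattr(config, p), f"model config doesn't have a `{p}` attribute"
        setattr(config, p, getattr(hparams, p))

assert config.dropout == 0.1            # overridden from the CLI
assert config.attention_dropout == 0.0  # flag not passed, so the pretrained value is kept
```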
diff --git a/examples/lightning_base.py b/examples/lightning_base.py
index 1124f57662b6..754538e79279 100644
--- a/examples/lightning_base.py
+++ b/examples/lightning_base.py
@@ -70,6 +70,13 @@ def __init__(
)
else:
self.config: PretrainedConfig = config
+
+ extra_model_params = ("encoder_layerdrop", "decoder_layerdrop", "dropout", "attention_dropout")
+ for p in extra_model_params:
+ if getattr(self.hparams, p, None):
+ assert hasattr(self.config, p), f"model config doesn't have a `{p}` attribute"
+ setattr(self.config, p, getattr(self.hparams, p))
+
if tokenizer is None:
self.tokenizer = AutoTokenizer.from_pretrained(
self.hparams.tokenizer_name if self.hparams.tokenizer_name else self.hparams.model_name_or_path,
@@ -182,6 +189,22 @@ def add_model_specific_args(parser, root_dir):
type=str,
help="Where do you want to store the pre-trained models downloaded from s3",
)
+ parser.add_argument(
+ "--encoder_layerdrop",
+ type=float,
+ help="Encoder layer dropout probability (Optional). Goes into model.config",
+ )
+ parser.add_argument(
+ "--decoder_layerdrop",
+ type=float,
+ help="Decoder layer dropout probability (Optional). Goes into model.config",
+ )
+ parser.add_argument(
+ "--dropout", type=float, help="Dropout probability (Optional). Goes into model.config",
+ )
+ parser.add_argument(
+ "--attention_dropout", type=float, help="Attention dropout probability (Optional). Goes into model.config",
+ )
parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
diff --git a/examples/seq2seq/README.md b/examples/seq2seq/README.md
index 033d200c0994..58ba4fce118f 100644
--- a/examples/seq2seq/README.md
+++ b/examples/seq2seq/README.md
@@ -3,7 +3,7 @@
This directory contains examples for finetuning and evaluating transformers on summarization and translation tasks.
Summarization support is more mature than translation support.
Please tag @sshleifer with any issues/unexpected behaviors, or send a PR!
-For `bertabs` instructions, see [`bertabs/README.md`](bertabs/README.md).
+For `bertabs` instructions, see [`bertabs/README.md`](bertabs/README.md).
### Data
@@ -35,23 +35,23 @@ export ENRO_DIR=${PWD}/wmt_en_ro
this should make a directory called `wmt_en_ro/` with files like `test.source`.
```
-If you are using your own data, it must be formatted as one directory with 6 files: train.source, train.target, val.source, val.target, test.source, test.target.
+If you are using your own data, it must be formatted as one directory with 6 files: train.source, train.target, val.source, val.target, test.source, test.target.
The `.source` files are the input, the `.target` files are the desired output.
-
+
### Tips and Tricks
General Tips:
-- since you need to run from `examples/seq2seq`, and likely need to modify code, the easiest workflow is fork transformers, clone your fork, and run `pip install -e .` before you get started.
+- since you need to run from `examples/seq2seq`, and likely need to modify code, the easiest workflow is to fork transformers, clone your fork, and run `pip install -e .` before you get started.
- try `--freeze_encoder` or `--freeze_embeds` for faster training/larger batch size. (3hr per epoch with bs=8, see the "xsum_shared_task" command below)
- `fp16_opt_level=O1` (the default works best).
- In addition to the pytorch-lightning .ckpt checkpoint, a transformers checkpoint will be saved.
Load it with `BartForConditionalGeneration.from_pretrained(f'{output_dir}/best_tfmr)`.
- At the moment, `--do_predict` does not work in a multi-gpu setting. You need to use `evaluate_checkpoint` or the `run_eval.py` code.
-- This warning can be safely ignored:
+- This warning can be safely ignored:
> "Some weights of BartForConditionalGeneration were not initialized from the model checkpoint at facebook/bart-large-xsum and are newly initialized: ['final_logits_bias']"
- Both finetuning and eval are 30% faster with `--fp16`. For that you need to [install apex](https://github.com/NVIDIA/apex#quick-start).
-- Read scripts before you run them!
+- Read scripts before you run them!
Summarization Tips:
- (summ) 1 epoch at batch size 1 for bart-large takes 24 hours and requires 13GB GPU RAM with fp16 on an NVIDIA-V100.
@@ -60,12 +60,25 @@ Summarization Tips:
- `--max_target_length=60 --val_max_target_length=60 --test_max_target_length=100 ` is a reasonable setting for XSUM.
- `wandb` can be used by specifying `--logger_name wandb`. It is useful for reproducibility. Specify the environment variable `WANDB_PROJECT='hf_xsum'` to do the XSUM shared task.
- If you are finetuning on your own dataset, start from `distilbart-cnn-12-6` if you want long summaries and `distilbart-xsum-12-6` if you want short summaries.
-(It rarely makes sense to start from `bart-large` unless you are researching finetuning methods).
+(It rarely makes sense to start from `bart-large` unless you are researching finetuning methods).
**Update 2020-07-18**
Datasets: Seq2SeqDataset will be used for all models besides MBart, for which MBartDataset will be used.
A new dataset is needed to support multilingual tasks.
+### Finetuning Training Params
+
+To override the pretrained model's training params, you can pass them to `./finetune.sh`:
+
+```bash
+./finetune.sh \
+ [...]
+ --encoder_layerdrop 0.1 \
+ --decoder_layerdrop 0.1 \
+ --dropout 0.1 \
+ --attention_dropout 0.1 \
+```
+
### Summarization Finetuning
Run/modify `finetune.sh`
@@ -90,7 +103,7 @@ Best performing command:
```bash
# optionally
export ENRO_DIR='wmt_en_ro' # Download instructions above
-# export WANDB_PROJECT="MT" # optional
+# export WANDB_PROJECT="MT" # optional
export MAX_LEN=200
export BS=4
export GAS=8 # gradient accumulation steps
@@ -109,8 +122,8 @@ export BS=4
export GAS=1 # gradient accumulation steps
./train_mbart_cc25_enro.sh --output_dir enro_finetune_baseline --gpus 8 --logger_name wandb
```
-### Finetuning Outputs
-As you train, `output_dir` will be filled with files, that look kind of like this (comments are mine).
+### Finetuning Outputs
+As you train, `output_dir` will be filled with files, that look kind of like this (comments are mine).
Some of them are metrics, some of them are checkpoints, some of them are metadata. Here is a quick tour:
```bash
@@ -128,8 +141,8 @@ output_dir
├── student # this is a huggingface checkpoint generated by SummarizationDistiller. It is the student before it gets finetuned.
│  ├── config.json
│  └── pytorch_model.bin
-├── test_generations.txt
-# ^^ are the summaries or translations produced by your best checkpoint on the test data. Populated when training is done
+├── test_generations.txt
+# ^^ are the summaries or translations produced by your best checkpoint on the test data. Populated when training is done
├── test_results.txt # a convenience file with the test set metrics. This data is also in metrics.json['test']
├── hparams.pkl # the command line args passed after some light preprocessing. Should be saved fairly quickly.
```
@@ -191,7 +204,7 @@ python run_eval.py sshleifer/distilbart-cnn-12-6 $DATA_DIR/val.source dbart_val_

For the CNN/DailyMail dataset, (relatively longer, more extractive summaries), we found a simple technique that works:
-you just copy alternating layers from `bart-large-cnn` and finetune more on the same data.
+you just copy alternating layers from `bart-large-cnn` and finetune more on the same data.
For the XSUM dataset, that didn’t work as well so we used that same initialization strategy followed by a combination of Distillbert’s ce_loss and the hidden states MSE loss used in the tinybert paper.
@@ -207,7 +220,7 @@ They are initialized by copying layers from the associated `bart-large-{cnn|xsum
The command that produced `sshleifer/distilbart-cnn-12-6` is
```bash
./train_distilbart_cnn.sh
-```
+```
runtime: 6H on NVIDIA RTX 24GB GPU
*Note*: You can get the same simple distillation logic by using `./run_distiller.sh --no_teacher` followed by identical arguments as the ones in `train_distilbart_cnn.sh`.
@@ -223,15 +236,15 @@ This is how `sshleifer/distilbart-xsum*` checkpoints were produced.
The command that produced `sshleifer/distilbart-xsum-12-6` is:
```bash
-./train_distilbart_xsum.sh
+./train_distilbart_xsum.sh
```
-runtime: 13H on V-100 16GB GPU.
+runtime: 13H on V-100 16GB GPU.
### Contributing
- follow the standard contributing guidelines and code of conduct.
- add tests to `test_seq2seq_examples.py`
- To run only the seq2seq tests, you must be in the root of the repository and run:
```bash
-pytest examples/seq2seq/
+pytest examples/seq2seq/
```
diff --git a/examples/seq2seq/finetune.sh b/examples/seq2seq/finetune.sh
index 78de9a3f74cf..fa7e9bf63cb1 100755
--- a/examples/seq2seq/finetune.sh
+++ b/examples/seq2seq/finetune.sh
@@ -10,4 +10,8 @@ python finetune.py \
--do_predict \
--n_val 1000 \
--val_check_interval 0.1 \
+ --encoder_layerdrop 0.1 \
+ --decoder_layerdrop 0.1 \
+ --dropout 0.1 \
+ --attention_dropout 0.1 \
$@
diff --git a/examples/seq2seq/test_seq2seq_examples.py b/examples/seq2seq/test_seq2seq_examples.py
index 44e3d6c703dc..246881ce07ab 100644
--- a/examples/seq2seq/test_seq2seq_examples.py
+++ b/examples/seq2seq/test_seq2seq_examples.py
@@ -277,6 +277,55 @@ def test_finetune(model):
assert bart.decoder.embed_tokens == bart.shared
+def test_finetune_extra_model_args():
+ args_d: dict = CHEAP_ARGS.copy()
+
+ task = "summarization"
+ tmp_dir = make_test_data_dir()
+
+ args_d.update(
+ data_dir=tmp_dir,
+ tokenizer_name=None,
+ train_batch_size=2,
+ eval_batch_size=2,
+ do_predict=False,
+ task=task,
+ src_lang="en_XX",
+ tgt_lang="ro_RO",
+ freeze_encoder=True,
+ freeze_embeds=True,
+ )
+
+ # test models whose config includes the extra_model_args
+ model = BART_TINY
+ output_dir = tempfile.mkdtemp(prefix="output_1_")
+ args_d1 = args_d.copy()
+ args_d1.update(
+ model_name_or_path=model, output_dir=output_dir,
+ )
+ extra_model_params = ("encoder_layerdrop", "decoder_layerdrop", "dropout", "attention_dropout")
+ for p in extra_model_params:
+ args_d1[p] = 0.5
+ args = argparse.Namespace(**args_d1)
+ model = main(args)
+ for p in extra_model_params:
+ assert getattr(model.config, p) == 0.5, f"failed to override the model config for param {p}"
+
+ # test models whose config doesn't include the extra_model_args
+ model = T5_TINY
+ output_dir = tempfile.mkdtemp(prefix="output_2_")
+ args_d2 = args_d.copy()
+ args_d2.update(
+ model_name_or_path=model, output_dir=output_dir,
+ )
+ unsupported_param = "encoder_layerdrop"
+ args_d2[unsupported_param] = 0.5
+ args = argparse.Namespace(**args_d2)
+ with pytest.raises(Exception) as excinfo:
+ model = main(args)
+ assert str(excinfo.value) == f"model config doesn't have a `{unsupported_param}` attribute"
+
+
def test_pack_dataset():
tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-cc25")
From d24ea708d742263efe4f4b8d525402f2d916c96c Mon Sep 17 00:00:00 2001
From: Oren Amsalem
Date: Thu, 30 Jul 2020 13:13:29 +0300
Subject: [PATCH 058/127] Actually the extra_ids are from 0-99 and not from
1-100 (#5967)
a = tokenizer.encode("we got a ", return_tensors='pt',add_special_tokens=True)
print(a)
>tensor([[ 62, 530, 3, 9, 32000]])
a = tokenizer.encode("we got a ", return_tensors='pt',add_special_tokens=True)
print(a)
>tensor([[ 62, 530, 3, 9, 3, 2, 25666, 834, 23, 26,
834, 2915, 3155]])
---
docs/source/model_doc/t5.rst | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/docs/source/model_doc/t5.rst b/docs/source/model_doc/t5.rst
index 2e7bd285f072..f7451300c860 100644
--- a/docs/source/model_doc/t5.rst
+++ b/docs/source/model_doc/t5.rst
@@ -38,13 +38,13 @@ T5 can be trained / fine-tuned both in a supervised and unsupervised fashion.
In this setup spans of the input sequence are masked by so-called sentinel tokens (*a.k.a* unique mask tokens)
and the output sequence is formed as a concatenation of the same sentinel tokens and the *real* masked tokens.
- Each sentinel token represents a unique mask token for this sentence and should start with ``<extra_id_1>``, ``<extra_id_2>``, ... up to ``<extra_id_100>``. As a default 100 sentinel tokens are available in ``T5Tokenizer``.
+ Each sentinel token represents a unique mask token for this sentence and should start with ``<extra_id_0>``, ``<extra_id_1>``, ... up to ``<extra_id_99>``. As a default 100 sentinel tokens are available in ``T5Tokenizer``.
*E.g.* the sentence "The cute dog walks in the park" with the masks put on "cute dog" and "the" should be processed as follows:
::
- input_ids = tokenizer.encode('The <extra_id_1> walks in <extra_id_2> park', return_tensors='pt')
- labels = tokenizer.encode('<extra_id_1> cute dog <extra_id_2> the <extra_id_3>', return_tensors='pt')
+ input_ids = tokenizer.encode('The <extra_id_0> walks in <extra_id_1> park', return_tensors='pt')
+ labels = tokenizer.encode('<extra_id_0> cute dog <extra_id_1> the <extra_id_2>', return_tensors='pt')
# the forward function automatically creates the correct decoder_input_ids
model(input_ids=input_ids, labels=labels)
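A quick way to check the corrected numbering (editor's sketch, not part of the patch; assumes the `t5-small` checkpoint and `sentencepiece` installed):

```python
# <extra_id_0> has the highest id in the vocabulary and <extra_id_99> the lowest of the
# sentinel range; <extra_id_100> is not a special token at all, so it falls back to
# ordinary subword pieces.
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("t5-small")
print(tokenizer.convert_tokens_to_ids("<extra_id_0>"))   # 32099
print(tokenizer.convert_tokens_to_ids("<extra_id_99>"))  # 32000
print(tokenizer.tokenize("<extra_id_100>"))              # split into regular pieces
```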
From c127d055e6d1b1344a33e0c6c84f8cf6816ee2f3 Mon Sep 17 00:00:00 2001
From: Oren Amsalem
Date: Thu, 30 Jul 2020 15:53:35 +0300
Subject: [PATCH 059/127] add another e.g. to avoid confusion (#6055)
---
src/transformers/modeling_tf_pytorch_utils.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/transformers/modeling_tf_pytorch_utils.py b/src/transformers/modeling_tf_pytorch_utils.py
index 5052695bc5e8..566efb233f96 100644
--- a/src/transformers/modeling_tf_pytorch_utils.py
+++ b/src/transformers/modeling_tf_pytorch_utils.py
@@ -203,7 +203,7 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
f"- This IS expected if you are initializing {tf_model.__class__.__name__} from a TF 2.0 model trained on another task "
f"or with another architecture (e.g. initializing a BertForSequenceClassification model from a TFBertForPretraining model).\n"
f"- This IS NOT expected if you are initializing {tf_model.__class__.__name__} from a TF 2.0 model that you expect "
- f"to be exactly identical (initializing a BertForSequenceClassification model from a TFBertForSequenceClassification model)."
+ f"to be exactly identical (e.g. initializing a BertForSequenceClassification model from a TFBertForSequenceClassification model)."
)
else:
logger.warning(f"All PyTorch model weights were used when initializing {tf_model.__class__.__name__}.\n")
@@ -350,7 +350,7 @@ def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=F
f"- This IS expected if you are initializing {pt_model.__class__.__name__} from a TF 2.0 model trained on another task "
f"or with another architecture (e.g. initializing a BertForSequenceClassification model from a TFBertForPretraining model).\n"
f"- This IS NOT expected if you are initializing {pt_model.__class__.__name__} from a TF 2.0 model that you expect "
- f"to be exactly identical (initializing a BertForSequenceClassification model from a TFBertForSequenceClassification model)."
+ f"to be exactly identical (e.g. initializing a BertForSequenceClassification model from a TFBertForSequenceClassification model)."
)
else:
logger.warning(f"All TF 2.0 model weights were used when initializing {pt_model.__class__.__name__}.\n")
From 562b6369c4a009c748ca86ce05d6ae5b7670e705 Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Date: Thu, 30 Jul 2020 09:13:16 -0400
Subject: [PATCH 060/127] Tf trainer cleanup (#6143)
* Clean up TFTrainer
* Add import
* Fix conflicts
---
src/transformers/trainer_tf.py | 92 +++++++++++++++++++++++++---------
1 file changed, 69 insertions(+), 23 deletions(-)
diff --git a/src/transformers/trainer_tf.py b/src/transformers/trainer_tf.py
index c9c06edfbf57..808582e819e7 100644
--- a/src/transformers/trainer_tf.py
+++ b/src/transformers/trainer_tf.py
@@ -5,6 +5,7 @@
import math
import os
import sys
+import warnings
from typing import Callable, Dict, Optional, Tuple
import numpy as np
@@ -104,7 +105,7 @@ def __init__(
self.tb_writer = tf.summary.create_file_writer(self.args.logging_dir)
if is_wandb_available():
- self._setup_wandb()
+ self.setup_wandb()
elif os.environ.get("WANDB_DISABLED") != "true":
logger.info(
"You are instantiating a Trainer but W&B is not installed. To use wandb logging, "
@@ -116,6 +117,8 @@ def __init__(
def get_train_tfdataset(self) -> tf.data.Dataset:
"""
Returns the training :class:`~tf.data.Dataset`.
+
+ Subclass and override this method if you want to inject some custom behavior.
"""
if self.train_dataset is None:
raise ValueError("Trainer: training requires a train_dataset.")
@@ -142,6 +145,8 @@ def get_eval_tfdataset(self, eval_dataset: Optional[tf.data.Dataset] = None) ->
Args:
eval_dataset (:class:`~tf.data.Dataset`, `optional`):
If provided, will override `self.eval_dataset`.
+
+ Subclass and override this method if you want to inject some custom behavior.
"""
if eval_dataset is None and self.eval_dataset is None:
raise ValueError("Trainer: evaluation requires an eval_dataset.")
@@ -168,6 +173,8 @@ def get_test_tfdataset(self, test_dataset: tf.data.Dataset) -> tf.data.Dataset:
Args:
test_dataset (:class:`~tf.data.Dataset`): The dataset to use.
+
+ Subclass and override this method if you want to inject some custom behavior.
"""
num_examples = tf.data.experimental.cardinality(test_dataset).numpy()
@@ -185,14 +192,12 @@ def get_test_tfdataset(self, test_dataset: tf.data.Dataset) -> tf.data.Dataset:
return self.args.strategy.experimental_distribute_dataset(ds), steps, num_examples
- def create_optimizer_and_scheduler(
- self, num_training_steps: int,
- ) -> Tuple[tf.keras.optimizers.Optimizer, tf.keras.optimizers.schedules.LearningRateSchedule]:
+ def create_optimizer_and_scheduler(self, num_training_steps: int):
"""
Setup the optimizer and the learning rate scheduler.
We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
- TFTrainer's init through :obj:`optimizers`, or override this method in a subclass.
+ TFTrainer's init through :obj:`optimizers`, or subclass and override this method.
"""
if not self.optimizer and not self.lr_scheduler:
self.optimizer, self.lr_scheduler = create_optimizer(
@@ -205,12 +210,12 @@ def create_optimizer_and_scheduler(
weight_decay_rate=self.args.weight_decay,
)
- def _setup_wandb(self):
+ def setup_wandb(self):
"""
Setup the optional Weights & Biases (`wandb`) integration.
- One can override this method to customize the setup if needed. Find more information at https://docs.wandb.com/huggingface
- You can also override the following environment variables:
+ One can subclass and override this method to customize the setup if needed. Find more information
+ `here <https://docs.wandb.com/huggingface>`__. You can also override the following environment variables:
Environment:
WANDB_PROJECT:
@@ -218,10 +223,17 @@ def _setup_wandb(self):
WANDB_DISABLED:
(Optional): boolean - defaults to false, set to "true" to disable wandb entirely
"""
+ if hasattr(self, "_setup_wandb"):
+ warnings.warn(
+ "The `_setup_wandb` method is deprecated and won't be called in a future version, define `setup_wandb` in your subclass.",
+ FutureWarning,
+ )
+ return self._setup_wandb()
+
logger.info('Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"')
wandb.init(project=os.getenv("WANDB_PROJECT", "huggingface"), config=vars(self.args))
- def _prediction_loop(
+ def prediction_loop(
self,
dataset: tf.data.Dataset,
steps: int,
@@ -230,10 +242,19 @@ def _prediction_loop(
prediction_loss_only: Optional[bool] = None,
) -> PredictionOutput:
"""
- Prediction/evaluation loop, shared by `evaluate()` and `predict()`.
+ Prediction/evaluation loop, shared by :func:`~transformers.TFTrainer.evaluate` and
+ :func:`~transformers.TFTrainer.predict`.
Works both with or without labels.
"""
+ if hasattr(self, "_prediction_loop"):
+ warnings.warn(
+ "The `_prediction_loop` method is deprecated and won't be called in a future version, define `prediction_loop` in your subclass.",
+ FutureWarning,
+ )
+ return self._prediction_loop(
+ dataset, steps, num_examples, description, prediction_loss_only=prediction_loss_only
+ )
prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else self.prediction_loss_only
@@ -250,7 +271,7 @@ def _prediction_loop(
self._past = None
for step, batch in enumerate(dataset):
- logits = self.distributed_test_steps(batch)
+ logits = self.distributed_prediction_steps(batch)
_, labels = batch
if not prediction_loss_only:
@@ -303,7 +324,13 @@ def _prediction_loop(
return PredictionOutput(predictions=preds, label_ids=label_ids, metrics=metrics)
- def _log(self, logs: Dict[str, float]) -> None:
+ def log(self, logs: Dict[str, float]) -> None:
+ if hasattr(self, "_log"):
+ warnings.warn(
+ "The `_log` method is deprecated and won't be called in a future version, define `log` in your subclass.",
+ FutureWarning,
+ )
+ return self._log(logs)
logs["epoch"] = self.epoch_logging
if self.tb_writer:
@@ -335,24 +362,28 @@ def evaluate(self, eval_dataset: Optional[tf.data.Dataset] = None) -> Dict[str,
eval_ds, steps, num_examples = self.get_eval_tfdataset(eval_dataset)
output = self._prediction_loop(eval_ds, steps, num_examples, description="Evaluation")
-
logs = {**output.metrics}
logs["epoch"] = self.epoch_logging
- self._log(logs)
+ self.log(logs)
return output.metrics
- def test_step(self, features, labels):
- per_example_loss, logits = self._run_model(features, labels, False)
+ def prediction_step(self, features: tf.Tensor, labels: tf.Tensor) -> tf.Tensor:
+ """
+ Compute the prediction on features and update the loss with labels.
+
+ Subclass and override to inject some custom behavior.
+ """
+ per_example_loss, logits = self.run_model(features, labels, False)
self.eval_loss.update_state(per_example_loss)
return logits
@tf.function
- def distributed_test_steps(self, batch):
- logits = self.args.strategy.run(self.test_step, batch)
+ def distributed_prediction_steps(self, batch):
+ logits = self.args.strategy.run(self.prediction_step, batch)
return logits
@@ -446,7 +477,7 @@ def train(self) -> None:
logs["loss"] = training_loss.numpy()
logs["epoch"] = self.epoch_logging
- self._log(logs)
+ self.log(logs)
if self.global_step == 1 and self.args.debug:
with self.tb_writer.as_default():
@@ -469,7 +500,7 @@ def train(self) -> None:
logs["learning_rate"] = self.lr_scheduler(self.global_step).numpy()
logs["epoch"] = self.epoch_logging
- self._log(logs)
+ self.log(logs)
if self.global_step > 0 and self.global_step % self.args.save_steps == 0:
ckpt_save_path = self.model.ckpt_manager.save()
@@ -490,7 +521,12 @@ def train(self) -> None:
delattr(self, "_past")
def training_step(self, features, labels):
- per_example_loss, _ = self._run_model(features, labels, True)
+ """
+ Perform a training step on features and labels.
+
+ Subclass and override to inject some custom behavior.
+ """
+ per_example_loss, _ = self.run_model(features, labels, True)
scaled_loss = per_example_loss / self.total_train_batch_size
gradients = tf.gradients(scaled_loss, self.model.trainable_variables)
gradients = [
@@ -534,14 +570,24 @@ def distributed_training_steps(self, batch):
with self.args.strategy.scope():
self.args.strategy.run(self.apply_gradients, batch)
- def _run_model(self, features, labels, training):
+ def run_model(self, features, labels, training):
"""
Computes the loss of the given features and labels pair.
+
+ Subclass and override this method if you want to inject some custom behavior.
+
Args:
features: the batched features.
labels: the batched labels.
training: run the model in training mode or not
"""
+ if hasattr(self, "_run_model"):
+ warnings.warn(
+ "The `_run_model` method is deprecated and won't be called in a future version, define `run_model` in your subclass.",
+ FutureWarning,
+ )
+ return self._run_model(features, labels, training)
+
if self.args.past_index >= 0 and getattr(self, "_past", None) is not None:
features["mems"] = self._past
@@ -578,7 +624,7 @@ def predict(self, test_dataset: tf.data.Dataset) -> PredictionOutput:
"""
test_ds, steps, num_examples = self.get_test_tfdataset(test_dataset)
- return self._prediction_loop(test_ds, steps, num_examples, description="Prediction")
+ return self.prediction_loop(test_ds, steps, num_examples, description="Prediction")
def save_model(self, output_dir: Optional[str] = None):
"""
From 91cb95461e438dc57555c4f57f8ce95a56328036 Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Date: Thu, 30 Jul 2020 09:17:00 -0400
Subject: [PATCH 061/127] Switch from return_tuple to return_dict (#6138)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
* Switch from return_tuple to return_dict
* Fix test
* [WIP] Test TF Flaubert + Add {XLM, Flaubert}{TokenClassification, MultipleC… (#5614)
* Test TF Flaubert + Add {XLM, Flaubert}{TokenClassification, MultipleChoice} models and tests
* AutoModels
Tiny tweaks
* Style
* Final changes before merge
* Re-order for simpler review
* Final fixes
* Addressing @sgugger's comments
* Test MultipleChoice
* Rework TF trainer (#6038)
* Fully rework training/prediction loops
* fix method name
* Fix variable name
* Fix property name
* Fix scope
* Fix method name
* Fix tuple index
* Fix tuple index
* Fix indentation
* Fix variable name
* fix eval before log
* Add drop remainder for test dataset
* Fix step number + fix logging datetime
* fix eval loss value
* use global step instead of step + fix logging at step 0
* Fix logging datetime
* Fix global_step usage
* Fix breaking loop + logging datetime
* Fix step in prediction loop
* Fix step breaking
* Fix train/test loops
* Force TF at least 2.2 for the trainer
* Use assert_cardinality to facilitate the dataset size computation
* Log steps per epoch
* Make tfds compliant with TPU
* Make tfds compliant with TPU
* Use TF dataset enumerate instead of the Python one
* revert previous commit
* Fix data_dir
* Apply style
* rebase on master
* Address Sylvain's comments
* Address Sylvain's and Lysandre comments
* Trigger CI
* Remove unused import
* Switch from return_tuple to return_dict
* Fix test
* Add recent model
Co-authored-by: Lysandre Debut
Co-authored-by: Julien Plu
---
docs/source/quicktour.rst | 17 ++-
docs/source/training.rst | 2 +-
examples/question-answering/run_squad.py | 5 -
examples/seq2seq/test_seq2seq_examples.py | 2 +-
src/transformers/configuration_utils.py | 15 +--
src/transformers/file_utils.py | 120 ++++++++++++++-----
src/transformers/modeling_albert.py | 73 +++++------
src/transformers/modeling_bart.py | 51 ++++----
src/transformers/modeling_bert.py | 93 +++++++-------
src/transformers/modeling_camembert.py | 6 -
src/transformers/modeling_ctrl.py | 19 +--
src/transformers/modeling_distilbert.py | 57 ++++-----
src/transformers/modeling_dpr.py | 50 ++++----
src/transformers/modeling_electra.py | 61 +++++-----
src/transformers/modeling_encoder_decoder.py | 2 -
src/transformers/modeling_flaubert.py | 11 +-
src/transformers/modeling_gpt2.py | 37 +++---
src/transformers/modeling_longformer.py | 67 ++++++-----
src/transformers/modeling_mmbt.py | 30 +++--
src/transformers/modeling_mobilebert.py | 83 ++++++-------
src/transformers/modeling_openai.py | 37 +++---
src/transformers/modeling_outputs.py | 48 ++++----
src/transformers/modeling_reformer.py | 81 ++++---------
src/transformers/modeling_roberta.py | 45 +++----
src/transformers/modeling_t5.py | 42 +++----
src/transformers/modeling_transfo_xl.py | 27 +++--
src/transformers/modeling_utils.py | 10 +-
src/transformers/modeling_xlm.py | 63 +++++-----
src/transformers/modeling_xlm_roberta.py | 6 -
src/transformers/modeling_xlnet.py | 87 +++++++-------
src/transformers/pipelines.py | 2 +-
src/transformers/trainer.py | 4 +-
templates/adding_a_new_model/modeling_xxx.py | 51 ++++----
tests/test_modeling_common.py | 3 +-
tests/test_modeling_t5.py | 1 -
35 files changed, 675 insertions(+), 633 deletions(-)
diff --git a/docs/source/quicktour.rst b/docs/source/quicktour.rst
index d0ac2e9d8122..d8a143cc0a67 100644
--- a/docs/source/quicktour.rst
+++ b/docs/source/quicktour.rst
@@ -230,19 +230,16 @@ final activations of the model.
>>> ## PYTORCH CODE
>>> print(pt_outputs)
- SequenceClassifierOutput(loss=None, logits=tensor([[-4.0833, 4.3364],
- [ 0.0818, -0.0418]], grad_fn=), hidden_states=None, attentions=None)
+ (tensor([[-4.0833, 4.3364],
+ [ 0.0818, -0.0418]], grad_fn=),)
>>> ## TENSORFLOW CODE
>>> print(tf_outputs)
(,)
-The model can return more than just the final activations, which is why the PyTorch output is a special class and the
-TensorFlow output is a tuple. Here we only asked for the final activations, so we get a tuple with one element on the
-TensorFlow side and a :class:`~transformers.modeling_outputs.SequenceClassifierOutput` with just the ``logits`` field
-filled on the PyTorch side.
-
+The model can return more than just the final activations, which is why the output is a tuple. Here we only asked for
+the final activations, so we get a tuple with one element.
.. note::
All 🤗 Transformers models (PyTorch or TensorFlow) return the activations of the model *before* the final
@@ -254,7 +251,7 @@ Let's apply the SoftMax activation to get predictions.
>>> ## PYTORCH CODE
>>> import torch.nn.functional as F
- >>> pt_predictions = F.softmax(pt_outputs.logits, dim=-1)
+ >>> pt_predictions = F.softmax(pt_outputs[0], dim=-1)
>>> ## TENSORFLOW CODE
>>> import tensorflow as tf
>>> tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
@@ -341,8 +338,8 @@ code is easy to access and tweak if you need to.
In our previous example, the model was called "distilbert-base-uncased-finetuned-sst-2-english", which means it's
using the :doc:`DistilBERT ` architecture. As
-:class:`~transformers.AutoModelForSequenceClassification` (or :class:`~transformers.TFAutoModelForSequenceClassification`
-if you are using TensorFlow)` was used, the model automatically created is then a
+:class:`~transformers.AutoModelForSequenceClassification` (or :class:`~transformers.TFAutoModelForSequenceClassification`
+if you are using TensorFlow) was used, the model automatically created is then a
:class:`~transformers.DistilBertForSequenceClassification`. You can look at its documentation for all details relevant
to that specific model, or browse the source code. This is how you would directly instantiate model and tokenizer
without the auto magic:
diff --git a/docs/source/training.rst b/docs/source/training.rst
index 7ddfcc40fb53..799f96e94afe 100644
--- a/docs/source/training.rst
+++ b/docs/source/training.rst
@@ -49,7 +49,7 @@ put it in train mode.
.. code-block:: python
from transformers import BertForSequenceClassification
- model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
+ model = BertForSequenceClassification.from_pretrained('bert-base-uncased', return_dict=True)
model.train()
This is useful because it allows us to make use of the pre-trained BERT
diff --git a/examples/question-answering/run_squad.py b/examples/question-answering/run_squad.py
index fa11a33ca628..faaffea50191 100644
--- a/examples/question-answering/run_squad.py
+++ b/examples/question-answering/run_squad.py
@@ -199,9 +199,6 @@ def train(args, train_dataset, model, tokenizer):
{"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)}
)
- if isinstance(model, torch.nn.DataParallel):
- inputs["return_tuple"] = True
-
outputs = model(**inputs)
# model outputs are always tuple in transformers (see doc)
loss = outputs[0]
@@ -316,8 +313,6 @@ def evaluate(args, model, tokenizer, prefix=""):
inputs.update(
{"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)}
)
- if isinstance(model, torch.nn.DataParallel):
- inputs["return_tuple"] = True
outputs = model(**inputs)
for i, feature_index in enumerate(feature_indices):
diff --git a/examples/seq2seq/test_seq2seq_examples.py b/examples/seq2seq/test_seq2seq_examples.py
index 246881ce07ab..d12aa0349332 100644
--- a/examples/seq2seq/test_seq2seq_examples.py
+++ b/examples/seq2seq/test_seq2seq_examples.py
@@ -144,7 +144,7 @@ def test_distill_checkpointing_with_teacher(self):
evaluate_checkpoint(ckpts[0], dest_dir=Path(tempfile.mkdtemp()))
def test_loss_fn(self):
- model = AutoModelForSeq2SeqLM.from_pretrained(BART_TINY)
+ model = AutoModelForSeq2SeqLM.from_pretrained(BART_TINY, return_dict=True)
input_ids, mask = model.dummy_inputs["input_ids"], model.dummy_inputs["attention_mask"]
target_ids = torch.tensor([[0, 4, 8, 2], [0, 8, 2, 1]], dtype=torch.long, device=model.device)
decoder_input_ids = target_ids[:, :-1].contiguous() # Why this line?
diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py
index c8dd3572aeba..af31087697c4 100644
--- a/src/transformers/configuration_utils.py
+++ b/src/transformers/configuration_utils.py
@@ -49,8 +49,9 @@ class PretrainedConfig(object):
Whether or not the model should returns all attentions.
use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not the model should return the last key/values attentions (not used by all models).
- return_tuple (:obj:`bool`, `optional`, defaults to :obj:`False`):
- Whether or not the model should return tuples instead of :obj:`ModelOutput` objects.
+ return_dict (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ Whether or not the model should return a :class:`~transformers.file_utils.ModelOutput` instead of a
+ plain tuple.
is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether the model is used as an encoder/decoder or not.
is_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`):
@@ -133,7 +134,7 @@ class PretrainedConfig(object):
def __init__(self, **kwargs):
# Attributes with defaults
- self.return_tuple = kwargs.pop("return_tuple", False)
+ self.return_dict = kwargs.pop("return_dict", False)
self.output_hidden_states = kwargs.pop("output_hidden_states", False)
self.output_attentions = kwargs.pop("output_attentions", False)
self.use_cache = kwargs.pop("use_cache", True) # Not used by all models
@@ -194,12 +195,12 @@ def __init__(self, **kwargs):
raise err
@property
- def use_return_tuple(self) -> bool:
+ def use_return_dict(self) -> bool:
"""
- :obj:`bool`: Whether or not the model should return a tuple.
+ :obj:`bool`: Whether or not return :class:`~transformers.file_utils.ModelOutput` instead of tuples.
"""
- # If torchscript is set, force return_tuple to avoid jit errors
- return self.return_tuple or self.torchscript
+ # If torchscript is set, force `return_dict=False` to avoid jit errors
+ return self.return_dict and not self.torchscript
@property
def num_labels(self) -> int:
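For clarity, a small sketch of the new property's behavior (editor's illustration; `BertConfig` is used only as a convenient concrete config):

```python
from transformers import BertConfig

config = BertConfig(return_dict=True)
assert config.use_return_dict is True

# TorchScript cannot trace dict-like outputs, so tracing forces the tuple path.
config = BertConfig(return_dict=True, torchscript=True)
assert config.use_return_dict is False
```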
diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py
index 5a46fb062467..5bdf1f792d95 100644
--- a/src/transformers/file_utils.py
+++ b/src/transformers/file_utils.py
@@ -13,14 +13,17 @@
import sys
import tarfile
import tempfile
+from collections import OrderedDict
from contextlib import contextmanager
+from dataclasses import fields
from functools import partial, wraps
from hashlib import sha256
from pathlib import Path
-from typing import Dict, Optional, Union
+from typing import Any, Dict, Optional, Tuple, Union
from urllib.parse import urlparse
from zipfile import ZipFile, is_zipfile
+import numpy as np
import requests
from filelock import FileLock
from tqdm.auto import tqdm
@@ -190,8 +193,8 @@ def docstring_decorator(fn):
RETURN_INTRODUCTION = r"""
Returns:
:class:`~{full_output_type}` or :obj:`tuple(torch.FloatTensor)`:
- A :class:`~{full_output_type}` or a tuple of :obj:`torch.FloatTensor` (if ``return_tuple=True`` is passed or
- when ``config.return_tuple=True``) comprising various elements depending on the configuration
+ A :class:`~{full_output_type}` (if ``return_dict=True`` is passed or when ``config.return_dict=True``) or a
+ tuple of :obj:`torch.FloatTensor` comprising various elements depending on the configuration
(:class:`~transformers.{config_class}`) and inputs.
"""
@@ -257,7 +260,7 @@ def _prepare_output_docstrings(output_type, config_class):
>>> import torch
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
- >>> model = {model_class}.from_pretrained('{checkpoint}')
+ >>> model = {model_class}.from_pretrained('{checkpoint}', return_dict=True)
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> labels = torch.tensor([1] * inputs["input_ids"].size(1)).unsqueeze(0) # Batch size 1
@@ -274,7 +277,7 @@ def _prepare_output_docstrings(output_type, config_class):
>>> import torch
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
- >>> model = {model_class}.from_pretrained('{checkpoint}')
+ >>> model = {model_class}.from_pretrained('{checkpoint}', return_dict=True)
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> start_positions = torch.tensor([1])
@@ -293,7 +296,7 @@ def _prepare_output_docstrings(output_type, config_class):
>>> import torch
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
- >>> model = {model_class}.from_pretrained('{checkpoint}')
+ >>> model = {model_class}.from_pretrained('{checkpoint}', return_dict=True)
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
@@ -309,7 +312,7 @@ def _prepare_output_docstrings(output_type, config_class):
>>> import torch
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
- >>> model = {model_class}.from_pretrained('{checkpoint}')
+ >>> model = {model_class}.from_pretrained('{checkpoint}', return_dict=True)
>>> input_ids = tokenizer("Hello, my dog is cute", return_tensors="pt")["input_ids"]
@@ -325,7 +328,7 @@ def _prepare_output_docstrings(output_type, config_class):
>>> import torch
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
- >>> model = {model_class}.from_pretrained('{checkpoint}')
+ >>> model = {model_class}.from_pretrained('{checkpoint}', return_dict=True)
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)
@@ -340,7 +343,7 @@ def _prepare_output_docstrings(output_type, config_class):
>>> import torch
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
- >>> model = {model_class}.from_pretrained('{checkpoint}')
+ >>> model = {model_class}.from_pretrained('{checkpoint}', return_dict=True)
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> choice0 = "It is eaten with a fork and a knife."
@@ -362,7 +365,7 @@ def _prepare_output_docstrings(output_type, config_class):
>>> from transformers import {tokenizer_class}, {model_class}
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
- >>> model = {model_class}.from_pretrained('{checkpoint}')
+ >>> model = {model_class}.from_pretrained('{checkpoint}', return_dict=True)
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs, labels=inputs["input_ids"])
@@ -900,30 +903,91 @@ def wrapper(*args, **kwargs):
return wrapper
-class ModelOutput:
+def is_tensor(x):
+ """ Tests if ``x`` is a :obj:`torch.Tensor`, :obj:`tf.Tensor` or :obj:`np.ndarray`. """
+ if is_torch_available():
+ import torch
+
+ if isinstance(x, torch.Tensor):
+ return True
+ if is_tf_available():
+ import tensorflow as tf
+
+ if isinstance(x, tf.Tensor):
+ return True
+ return isinstance(x, np.ndarray)
+
+
+class ModelOutput(OrderedDict):
"""
Base class for all model outputs as dataclass. Has a ``__getitem__`` that allows indexing by integer or slice (like
- a tuple) or strings (like a dictionary) that will ignore the ``None`` attributes.
+ a tuple) or strings (like a dictionary) that will ignore the ``None`` attributes. Otherwise behaves like a
+ regular python dictionary.
+
+ .. warning::
+ You can't unpack a :obj:`ModelOutput` directly. Use the :meth:`~transformers.file_utils.ModelOutput.to_tuple`
+ method to convert it to a tuple before.
"""
- def to_tuple(self):
- """
- Converts :obj:`self` to a tuple.
+ def __post_init__(self):
+ class_fields = fields(self)
+
+ # Safety and consistency checks
+ assert len(class_fields), f"{self.__class__.__name__} has no fields."
+ assert all(
+ field.default is None for field in class_fields[1:]
+ ), f"{self.__class__.__name__} should not have more than one required field."
+
+ first_field = getattr(self, class_fields[0].name)
+ other_fields_are_none = all(getattr(self, field.name) is None for field in class_fields[1:])
+
+ if other_fields_are_none and not is_tensor(first_field):
+ try:
+ iterator = iter(first_field)
+ first_field_iterator = True
+ except TypeError:
+ first_field_iterator = False
+
+ # if we provided an iterator as first field and the iterator is a (key, value) iterator
+ # set the associated fields
+ if first_field_iterator:
+ for element in iterator:
+ if (
+ not isinstance(element, (list, tuple))
+ or not len(element) == 2
+ or not isinstance(element[0], str)
+ ):
+ break
+ setattr(self, element[0], element[1])
+ if element[1] is not None:
+ self[element[0]] = element[1]
+ else:
+ for field in class_fields:
+ v = getattr(self, field.name)
+ if v is not None:
+ self[field.name] = v
- Return: A tuple containing all non-:obj:`None` attributes of the :obj:`self`.
- """
- return tuple(getattr(self, f) for f in self.__dataclass_fields__.keys() if getattr(self, f, None) is not None)
+ def __delitem__(self, *args, **kwargs):
+ raise Exception(f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance.")
- def to_dict(self):
- """
- Converts :obj:`self` to a Python dictionary.
+ def setdefault(self, *args, **kwargs):
+ raise Exception(f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance.")
- Return: A dictionary containing all non-:obj:`None` attributes of the :obj:`self`.
- """
- return {f: getattr(self, f) for f in self.__dataclass_fields__.keys() if getattr(self, f, None) is not None}
+ def pop(self, *args, **kwargs):
+ raise Exception(f"You cannot use ``pop`` on a {self.__class__.__name__} instance.")
+
+ def update(self, *args, **kwargs):
+ raise Exception(f"You cannot use ``update`` on a {self.__class__.__name__} instance.")
- def __getitem__(self, i):
- return self.to_dict()[i] if isinstance(i, str) else self.to_tuple()[i]
+ def __getitem__(self, k):
+ if isinstance(k, str):
+ inner_dict = {k: v for (k, v) in self.items()}
+ return inner_dict[k]
+ else:
+ return self.to_tuple()[k]
- def __len__(self):
- return len(self.to_tuple())
+ def to_tuple(self) -> Tuple[Any]:
+ """
+ Convert self to a tuple containing all the attributes/keys that are not ``None``.
+ """
+ return tuple(self[k] for k in self.keys())
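Editor's sketch of what the reworked `ModelOutput` buys callers once `return_dict=True` is passed: the same object supports attribute, key and positional access, and `None` fields (here the absent loss) are skipped in the tuple view. It assumes the `bert-base-uncased` checkpoint; the classification head is freshly initialized, so the logits are meaningless, but the access patterns are the point:

```python
import torch
from transformers import BertForSequenceClassification, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", return_dict=True)

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs)  # no labels, so `loss` stays None

assert torch.equal(outputs.logits, outputs["logits"])  # attribute and key access agree
assert torch.equal(outputs.logits, outputs[0])         # positional access skips the None loss
```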
diff --git a/src/transformers/modeling_albert.py b/src/transformers/modeling_albert.py
index 2f52d1f49854..ef96228b5ba1 100644
--- a/src/transformers/modeling_albert.py
+++ b/src/transformers/modeling_albert.py
@@ -346,7 +346,7 @@ def forward(
head_mask=None,
output_attentions=False,
output_hidden_states=False,
- return_tuple=False,
+ return_dict=False,
):
hidden_states = self.embedding_hidden_mapping_in(hidden_states)
@@ -375,7 +375,7 @@ def forward(
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
- if return_tuple:
+ if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
@@ -430,9 +430,9 @@ class AlbertForPretrainingOutput(ModelOutput):
heads.
"""
- loss: Optional[torch.FloatTensor]
- prediction_logits: torch.FloatTensor
- sop_logits: torch.FloatTensor
+ loss: Optional[torch.FloatTensor] = None
+ prediction_logits: torch.FloatTensor = None
+ sop_logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@@ -488,8 +488,9 @@ class AlbertForPretrainingOutput(ModelOutput):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
- return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
- If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
+ return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
+ If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
+ plain tuple.
"""
@@ -561,13 +562,13 @@ def forward(
inputs_embeds=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -599,14 +600,14 @@ def forward(
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
sequence_output = encoder_outputs[0]
pooled_output = self.pooler_activation(self.pooler(sequence_output[:, 0]))
- if return_tuple:
+ if not return_dict:
return (sequence_output, pooled_output) + encoder_outputs[1:]
return BaseModelOutputWithPooling(
@@ -653,7 +654,7 @@ def forward(
sentence_order_label=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
**kwargs,
):
r"""
@@ -678,7 +679,7 @@ def forward(
>>> import torch
>>> tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
- >>> model = AlbertForPreTraining.from_pretrained('albert-base-v2')
+ >>> model = AlbertForPreTraining.from_pretrained('albert-base-v2', return_dict=True)
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
>>> outputs = model(input_ids)
@@ -695,7 +696,7 @@ def forward(
)
labels = kwargs.pop("masked_lm_labels")
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.albert(
input_ids,
@@ -706,7 +707,7 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
sequence_output, pooled_output = outputs[:2]
@@ -721,7 +722,7 @@ def forward(
sentence_order_loss = loss_fct(sop_scores.view(-1, 2), sentence_order_label.view(-1))
total_loss = masked_lm_loss + sentence_order_loss
- if return_tuple:
+ if not return_dict:
output = (prediction_scores, sop_scores) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
@@ -808,7 +809,7 @@ def forward(
labels=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
**kwargs
):
r"""
@@ -827,7 +828,7 @@ def forward(
)
labels = kwargs.pop("masked_lm_labels")
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.albert(
input_ids=input_ids,
@@ -838,7 +839,7 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
sequence_outputs = outputs[0]
@@ -849,7 +850,7 @@ def forward(
loss_fct = CrossEntropyLoss()
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
- if return_tuple:
+ if not return_dict:
output = (prediction_scores,) + outputs[2:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
@@ -895,7 +896,7 @@ def forward(
labels=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -904,7 +905,7 @@ def forward(
If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.albert(
input_ids=input_ids,
@@ -915,7 +916,7 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
pooled_output = outputs[1]
@@ -933,7 +934,7 @@ def forward(
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
- if return_tuple:
+ if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
@@ -976,14 +977,14 @@ def forward(
labels=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.albert(
input_ids,
@@ -994,7 +995,7 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
sequence_output = outputs[0]
@@ -1014,7 +1015,7 @@ def forward(
else:
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
- if return_tuple:
+ if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
@@ -1057,7 +1058,7 @@ def forward(
end_positions=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -1069,7 +1070,7 @@ def forward(
Positions are clamped to the length of the sequence (`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss.
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.albert(
input_ids=input_ids,
@@ -1080,7 +1081,7 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
sequence_output = outputs[0]
@@ -1107,7 +1108,7 @@ def forward(
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
- if return_tuple:
+ if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
@@ -1153,7 +1154,7 @@ def forward(
labels=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -1161,7 +1162,7 @@ def forward(
Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above)
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
@@ -1182,7 +1183,7 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
pooled_output = outputs[1]
@@ -1196,7 +1197,7 @@ def forward(
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
- if return_tuple:
+ if not return_dict:
output = (reshaped_logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
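The same mechanical rewrite repeats through every modeling file touched by this patch: resolve `return_dict` against the config, keep the legacy tuple path under `if not return_dict:`, and otherwise build a `ModelOutput` subclass. A condensed sketch of the pattern with hypothetical names (`MyOutput`, `forward_tail`):

```python
from dataclasses import dataclass
from typing import Optional

import torch

from transformers import BertConfig
from transformers.file_utils import ModelOutput


@dataclass
class MyOutput(ModelOutput):
    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None


def forward_tail(loss, logits, return_dict, config):
    # resolve the flag exactly as the models in this patch do
    return_dict = return_dict if return_dict is not None else config.use_return_dict
    if not return_dict:
        output = (logits,)
        return ((loss,) + output) if loss is not None else output
    return MyOutput(loss=loss, logits=logits)


cfg = BertConfig(return_dict=True)
out = forward_tail(None, torch.ones(2, 3), None, cfg)
print(type(out).__name__, out[0].shape)  # MyOutput torch.Size([2, 3]); the None loss is skipped
```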
diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index 1104567a48c5..92a5bd43b0bb 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -124,8 +124,9 @@
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
- return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
- If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
+ return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
+ If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
+ plain tuple.
"""
@@ -304,7 +305,7 @@ def __init__(self, config: BartConfig, embed_tokens):
self.layer_norm = LayerNorm(config.d_model) if config.normalize_before else None
def forward(
- self, input_ids, attention_mask=None, output_attentions=False, output_hidden_states=False, return_tuple=False
+ self, input_ids, attention_mask=None, output_attentions=False, output_hidden_states=False, return_dict=False
):
"""
Args:
@@ -359,7 +360,7 @@ def forward(
# T x B x C -> B x T x C
x = x.transpose(0, 1)
- if return_tuple:
+ if not return_dict:
return tuple(v for v in [x, encoder_states, all_attentions] if v is not None)
return BaseModelOutput(last_hidden_state=x, hidden_states=encoder_states, attentions=all_attentions)
@@ -495,7 +496,7 @@ def forward(
use_cache=False,
output_attentions=False,
output_hidden_states=False,
- return_tuple=False,
+ return_dict=False,
**unused,
):
"""
@@ -588,7 +589,7 @@ def forward(
else:
next_cache = None
- if return_tuple:
+ if not return_dict:
return tuple(v for v in [x, next_cache, all_hidden_states, all_self_attns] if v is not None)
return BaseModelOutputWithPast(
last_hidden_state=x, past_key_values=next_cache, hidden_states=all_hidden_states, attentions=all_self_attns
@@ -850,7 +851,7 @@ def forward(
use_cache=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
**kwargs,
):
@@ -862,7 +863,7 @@ def forward(
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# make masks if user doesn't supply
if not use_cache:
@@ -884,10 +885,10 @@ def forward(
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
- # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOuput when return_tuple=False
- elif not return_tuple and not isinstance(encoder_outputs, BaseModelOutput):
+ # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOuput when return_dict=False
+ elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
encoder_outputs = BaseModelOutput(
last_hidden_state=encoder_outputs[0],
hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
@@ -905,10 +906,10 @@ def forward(
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
- if return_tuple:
+ if not return_dict:
return decoder_outputs + encoder_outputs
return Seq2SeqModelOutput(
@@ -976,7 +977,7 @@ def forward(
use_cache=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
**unused,
):
r"""
@@ -1018,7 +1019,7 @@ def forward(
FutureWarning,
)
decoder_past_key_values = unused.pop("decoder_cached_states")
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if labels is not None:
use_cache = False
@@ -1033,7 +1034,7 @@ def forward(
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
lm_logits = F.linear(outputs[0], self.model.shared.weight, bias=self.final_logits_bias)
@@ -1043,7 +1044,7 @@ def forward(
# TODO(SS): do we need to ignore pad tokens in labels?
masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
- if return_tuple:
+ if not return_dict:
output = (lm_logits,) + outputs[1:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
@@ -1146,7 +1147,7 @@ def forward(
use_cache=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -1154,7 +1155,7 @@ def forward(
Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if labels is not None:
use_cache = False
@@ -1167,7 +1168,7 @@ def forward(
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
x = outputs[0] # last hidden state
eos_mask = input_ids.eq(self.config.eos_token_id)
@@ -1180,7 +1181,7 @@ def forward(
if labels is not None:
loss = F.cross_entropy(logits.view(-1, self.config.num_labels), labels.view(-1))
- if return_tuple:
+ if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
@@ -1232,7 +1233,7 @@ def forward(
use_cache=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -1244,7 +1245,7 @@ def forward(
Positions are clamped to the length of the sequence (`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss.
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if start_positions is not None and end_positions is not None:
use_cache = False
@@ -1257,7 +1258,7 @@ def forward(
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
sequence_output = outputs[0]
@@ -1284,7 +1285,7 @@ def forward(
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
- if return_tuple:
+ if not return_dict:
output = (start_logits, end_logits,) + outputs[1:]
return ((total_loss,) + output) if total_loss is not None else output
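
Every forward method touched by this patch repeats the same control flow: the per-call `return_dict` argument falls back to `config.use_return_dict`, and the legacy tuple is only assembled when the flag resolves to `False`. The following is a minimal, self-contained sketch of that pattern, not library code; `SketchOutput` and `forward_tail` are illustrative names.

```python
# Illustrative sketch (not transformers code) of the pattern the hunks above
# apply to every forward method: resolve return_dict against the config and
# fall back to the old tuple only when it ends up False.
from dataclasses import dataclass
from typing import Optional, Tuple


@dataclass
class SketchOutput:
    loss: Optional[float] = None
    logits: Optional[Tuple[float, ...]] = None


def forward_tail(loss, logits, return_dict, config_use_return_dict):
    # The per-call flag wins; otherwise the configuration default is used.
    return_dict = return_dict if return_dict is not None else config_use_return_dict
    if not return_dict:
        # Legacy behaviour: a plain tuple, with the loss prepended only when present.
        output = (logits,)
        return ((loss,) + output) if loss is not None else output
    return SketchOutput(loss=loss, logits=logits)


print(forward_tail(0.5, (1.0, 2.0), return_dict=None, config_use_return_dict=True))
print(forward_tail(None, (1.0, 2.0), return_dict=False, config_use_return_dict=True))
```

The first call prints the dataclass, the second the bare `(logits,)` tuple, mirroring the `if not return_dict:` branches above.
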
diff --git a/src/transformers/modeling_bert.py b/src/transformers/modeling_bert.py
index 850cae298469..11dd8f8b36d0 100644
--- a/src/transformers/modeling_bert.py
+++ b/src/transformers/modeling_bert.py
@@ -429,7 +429,7 @@ def forward(
encoder_attention_mask=None,
output_attentions=False,
output_hidden_states=False,
- return_tuple=False,
+ return_dict=False,
):
all_hidden_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
@@ -469,7 +469,7 @@ def custom_forward(*inputs):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
- if return_tuple:
+ if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
@@ -609,9 +609,9 @@ class BertForPretrainingOutput(ModelOutput):
heads.
"""
- loss: Optional[torch.FloatTensor]
- prediction_logits: torch.FloatTensor
- seq_relationship_logits: torch.FloatTensor
+ loss: Optional[torch.FloatTensor] = None
+ prediction_logits: torch.FloatTensor = None
+ seq_relationship_logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@@ -674,8 +674,9 @@ class BertForPretrainingOutput(ModelOutput):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
- return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
- If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
+ return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
+ If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
+ plain tuple.
"""
@@ -743,13 +744,13 @@ def forward(
encoder_attention_mask=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -800,12 +801,12 @@ def forward(
encoder_attention_mask=encoder_extended_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
sequence_output = encoder_outputs[0]
pooled_output = self.pooler(sequence_output)
- if return_tuple:
+ if not return_dict:
return (sequence_output, pooled_output) + encoder_outputs[1:]
return BaseModelOutputWithPooling(
@@ -847,7 +848,7 @@ def forward(
next_sentence_label=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
**kwargs
):
r"""
@@ -872,7 +873,7 @@ def forward(
>>> import torch
>>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
- >>> model = BertForPreTraining.from_pretrained('bert-base-uncased')
+ >>> model = BertForPreTraining.from_pretrained('bert-base-uncased', return_dict=True)
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)
@@ -887,7 +888,7 @@ def forward(
)
labels = kwargs.pop("masked_lm_labels")
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.bert(
input_ids,
@@ -898,7 +899,7 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
sequence_output, pooled_output = outputs[:2]
@@ -911,7 +912,7 @@ def forward(
next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
total_loss = masked_lm_loss + next_sentence_loss
- if return_tuple:
+ if not return_dict:
output = (prediction_scores, seq_relationship_score) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
@@ -955,7 +956,7 @@ def forward(
encoder_attention_mask=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
**kwargs
):
r"""
@@ -977,14 +978,14 @@ def forward(
>>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
>>> config = BertConfig.from_pretrained("bert-base-cased")
>>> config.is_decoder = True
- >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config)
+ >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config, return_dict=True)
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)
>>> prediction_logits = outputs.logits
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.bert(
input_ids,
@@ -997,7 +998,7 @@ def forward(
encoder_attention_mask=encoder_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
sequence_output = outputs[0]
@@ -1011,7 +1012,7 @@ def forward(
loss_fct = CrossEntropyLoss()
lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
- if return_tuple:
+ if not return_dict:
output = (prediction_scores,) + outputs[2:]
return ((lm_loss,) + output) if lm_loss is not None else output
@@ -1065,7 +1066,7 @@ def forward(
encoder_attention_mask=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
**kwargs
):
r"""
@@ -1086,7 +1087,7 @@ def forward(
assert "lm_labels" not in kwargs, "Use `BertWithLMHead` for autoregressive language modeling task."
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.bert(
input_ids,
@@ -1099,7 +1100,7 @@ def forward(
encoder_attention_mask=encoder_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
sequence_output = outputs[0]
@@ -1110,7 +1111,7 @@ def forward(
loss_fct = CrossEntropyLoss() # -100 index = padding token
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
- if return_tuple:
+ if not return_dict:
output = (prediction_scores,) + outputs[2:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
@@ -1161,7 +1162,7 @@ def forward(
next_sentence_label=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -1178,7 +1179,7 @@ def forward(
>>> import torch
>>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
- >>> model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
+ >>> model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased', return_dict=True)
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
@@ -1188,7 +1189,7 @@ def forward(
>>> logits = outputs.logits
>>> assert logits[0, 0] < logits[0, 1] # next sentence was random
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.bert(
input_ids,
@@ -1199,7 +1200,7 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
pooled_output = outputs[1]
@@ -1211,7 +1212,7 @@ def forward(
loss_fct = CrossEntropyLoss()
next_sentence_loss = loss_fct(seq_relationship_scores.view(-1, 2), next_sentence_label.view(-1))
- if return_tuple:
+ if not return_dict:
output = (seq_relationship_scores,) + outputs[2:]
return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output
@@ -1257,7 +1258,7 @@ def forward(
labels=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -1266,7 +1267,7 @@ def forward(
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.bert(
input_ids,
@@ -1277,7 +1278,7 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
pooled_output = outputs[1]
@@ -1295,7 +1296,7 @@ def forward(
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
- if return_tuple:
+ if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
@@ -1337,7 +1338,7 @@ def forward(
labels=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -1345,7 +1346,7 @@ def forward(
Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above)
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
@@ -1367,7 +1368,7 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
pooled_output = outputs[1]
@@ -1381,7 +1382,7 @@ def forward(
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
- if return_tuple:
+ if not return_dict:
output = (reshaped_logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
@@ -1424,14 +1425,14 @@ def forward(
labels=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.bert(
input_ids,
@@ -1442,7 +1443,7 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
sequence_output = outputs[0]
@@ -1464,7 +1465,7 @@ def forward(
else:
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
- if return_tuple:
+ if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
@@ -1507,7 +1508,7 @@ def forward(
end_positions=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -1519,7 +1520,7 @@ def forward(
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.bert(
input_ids,
@@ -1530,7 +1531,7 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
sequence_output = outputs[0]
@@ -1557,7 +1558,7 @@ def forward(
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
- if return_tuple:
+ if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
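
As a hedged usage sketch of the BERT hunks above (it assumes a transformers checkout at this revision and that the `bert-base-uncased` weights can be downloaded), the renamed keyword gives named access to the pre-training heads, while passing `return_dict=False` per call keeps the old tuple:

```python
from transformers import BertTokenizer, BertForPreTraining

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# return_dict=True is stored on the config, as in the docstring example above.
model = BertForPreTraining.from_pretrained("bert-base-uncased", return_dict=True)

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs)

# BertForPretrainingOutput fields are accessed by name.
prediction_logits = outputs.prediction_logits
seq_relationship_logits = outputs.seq_relationship_logits

# Overriding per call restores the (prediction_scores, seq_relationship_score, ...) tuple.
prediction_scores, seq_relationship_score = model(**inputs, return_dict=False)[:2]
```
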
diff --git a/src/transformers/modeling_camembert.py b/src/transformers/modeling_camembert.py
index def89a214d45..2e9a24d4d20c 100644
--- a/src/transformers/modeling_camembert.py
+++ b/src/transformers/modeling_camembert.py
@@ -51,12 +51,6 @@
model. Initializing with a config file does not load the weights associated with the model, only the
configuration.
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
- output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
- If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
- If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
- return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
- If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
"""
diff --git a/src/transformers/modeling_ctrl.py b/src/transformers/modeling_ctrl.py
index 7638bcf014fa..653aaa501618 100644
--- a/src/transformers/modeling_ctrl.py
+++ b/src/transformers/modeling_ctrl.py
@@ -295,8 +295,9 @@ def _init_weights(self, module):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
- return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
- If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
+ return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
+ If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
+ plain tuple.
"""
@@ -355,7 +356,7 @@ def forward(
use_cache=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
**kwargs,
):
if "past" in kwargs:
@@ -371,7 +372,7 @@ def forward(
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -472,7 +473,7 @@ def forward(
attention_output_shape = input_shape[:-1] + (-1,) + all_attentions[0].shape[-2:]
all_attentions = tuple(t.view(*attention_output_shape) for t in all_attentions)
- if return_tuple:
+ if not return_dict:
return tuple(v for v in [hidden_states, presents, all_hidden_states, all_attentions] if v is not None)
return BaseModelOutputWithPast(
@@ -526,7 +527,7 @@ def forward(
use_cache=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
**kwargs,
):
r"""
@@ -544,7 +545,7 @@ def forward(
)
past_key_values = kwargs.pop("past")
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
transformer_outputs = self.transformer(
input_ids,
@@ -557,7 +558,7 @@ def forward(
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
hidden_states = transformer_outputs[0]
@@ -573,7 +574,7 @@ def forward(
loss_fct = CrossEntropyLoss()
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
- if return_tuple:
+ if not return_dict:
output = (lm_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
diff --git a/src/transformers/modeling_distilbert.py b/src/transformers/modeling_distilbert.py
index 9c3f4e03198e..ca19495e7b29 100644
--- a/src/transformers/modeling_distilbert.py
+++ b/src/transformers/modeling_distilbert.py
@@ -279,7 +279,7 @@ def __init__(self, config):
self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.n_layers)])
def forward(
- self, x, attn_mask=None, head_mask=None, output_attentions=False, output_hidden_states=False, return_tuple=None
+ self, x, attn_mask=None, head_mask=None, output_attentions=False, output_hidden_states=False, return_dict=None
):
"""
Parameters
@@ -324,7 +324,7 @@ def forward(
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_state,)
- if return_tuple:
+ if not return_dict:
return tuple(v for v in [hidden_state, all_hidden_states, all_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_state, hidden_states=all_hidden_states, attentions=all_attentions
@@ -396,8 +396,9 @@ def _init_weights(self, module):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
- return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
- If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
+ return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
+ If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
+ plain tuple.
"""
@@ -444,13 +445,13 @@ def forward(
inputs_embeds=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -477,7 +478,7 @@ def forward(
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
@@ -516,7 +517,7 @@ def forward(
labels=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
**kwargs
):
r"""
@@ -535,7 +536,7 @@ def forward(
)
labels = kwargs.pop("masked_lm_labels")
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
dlbrt_output = self.distilbert(
input_ids=input_ids,
@@ -544,7 +545,7 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
hidden_states = dlbrt_output[0] # (bs, seq_length, dim)
prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim)
@@ -556,7 +557,7 @@ def forward(
if labels is not None:
mlm_loss = self.mlm_loss_fct(prediction_logits.view(-1, prediction_logits.size(-1)), labels.view(-1))
- if return_tuple:
+ if not return_dict:
output = (prediction_logits,) + dlbrt_output[1:]
return ((mlm_loss,) + output) if mlm_loss is not None else output
@@ -601,7 +602,7 @@ def forward(
labels=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -610,7 +611,7 @@ def forward(
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
distilbert_output = self.distilbert(
input_ids=input_ids,
@@ -619,7 +620,7 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
hidden_state = distilbert_output[0] # (bs, seq_len, dim)
pooled_output = hidden_state[:, 0] # (bs, dim)
@@ -637,7 +638,7 @@ def forward(
loss_fct = nn.CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
- if return_tuple:
+ if not return_dict:
output = (logits,) + distilbert_output[1:]
return ((loss,) + output) if loss is not None else output
@@ -682,7 +683,7 @@ def forward(
end_positions=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -694,7 +695,7 @@ def forward(
Positions are clamped to the length of the sequence (`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss.
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
distilbert_output = self.distilbert(
input_ids=input_ids,
@@ -703,7 +704,7 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
hidden_states = distilbert_output[0] # (bs, max_query_len, dim)
@@ -730,7 +731,7 @@ def forward(
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
- if return_tuple:
+ if not return_dict:
output = (start_logits, end_logits) + distilbert_output[1:]
return ((total_loss,) + output) if total_loss is not None else output
@@ -775,14 +776,14 @@ def forward(
labels=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.distilbert(
input_ids,
@@ -791,7 +792,7 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
sequence_output = outputs[0]
@@ -813,7 +814,7 @@ def forward(
else:
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
- if return_tuple:
+ if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
@@ -849,7 +850,7 @@ def forward(
labels=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -865,7 +866,7 @@ def forward(
>>> import torch
>>> tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
- >>> model = DistilBertForMultipleChoice.from_pretrained('distilbert-base-cased')
+ >>> model = DistilBertForMultipleChoice.from_pretrained('distilbert-base-cased', return_dict=True)
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> choice0 = "It is eaten with a fork and a knife."
@@ -879,7 +880,7 @@ def forward(
>>> loss = outputs.loss
>>> logits = outputs.logits
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
@@ -897,7 +898,7 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
hidden_state = outputs[0] # (bs * num_choices, seq_len, dim)
@@ -914,7 +915,7 @@ def forward(
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
- if return_tuple:
+ if not return_dict:
output = (reshaped_logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
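
The DistilBERT question-answering head follows the same convention. The sketch below is an assumption-laden example rather than part of the patch: the `distilbert-base-cased-distilled-squad` checkpoint name and the Jim Henson inputs are borrowed purely for illustration, and hub access is required.

```python
import torch
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")
model = DistilBertForQuestionAnswering.from_pretrained(
    "distilbert-base-cased-distilled-squad", return_dict=True
)

question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
inputs = tokenizer(question, text, return_tensors="pt")
outputs = model(**inputs)

# Named fields replace the (start_logits, end_logits, ...) tuple built above.
start = torch.argmax(outputs.start_logits)
end = torch.argmax(outputs.end_logits) + 1
answer = tokenizer.decode(inputs["input_ids"][0][start:end])
print(answer)
```
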
diff --git a/src/transformers/modeling_dpr.py b/src/transformers/modeling_dpr.py
index 7cffaabdc09d..fde9952461ea 100644
--- a/src/transformers/modeling_dpr.py
+++ b/src/transformers/modeling_dpr.py
@@ -134,8 +134,8 @@ class DPRReaderOutput(ModelOutput):
"""
start_logits: torch.FloatTensor
- end_logits: torch.FloatTensor
- relevance_logits: torch.FloatTensor
+ end_logits: torch.FloatTensor = None
+ relevance_logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@@ -161,7 +161,7 @@ def forward(
inputs_embeds: Optional[Tensor] = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
- return_tuple: bool = False,
+ return_dict: bool = False,
) -> Union[BaseModelOutputWithPooling, Tuple[Tensor, ...]]:
outputs = self.bert_model(
input_ids=input_ids,
@@ -170,14 +170,14 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
sequence_output, pooled_output = outputs[:2]
pooled_output = sequence_output[:, 0, :]
if self.projection_dim > 0:
pooled_output = self.encode_proj(pooled_output)
- if return_tuple:
+ if not return_dict:
return (sequence_output, pooled_output) + outputs[2:]
return BaseModelOutputWithPooling(
@@ -217,7 +217,7 @@ def forward(
inputs_embeds: Optional[Tensor] = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
- return_tuple: bool = False,
+ return_dict: bool = False,
) -> Union[DPRReaderOutput, Tuple[Tensor, ...]]:
# notations: N - number of questions in a batch, M - number of passages per questions, L - sequence length
n_passages, sequence_length = input_ids.size() if input_ids is not None else inputs_embeds.size()[:2]
@@ -228,7 +228,7 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
sequence_output = outputs[0]
@@ -244,7 +244,7 @@ def forward(
end_logits = end_logits.view(n_passages, sequence_length)
relevance_logits = relevance_logits.view(n_passages)
- if return_tuple:
+ if not return_dict:
return (start_logits, end_logits, relevance_logits) + outputs[2:]
return DPRReaderOutput(
@@ -361,6 +361,9 @@ def init_weights(self):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states tensors of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
+ return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
+ If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
+ plain tuple.
"""
DPR_READER_INPUTS_DOCSTRING = r"""
@@ -388,6 +391,9 @@ def init_weights(self):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states tensors of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
+ return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
+ If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
+ plain tuple.
"""
@@ -412,7 +418,7 @@ def forward(
inputs_embeds: Optional[Tensor] = None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
) -> Union[DPRContextEncoderOutput, Tuple[Tensor, ...]]:
r"""
Return:
@@ -421,7 +427,7 @@ def forward(
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
- model = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
+ model = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base', return_dict=True)
input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='pt')["input_ids"]
embeddings = model(input_ids).pooler_output
"""
@@ -430,7 +436,7 @@ def forward(
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -459,10 +465,10 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
- if return_tuple:
+ if not return_dict:
return outputs[1:]
return DPRContextEncoderOutput(
pooler_output=outputs.pooler_output, hidden_states=outputs.hidden_states, attentions=outputs.attentions
@@ -490,7 +496,7 @@ def forward(
inputs_embeds: Optional[Tensor] = None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
) -> Union[DPRQuestionEncoderOutput, Tuple[Tensor, ...]]:
r"""
Return:
@@ -499,7 +505,7 @@ def forward(
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
- model = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
+ model = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base', return_dict=True)
input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='pt')["input_ids"]
embeddings = model(input_ids).pooler_output
"""
@@ -507,7 +513,7 @@ def forward(
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -536,10 +542,10 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
- if return_tuple:
+ if not return_dict:
return outputs[1:]
return DPRQuestionEncoderOutput(
pooler_output=outputs.pooler_output, hidden_states=outputs.hidden_states, attentions=outputs.attentions
@@ -565,7 +571,7 @@ def forward(
inputs_embeds: Optional[Tensor] = None,
output_attentions: bool = None,
output_hidden_states: bool = None,
- return_tuple=None,
+ return_dict=None,
) -> Union[DPRReaderOutput, Tuple[Tensor, ...]]:
r"""
Return:
@@ -574,7 +580,7 @@ def forward(
from transformers import DPRReader, DPRReaderTokenizer
tokenizer = DPRReaderTokenizer.from_pretrained('facebook/dpr-reader-single-nq-base')
- model = DPRReader.from_pretrained('facebook/dpr-reader-single-nq-base')
+ model = DPRReader.from_pretrained('facebook/dpr-reader-single-nq-base', return_dict=True)
encoded_inputs = tokenizer(
questions=["What is love ?"],
titles=["Haddaway"],
@@ -591,7 +597,7 @@ def forward(
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -613,5 +619,5 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
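
For DPR, `DPRReader` now returns a `DPRReaderOutput` whose `start_logits`, `end_logits` and `relevance_logits` are named fields when `return_dict=True`. A hedged sketch follows, reusing the checkpoint from the docstring above; the `texts` argument, truncated in the hunk, is filled in here purely for illustration.

```python
from transformers import DPRReader, DPRReaderTokenizer

tokenizer = DPRReaderTokenizer.from_pretrained("facebook/dpr-reader-single-nq-base")
model = DPRReader.from_pretrained("facebook/dpr-reader-single-nq-base", return_dict=True)

encoded_inputs = tokenizer(
    questions=["What is love ?"],
    titles=["Haddaway"],
    texts=["'What Is Love' is a song recorded by the artist Haddaway"],
    return_tensors="pt",
)
outputs = model(**encoded_inputs)

# The three reader heads are available by name instead of tuple position.
start_logits = outputs.start_logits
end_logits = outputs.end_logits
relevance_logits = outputs.relevance_logits
```
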
diff --git a/src/transformers/modeling_electra.py b/src/transformers/modeling_electra.py
index 8f24343ccaa7..1f2cb118c0f0 100644
--- a/src/transformers/modeling_electra.py
+++ b/src/transformers/modeling_electra.py
@@ -208,8 +208,8 @@ class ElectraForPretrainingOutput(ModelOutput):
heads.
"""
- loss: Optional[torch.FloatTensor]
- logits: torch.FloatTensor
+ loss: Optional[torch.FloatTensor] = None
+ logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@@ -272,8 +272,9 @@ class ElectraForPretrainingOutput(ModelOutput):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
- return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
- If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
+ return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
+ If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
+ plain tuple.
"""
@@ -331,13 +332,13 @@ def forward(
inputs_embeds=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -371,7 +372,7 @@ def forward(
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
return hidden_states
@@ -428,7 +429,7 @@ def forward(
labels=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -437,7 +438,7 @@ def forward(
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
discriminator_hidden_states = self.electra(
input_ids,
@@ -448,7 +449,7 @@ def forward(
inputs_embeds,
output_attentions,
output_hidden_states,
- return_tuple,
+ return_dict,
)
sequence_output = discriminator_hidden_states[0]
@@ -464,7 +465,7 @@ def forward(
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
- if return_tuple:
+ if not return_dict:
output = (logits,) + discriminator_hidden_states[1:]
return ((loss,) + output) if loss is not None else output
@@ -505,7 +506,7 @@ def forward(
labels=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
@@ -527,7 +528,7 @@ def forward(
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
>>> logits = model(input_ids).logits
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
discriminator_hidden_states = self.electra(
input_ids,
@@ -538,7 +539,7 @@ def forward(
inputs_embeds,
output_attentions,
output_hidden_states,
- return_tuple,
+ return_dict,
)
discriminator_sequence_output = discriminator_hidden_states[0]
@@ -555,7 +556,7 @@ def forward(
else:
loss = loss_fct(logits.view(-1, discriminator_sequence_output.shape[1]), labels.float())
- if return_tuple:
+ if not return_dict:
output = (logits,) + discriminator_hidden_states[1:]
return ((loss,) + output) if loss is not None else output
@@ -606,7 +607,7 @@ def forward(
labels=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
**kwargs
):
r"""
@@ -625,7 +626,7 @@ def forward(
)
labels = kwargs.pop("masked_lm_labels")
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
generator_hidden_states = self.electra(
input_ids,
@@ -636,7 +637,7 @@ def forward(
inputs_embeds,
output_attentions,
output_hidden_states,
- return_tuple,
+ return_dict,
)
generator_sequence_output = generator_hidden_states[0]
@@ -649,7 +650,7 @@ def forward(
loss_fct = nn.CrossEntropyLoss() # -100 index = padding token
loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
- if return_tuple:
+ if not return_dict:
output = (prediction_scores,) + generator_hidden_states[1:]
return ((loss,) + output) if loss is not None else output
@@ -695,14 +696,14 @@ def forward(
labels=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
discriminator_hidden_states = self.electra(
input_ids,
@@ -713,7 +714,7 @@ def forward(
inputs_embeds,
output_attentions,
output_hidden_states,
- return_tuple,
+ return_dict,
)
discriminator_sequence_output = discriminator_hidden_states[0]
@@ -732,7 +733,7 @@ def forward(
else:
loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
- if return_tuple:
+ if not return_dict:
output = (logits,) + discriminator_hidden_states[1:]
return ((loss,) + output) if loss is not None else output
@@ -782,7 +783,7 @@ def forward(
end_positions=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -794,7 +795,7 @@ def forward(
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
discriminator_hidden_states = self.electra(
input_ids,
@@ -831,7 +832,7 @@ def forward(
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
- if return_tuple:
+ if not return_dict:
output = (start_logits, end_logits,) + discriminator_hidden_states[1:]
return ((total_loss,) + output) if total_loss is not None else output
@@ -876,7 +877,7 @@ def forward(
inputs_embeds=None,
labels=None,
output_attentions=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -884,7 +885,7 @@ def forward(
Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above)
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
@@ -905,7 +906,7 @@ def forward(
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
sequence_output = discriminator_hidden_states[0]
@@ -919,7 +920,7 @@ def forward(
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
- if return_tuple:
+ if not return_dict:
output = (reshaped_logits,) + discriminator_hidden_states[1:]
return ((loss,) + output) if loss is not None else output
diff --git a/src/transformers/modeling_encoder_decoder.py b/src/transformers/modeling_encoder_decoder.py
index 52d7058d1b2b..3eb92ad8f905 100644
--- a/src/transformers/modeling_encoder_decoder.py
+++ b/src/transformers/modeling_encoder_decoder.py
@@ -273,7 +273,6 @@ def forward(
attention_mask=attention_mask,
inputs_embeds=inputs_embeds,
head_mask=head_mask,
- return_tuple=True,
**kwargs_encoder,
)
@@ -288,7 +287,6 @@ def forward(
encoder_attention_mask=attention_mask,
head_mask=decoder_head_mask,
labels=labels,
- return_tuple=True,
**kwargs_decoder,
)
diff --git a/src/transformers/modeling_flaubert.py b/src/transformers/modeling_flaubert.py
index 5d0ebf27fc3f..61ef9d8fc7fe 100644
--- a/src/transformers/modeling_flaubert.py
+++ b/src/transformers/modeling_flaubert.py
@@ -110,8 +110,9 @@
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
- return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
- If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
+ return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
+ If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
+ plain tuple.
"""
@@ -148,13 +149,13 @@ def forward(
inputs_embeds=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# removed: src_enc=None, src_len=None
if input_ids is not None:
@@ -284,7 +285,7 @@ def forward(
# move back sequence length to dimension 0
# tensor = tensor.transpose(0, 1)
- if return_tuple:
+ if not return_dict:
return tuple(v for v in [tensor, hidden_states, attentions] if v is not None)
return BaseModelOutput(last_hidden_state=tensor, hidden_states=hidden_states, attentions=attentions)
diff --git a/src/transformers/modeling_gpt2.py b/src/transformers/modeling_gpt2.py
index 0514586a5f24..a2168726ccd2 100644
--- a/src/transformers/modeling_gpt2.py
+++ b/src/transformers/modeling_gpt2.py
@@ -323,10 +323,10 @@ class GPT2DoubleHeadsModelOutput(ModelOutput):
heads.
"""
- lm_loss: Optional[torch.FloatTensor]
- mc_loss: Optional[torch.FloatTensor]
- lm_logits: torch.FloatTensor
- mc_logits: torch.FloatTensor
+ lm_loss: Optional[torch.FloatTensor] = None
+ mc_loss: Optional[torch.FloatTensor] = None
+ lm_logits: torch.FloatTensor = None
+ mc_logits: torch.FloatTensor = None
past_key_values: Optional[List[torch.FloatTensor]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@@ -395,8 +395,9 @@ class GPT2DoubleHeadsModelOutput(ModelOutput):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
- return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
- If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
+ return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
+ If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
+ plain tuple.
"""
@@ -448,7 +449,7 @@ def forward(
use_cache=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
**kwargs,
):
if "past" in kwargs:
@@ -464,7 +465,7 @@ def forward(
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -560,7 +561,7 @@ def forward(
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
- if return_tuple:
+ if not return_dict:
return tuple(v for v in [hidden_states, presents, all_hidden_states, all_attentions] if v is not None)
return BaseModelOutputWithPast(
@@ -616,7 +617,7 @@ def forward(
use_cache=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
**kwargs,
):
r"""
@@ -634,7 +635,7 @@ def forward(
)
past_key_values = kwargs.pop("past")
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
transformer_outputs = self.transformer(
input_ids,
@@ -647,7 +648,7 @@ def forward(
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
hidden_states = transformer_outputs[0]
@@ -662,7 +663,7 @@ def forward(
loss_fct = CrossEntropyLoss()
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
- if return_tuple:
+ if not return_dict:
output = (lm_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
@@ -713,7 +714,7 @@ def forward(
use_cache=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
**kwargs,
):
r"""
@@ -741,7 +742,7 @@ def forward(
>>> from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel
>>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
- >>> model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
+ >>> model = GPT2DoubleHeadsModel.from_pretrained('gpt2', return_dict=True)
>>> # Add a [CLS] to the vocabulary (we should train it also!)
>>> num_added_tokens = tokenizer.add_special_tokens({'cls_token': '[CLS]'})
@@ -773,7 +774,7 @@ def forward(
)
past_key_values = kwargs.pop("past")
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
transformer_outputs = self.transformer(
input_ids,
@@ -786,7 +787,7 @@ def forward(
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
hidden_states = transformer_outputs[0]
@@ -805,7 +806,7 @@ def forward(
loss_fct = CrossEntropyLoss()
lm_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
- if return_tuple:
+ if not return_dict:
output = (lm_logits, mc_logits) + transformer_outputs[1:]
if mc_loss is not None:
output = (mc_loss,) + output
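
The GPT-2 heads behave the same way. Below is a hedged sketch (standard `gpt2` weights from the hub, and assuming the causal-LM output exposes `loss` and `logits` by name, as the other heads in this patch do) of switching between the dataclass and the `(loss, lm_logits, ...)` tuple laid out in the hunk above:

```python
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2", return_dict=True)

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs, labels=inputs["input_ids"])

loss = outputs.loss      # named access on the returned ModelOutput
logits = outputs.logits

# return_dict=False per call restores the (loss, lm_logits, ...) tuple.
loss_t, lm_logits = model(**inputs, labels=inputs["input_ids"], return_dict=False)[:2]
```
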
diff --git a/src/transformers/modeling_longformer.py b/src/transformers/modeling_longformer.py
index c440af07a059..da422b95f842 100644
--- a/src/transformers/modeling_longformer.py
+++ b/src/transformers/modeling_longformer.py
@@ -694,7 +694,7 @@ def forward(
attention_mask=None,
output_attentions=False,
output_hidden_states=False,
- return_tuple=False,
+ return_dict=False,
):
all_hidden_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
@@ -724,7 +724,7 @@ def custom_forward(*inputs):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
- if return_tuple:
+ if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
@@ -811,8 +811,9 @@ def _init_weights(self, module):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
- return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
- If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
+ return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
+ If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
+ plain tuple.
"""
@@ -942,7 +943,7 @@ def forward(
inputs_embeds=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
@@ -953,7 +954,7 @@ def forward(
>>> import torch
>>> from transformers import LongformerModel, LongformerTokenizer
- >>> model = LongformerModel.from_pretrained('allenai/longformer-base-4096')
+ >>> model = LongformerModel.from_pretrained('allenai/longformer-base-4096', return_dict=True)
>>> tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
>>> SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000) # long input document
@@ -965,14 +966,16 @@ def forward(
... # classification: the token
... # QA: question tokens
... # LM: potentially on the beginning of sentences and paragraphs
- >>> sequence_output, pooled_output = model(input_ids, attention_mask=attention_mask)
+ >>> outputs = model(input_ids, attention_mask=attention_mask)
+ >>> sequence_output = outputs.last_hidden_state
+ >>> pooled_output = outputs.pooler_output
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -1016,7 +1019,7 @@ def forward(
attention_mask=extended_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
sequence_output = encoder_outputs[0]
pooled_output = self.pooler(sequence_output)
@@ -1026,7 +1029,7 @@ def forward(
# unpad `sequence_output` because the calling function is expecting a length == input_ids.size(1)
sequence_output = sequence_output[:, :-padding_len]
- if return_tuple:
+ if not return_dict:
return (sequence_output, pooled_output) + encoder_outputs[1:]
return BaseModelOutputWithPooling(
@@ -1063,7 +1066,7 @@ def forward(
labels=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
**kwargs
):
r"""
@@ -1082,7 +1085,7 @@ def forward(
>>> import torch
>>> from transformers import LongformerForMaskedLM, LongformerTokenizer
- >>> model = LongformerForMaskedLM.from_pretrained('allenai/longformer-base-4096')
+ >>> model = LongformerForMaskedLM.from_pretrained('allenai/longformer-base-4096', return_dict=True)
>>> tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
>>> SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000) # long input document
@@ -1102,7 +1105,7 @@ def forward(
)
labels = kwargs.pop("masked_lm_labels")
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.longformer(
input_ids,
@@ -1113,7 +1116,7 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
sequence_output = outputs[0]
prediction_scores = self.lm_head(sequence_output)
@@ -1123,7 +1126,7 @@ def forward(
loss_fct = CrossEntropyLoss()
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
- if return_tuple:
+ if not return_dict:
output = (prediction_scores,) + outputs[2:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
@@ -1171,7 +1174,7 @@ def forward(
labels=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -1180,7 +1183,7 @@ def forward(
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if global_attention_mask is None:
logger.info("Initializing global attention on CLS token...")
@@ -1197,7 +1200,7 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.classifier(sequence_output)
@@ -1212,7 +1215,7 @@ def forward(
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
- if return_tuple:
+ if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
@@ -1272,7 +1275,7 @@ def forward(
end_positions=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -1291,7 +1294,7 @@ def forward(
>>> import torch
>>> tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa")
- >>> model = LongformerForQuestionAnswering.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa")
+ >>> model = LongformerForQuestionAnswering.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa", return_dict=True)
>>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
>>> encoding = tokenizer(question, text, return_tensors="pt")
@@ -1310,7 +1313,7 @@ def forward(
>>> answer = tokenizer.decode(tokenizer.convert_tokens_to_ids(answer_tokens)) # remove space prepending space token
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# set global attention on question tokens
if global_attention_mask is None:
@@ -1327,7 +1330,7 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
sequence_output = outputs[0]
@@ -1354,7 +1357,7 @@ def forward(
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
- if return_tuple:
+ if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
@@ -1404,14 +1407,14 @@ def forward(
labels=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.longformer(
input_ids,
@@ -1422,7 +1425,7 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
sequence_output = outputs[0]
@@ -1444,7 +1447,7 @@ def forward(
else:
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
- if return_tuple:
+ if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
@@ -1489,7 +1492,7 @@ def forward(
inputs_embeds=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -1498,7 +1501,7 @@ def forward(
of the input tensors. (see `input_ids` above)
"""
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# set global attention on question tokens
if global_attention_mask is None:
@@ -1536,7 +1539,7 @@ def forward(
inputs_embeds=flat_inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
pooled_output = outputs[1]
@@ -1549,7 +1552,7 @@ def forward(
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
- if return_tuple:
+ if not return_dict:
output = (reshaped_logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
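
For orientation, a minimal usage sketch of the two Longformer output styles after this switch, assuming network access to the allenai/longformer-base-4096 checkpoint and a transformers build that exposes `return_dict`:

```python
import torch
from transformers import LongformerModel, LongformerTokenizer

tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
model = LongformerModel.from_pretrained("allenai/longformer-base-4096")

inputs = tokenizer("Hello world!", return_tensors="pt")

# Old style: request a plain tuple explicitly and unpack by position.
sequence_output, pooled_output = model(**inputs, return_dict=False)[:2]

# New style: request a ModelOutput and read the fields by name.
outputs = model(**inputs, return_dict=True)
assert torch.allclose(outputs.last_hidden_state, sequence_output)
assert torch.allclose(outputs.pooler_output, pooled_output)
```

Passing `return_dict=False` keeps existing tuple-unpacking code working while callers migrate to the named fields at their own pace.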
diff --git a/src/transformers/modeling_mmbt.py b/src/transformers/modeling_mmbt.py
index ec3138e2b40a..18105269d00a 100644
--- a/src/transformers/modeling_mmbt.py
+++ b/src/transformers/modeling_mmbt.py
@@ -23,7 +23,7 @@
from torch.nn import CrossEntropyLoss, MSELoss
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable, replace_return_docstrings
-from .modeling_outputs import BaseModelOutputWithPooling
+from .modeling_outputs import BaseModelOutputWithPooling, SequenceClassifierOutput
from .modeling_utils import ModuleUtilsMixin
@@ -148,8 +148,9 @@ def forward(self, input_modal, start_token=None, end_token=None, position_ids=No
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
- return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
- If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
+ return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
+ If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
+ plain tuple.
"""
@@ -182,7 +183,7 @@ def forward(
encoder_attention_mask=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
Returns:
@@ -198,7 +199,7 @@ def forward(
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -257,13 +258,13 @@ def forward(
encoder_attention_mask=encoder_extended_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
sequence_output = encoder_outputs[0]
pooled_output = self.transformer.pooler(sequence_output)
- if return_tuple:
+ if not return_dict:
return (sequence_output, pooled_output) + encoder_outputs[1:]
return BaseModelOutputWithPooling(
@@ -339,7 +340,9 @@ def forward(
head_mask=None,
inputs_embeds=None,
labels=None,
+ return_dict=None,
):
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.mmbt(
input_modal=input_modal,
@@ -353,6 +356,7 @@ def forward(
modal_position_ids=modal_position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
+ return_dict=return_dict,
)
pooled_output = outputs[1]
@@ -360,8 +364,7 @@ def forward(
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
- outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here
-
+ loss = None
if labels is not None:
if self.num_labels == 1:
# We are doing regression
@@ -370,6 +373,11 @@ def forward(
else:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
- outputs = (loss,) + outputs
- return outputs # (loss), logits, (hidden_states), (attentions)
+ if not return_dict:
+ output = (logits,) + outputs[2:]
+ return ((loss,) + output) if loss is not None else output
+
+ return SequenceClassifierOutput(
+ loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+ )
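
The MMBT classification head now follows the same compute-loss-then-branch return pattern as the other heads in this patch. A self-contained toy sketch of that pattern (class and variable names here are illustrative, not library API):

```python
from dataclasses import dataclass
from typing import Optional

import torch
from torch import nn


@dataclass
class ToyClassifierOutput:
    loss: Optional[torch.Tensor] = None
    logits: torch.Tensor = None


class ToyHead(nn.Module):
    def __init__(self, hidden_size=8, num_labels=3):
        super().__init__()
        self.num_labels = num_labels
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, pooled_output, labels=None, return_dict=True):
        logits = self.classifier(pooled_output)
        loss = None
        if labels is not None:
            loss = nn.CrossEntropyLoss()(logits.view(-1, self.num_labels), labels.view(-1))
        if not return_dict:
            output = (logits,)
            return ((loss,) + output) if loss is not None else output
        return ToyClassifierOutput(loss=loss, logits=logits)


head = ToyHead()
pooled = torch.randn(2, 8)
labels = torch.tensor([0, 2])

as_dataclass = head(pooled, labels=labels)                  # ToyClassifierOutput(loss=..., logits=...)
as_tuple = head(pooled, labels=labels, return_dict=False)   # (loss, logits)
```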
diff --git a/src/transformers/modeling_mobilebert.py b/src/transformers/modeling_mobilebert.py
index 13c9ade0270c..d3a4cd8e32ba 100644
--- a/src/transformers/modeling_mobilebert.py
+++ b/src/transformers/modeling_mobilebert.py
@@ -550,7 +550,7 @@ def forward(
encoder_attention_mask=None,
output_attentions=False,
output_hidden_states=False,
- return_tuple=False,
+ return_dict=False,
):
all_hidden_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
@@ -575,7 +575,7 @@ def forward(
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
- if return_tuple:
+ if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
@@ -708,9 +708,9 @@ class MobileBertForPretrainingOutput(ModelOutput):
heads.
"""
- loss: Optional[torch.FloatTensor]
- prediction_logits: torch.FloatTensor
- seq_relationship_logits: torch.FloatTensor
+ loss: Optional[torch.FloatTensor] = None
+ prediction_logits: torch.FloatTensor = None
+ seq_relationship_logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@@ -773,8 +773,9 @@ class MobileBertForPretrainingOutput(ModelOutput):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
- return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
- If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
+ return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
+ If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
+ plain tuple.
"""
@@ -831,13 +832,13 @@ def forward(
encoder_attention_mask=None,
output_hidden_states=None,
output_attentions=None,
- return_tuple=None,
+ return_dict=None,
):
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -890,12 +891,12 @@ def forward(
encoder_attention_mask=encoder_extended_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
sequence_output = encoder_outputs[0]
pooled_output = self.pooler(sequence_output)
- if return_tuple:
+ if not return_dict:
return (sequence_output, pooled_output) + encoder_outputs[1:]
return BaseModelOutputWithPooling(
@@ -958,7 +959,7 @@ def forward(
next_sentence_label=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
@@ -979,7 +980,7 @@ def forward(
>>> import torch
>>> tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased")
- >>> model = MobileBertForPreTraining.from_pretrained("google/mobilebert-uncased")
+ >>> model = MobileBertForPreTraining.from_pretrained("google/mobilebert-uncased", return_dict=True)
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
>>> outputs = model(input_ids)
@@ -988,7 +989,7 @@ def forward(
>>> seq_relationship_logits = outputs.seq_relationship_logits
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.mobilebert(
input_ids,
@@ -999,7 +1000,7 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
sequence_output, pooled_output = outputs[:2]
prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
@@ -1011,7 +1012,7 @@ def forward(
next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
total_loss = masked_lm_loss + next_sentence_loss
- if return_tuple:
+ if not return_dict:
output = (prediction_scores, seq_relationship_score) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
@@ -1079,7 +1080,7 @@ def forward(
encoder_attention_mask=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
**kwargs
):
r"""
@@ -1097,7 +1098,7 @@ def forward(
FutureWarning,
)
labels = kwargs.pop("masked_lm_labels")
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.mobilebert(
input_ids,
@@ -1110,7 +1111,7 @@ def forward(
encoder_attention_mask=encoder_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
sequence_output = outputs[0]
@@ -1121,7 +1122,7 @@ def forward(
loss_fct = CrossEntropyLoss() # -100 index = padding token
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
- if return_tuple:
+ if not return_dict:
output = (prediction_scores,) + outputs[2:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
@@ -1169,7 +1170,7 @@ def forward(
next_sentence_label=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -1186,7 +1187,7 @@ def forward(
>>> import torch
>>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
- >>> model = MobileBertForNextSentencePrediction.from_pretrained('google/mobilebert-uncased')
+ >>> model = MobileBertForNextSentencePrediction.from_pretrained('google/mobilebert-uncased', return_dict=True)
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
@@ -1196,7 +1197,7 @@ def forward(
>>> loss = outputs.loss
>>> logits = outputs.logits
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.mobilebert(
input_ids,
@@ -1207,7 +1208,7 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
pooled_output = outputs[1]
@@ -1218,7 +1219,7 @@ def forward(
loss_fct = CrossEntropyLoss()
next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
- if return_tuple:
+ if not return_dict:
output = (seq_relationship_score,) + outputs[2:]
return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output
@@ -1263,7 +1264,7 @@ def forward(
labels=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -1272,7 +1273,7 @@ def forward(
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.mobilebert(
input_ids,
@@ -1283,7 +1284,7 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output)
@@ -1299,7 +1300,7 @@ def forward(
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
- if return_tuple:
+ if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
@@ -1342,7 +1343,7 @@ def forward(
end_positions=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -1354,7 +1355,7 @@ def forward(
Positions are clamped to the length of the sequence (`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss.
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.mobilebert(
input_ids,
@@ -1365,7 +1366,7 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
sequence_output = outputs[0]
@@ -1392,7 +1393,7 @@ def forward(
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
- if return_tuple:
+ if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
@@ -1438,7 +1439,7 @@ def forward(
labels=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -1446,7 +1447,7 @@ def forward(
Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above)
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
@@ -1468,7 +1469,7 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
pooled_output = outputs[1]
@@ -1482,7 +1483,7 @@ def forward(
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
- if return_tuple:
+ if not return_dict:
output = (reshaped_logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
@@ -1525,14 +1526,14 @@ def forward(
labels=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.mobilebert(
input_ids,
@@ -1543,7 +1544,7 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
sequence_output = outputs[0]
@@ -1565,7 +1566,7 @@ def forward(
else:
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
- if return_tuple:
+ if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
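
The same idea applied to one of the MobileBERT heads touched above; a sketch only, since the QA head here is freshly initialized on top of google/mobilebert-uncased and its logits are untrained:

```python
import torch
from transformers import MobileBertForQuestionAnswering, MobileBertTokenizer

tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased")
model = MobileBertForQuestionAnswering.from_pretrained("google/mobilebert-uncased", return_dict=True)

inputs = tokenizer("Who was Jim Henson?", "Jim Henson was a nice puppet", return_tensors="pt")
outputs = model(**inputs)

# QuestionAnsweringModelOutput exposes the two heads by name instead of position.
print(outputs.start_logits.shape, outputs.end_logits.shape)

# The tuple form stays available on request for older calling code.
start_logits, end_logits = model(**inputs, return_dict=False)[:2]
```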
diff --git a/src/transformers/modeling_openai.py b/src/transformers/modeling_openai.py
index 3efa7d353f6e..04cf8fb8a4f4 100644
--- a/src/transformers/modeling_openai.py
+++ b/src/transformers/modeling_openai.py
@@ -315,10 +315,10 @@ class OpenAIGPTDoubleHeadsModelOutput(ModelOutput):
heads.
"""
- lm_loss: Optional[torch.FloatTensor]
- mc_loss: Optional[torch.FloatTensor]
- lm_logits: torch.FloatTensor
- mc_logits: torch.FloatTensor
+ lm_loss: Optional[torch.FloatTensor] = None
+ mc_loss: Optional[torch.FloatTensor] = None
+ lm_logits: torch.FloatTensor = None
+ mc_logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@@ -374,8 +374,9 @@ class OpenAIGPTDoubleHeadsModelOutput(ModelOutput):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
- return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
- If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
+ return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
+ If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
+ plain tuple.
"""
@@ -425,13 +426,13 @@ def forward(
inputs_embeds=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -496,7 +497,7 @@ def forward(
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
- if return_tuple:
+ if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
return BaseModelOutput(
@@ -538,7 +539,7 @@ def forward(
labels=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
@@ -548,7 +549,7 @@ def forward(
All labels set to ``-100`` are ignored (masked), the loss is only
computed for labels in ``[0, ..., config.vocab_size]``
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
transformer_outputs = self.transformer(
input_ids,
@@ -559,7 +560,7 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
hidden_states = transformer_outputs[0]
lm_logits = self.lm_head(hidden_states)
@@ -573,7 +574,7 @@ def forward(
loss_fct = CrossEntropyLoss()
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
- if return_tuple:
+ if not return_dict:
output = (lm_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
@@ -622,7 +623,7 @@ def forward(
mc_labels=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
**kwargs
):
r"""
@@ -650,7 +651,7 @@ def forward(
import torch
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
- model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
+ model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt', return_dict=True)
tokenizer.add_special_tokens({'cls_token': '[CLS]'}) # Add a [CLS] to the vocabulary (we should train it also!)
model.resize_token_embeddings(len(tokenizer))
@@ -662,7 +663,7 @@ def forward(
lm_logits = outputs.lm_logits
mc_logits = outputs.mc_logits
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if "lm_labels" in kwargs:
warnings.warn(
"The `lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
@@ -680,7 +681,7 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
hidden_states = transformer_outputs[0]
@@ -698,7 +699,7 @@ def forward(
loss_fct = CrossEntropyLoss()
mc_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
- if return_tuple:
+ if not return_dict:
output = (lm_logits, mc_logits) + transformer_outputs[1:]
if mc_loss is not None:
output = (mc_loss,) + output
diff --git a/src/transformers/modeling_outputs.py b/src/transformers/modeling_outputs.py
index f9cf15c40b9e..3a91d17904d5 100644
--- a/src/transformers/modeling_outputs.py
+++ b/src/transformers/modeling_outputs.py
@@ -63,7 +63,7 @@ class BaseModelOutputWithPooling(ModelOutput):
"""
last_hidden_state: torch.FloatTensor
- pooler_output: torch.FloatTensor
+ pooler_output: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@@ -179,7 +179,7 @@ class CausalLMOutput(ModelOutput):
"""
loss: Optional[torch.FloatTensor]
- logits: torch.FloatTensor
+ logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@@ -213,8 +213,8 @@ class CausalLMOutputWithPast(ModelOutput):
heads.
"""
- loss: Optional[torch.FloatTensor]
- logits: torch.FloatTensor
+ loss: Optional[torch.FloatTensor] = None
+ logits: torch.FloatTensor = None
past_key_values: Optional[List[torch.FloatTensor]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@@ -243,8 +243,8 @@ class MaskedLMOutput(ModelOutput):
heads.
"""
- loss: Optional[torch.FloatTensor]
- logits: torch.FloatTensor
+ loss: Optional[torch.FloatTensor] = None
+ logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@@ -291,8 +291,8 @@ class Seq2SeqLMOutput(ModelOutput):
self-attention heads.
"""
- loss: Optional[torch.FloatTensor]
- logits: torch.FloatTensor
+ loss: Optional[torch.FloatTensor] = None
+ logits: torch.FloatTensor = None
decoder_past_key_values: Optional[List[torch.FloatTensor]] = None
decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
@@ -324,8 +324,8 @@ class NextSentencePredictorOutput(ModelOutput):
heads.
"""
- loss: Optional[torch.FloatTensor]
- logits: torch.FloatTensor
+ loss: Optional[torch.FloatTensor] = None
+ logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@@ -353,8 +353,8 @@ class SequenceClassifierOutput(ModelOutput):
heads.
"""
- loss: Optional[torch.FloatTensor]
- logits: torch.FloatTensor
+ loss: Optional[torch.FloatTensor] = None
+ logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@@ -401,8 +401,8 @@ class Seq2SeqSequenceClassifierOutput(ModelOutput):
self-attention heads.
"""
- loss: Optional[torch.FloatTensor]
- logits: torch.FloatTensor
+ loss: Optional[torch.FloatTensor] = None
+ logits: torch.FloatTensor = None
decoder_past_key_values: Optional[List[torch.FloatTensor]] = None
decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
@@ -436,8 +436,8 @@ class MultipleChoiceModelOutput(ModelOutput):
heads.
"""
- loss: Optional[torch.FloatTensor]
- logits: torch.FloatTensor
+ loss: Optional[torch.FloatTensor] = None
+ logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@@ -465,8 +465,8 @@ class TokenClassifierOutput(ModelOutput):
heads.
"""
- loss: Optional[torch.FloatTensor]
- logits: torch.FloatTensor
+ loss: Optional[torch.FloatTensor] = None
+ logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@@ -496,9 +496,9 @@ class QuestionAnsweringModelOutput(ModelOutput):
heads.
"""
- loss: Optional[torch.FloatTensor]
- start_logits: torch.FloatTensor
- end_logits: torch.FloatTensor
+ loss: Optional[torch.FloatTensor] = None
+ start_logits: torch.FloatTensor = None
+ end_logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@@ -547,9 +547,9 @@ class Seq2SeqQuestionAnsweringModelOutput(ModelOutput):
self-attention heads.
"""
- loss: Optional[torch.FloatTensor]
- start_logits: torch.FloatTensor
- end_logits: torch.FloatTensor
+ loss: Optional[torch.FloatTensor] = None
+ start_logits: torch.FloatTensor = None
+ end_logits: torch.FloatTensor = None
decoder_past_key_values: Optional[List[torch.FloatTensor]] = None
decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
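
These `= None` defaults matter because a forward pass only fills the fields it actually computed (no loss at inference time, for example). A small sketch, assuming the ModelOutput base class keeps tuple-style integer indexing over its non-None fields:

```python
import torch
from transformers.modeling_outputs import SequenceClassifierOutput

# Inference-time construction: no loss, only logits. Without the `= None`
# defaults above this would raise a TypeError for the missing `loss` argument.
logits = torch.randn(1, 3)
out = SequenceClassifierOutput(logits=logits)

assert out.loss is None
assert torch.equal(out.logits, logits)

# Tuple-style integer indexing is kept for older calling code; None fields
# are skipped, so index 0 is the logits here.
assert torch.equal(out[0], logits)
```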
diff --git a/src/transformers/modeling_reformer.py b/src/transformers/modeling_reformer.py
index 6beed9df7afb..8109d6b98f91 100644
--- a/src/transformers/modeling_reformer.py
+++ b/src/transformers/modeling_reformer.py
@@ -39,13 +39,7 @@
add_start_docstrings,
add_start_docstrings_to_callable,
)
-from .modeling_outputs import (
- BaseModelOutput,
- CausalLMOutput,
- MaskedLMOutput,
- QuestionAnsweringModelOutput,
- SequenceClassifierOutput,
-)
+from .modeling_outputs import CausalLMOutput, MaskedLMOutput, QuestionAnsweringModelOutput, SequenceClassifierOutput
from .modeling_utils import PreTrainedModel, apply_chunking_to_forward
@@ -1851,8 +1845,8 @@ class ReformerModelWithLMHeadOutput(ModelOutput):
heads.
"""
- loss: Optional[torch.FloatTensor]
- logits: torch.FloatTensor
+ loss: Optional[torch.FloatTensor] = None
+ logits: torch.FloatTensor = None
past_buckets_states: Optional[List[Tuple[torch.LongTensor, torch.FloatTensor]]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@@ -1922,8 +1916,9 @@ class ReformerModelWithLMHeadOutput(ModelOutput):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
- return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
- If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
+ return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
+ If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
+ plain tuple.
"""
@@ -1962,7 +1957,7 @@ def _prune_heads(self, heads_to_prune):
@add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="google/reformer-crime-and-punishment",
- output_type=BaseModelOutput,
+ output_type=ReformerModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
@@ -1977,40 +1972,14 @@ def forward(
use_cache=None,
output_hidden_states=None,
output_attentions=None,
- return_tuple=None,
+ return_dict=None,
):
- r"""
- Return:
- :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
- last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
- Sequence of hidden-states at the output of the last layer of the model.
- past_buckets_states (:obj:`List[Tuple(torch.LongTensor, torch.FloatTensor)]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
- List of :obj:`tuple(torch.LongTensor, torch.FloatTensor` of length :obj:`config.n_layers`, with :obj:`tuple(0)` being the previous `buckets` of shape
- :obj:`(batch_size, num_heads, num_hashes, sequence_length)`)
- and :obj:`tuple(1)` being the previous `hidden_states` of shape
- :obj:`(batch_size, sequence_length, hidden_size)`).
-
- Contains pre-computed buckets and hidden-states that can be used (see
- ``past_buckets_states`` input) to speed up sequential decoding.
- all_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
- Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
- of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-
- Hidden-states of the model at the output of each layer plus the initial embedding outputs.
- all_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
- Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
- :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-
- Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
- heads.
- """
-
use_cache = use_cache if use_cache is not None else self.config.use_cache
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -2102,7 +2071,7 @@ def forward(
hidden_states = encoder_outputs.all_hidden_states if output_hidden_states else None
attentions = encoder_outputs.all_attentions if output_attentions else None
- if return_tuple:
+ if not return_dict:
return tuple(v for v in [sequence_output, past_buckets_states, hidden_states, attentions] if v is not None)
return ReformerModelOutput(
last_hidden_state=sequence_output,
@@ -2208,7 +2177,7 @@ def forward(
use_cache=None,
output_hidden_states=None,
output_attentions=None,
- return_tuple=None,
+ return_dict=None,
labels=None,
):
r"""
@@ -2218,7 +2187,7 @@ def forward(
All labels set to ``-100`` are ignored (masked), the loss is only
computed for labels in ``[0, ..., config.vocab_size]``
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
reformer_outputs = self.reformer(
input_ids,
@@ -2231,7 +2200,7 @@ def forward(
use_cache=use_cache,
output_hidden_states=output_hidden_states,
output_attentions=output_attentions,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
sequence_output = reformer_outputs[0]
@@ -2246,7 +2215,7 @@ def forward(
loss_fct = CrossEntropyLoss()
loss = loss_fct(shift_logits.view(-1, self.config.vocab_size), shift_labels.view(-1))
- if return_tuple:
+ if not return_dict:
output = (logits,) + reformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
@@ -2326,7 +2295,7 @@ def forward(
labels=None,
output_hidden_states=None,
output_attentions=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
@@ -2334,7 +2303,7 @@ def forward(
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
reformer_outputs = self.reformer(
input_ids,
@@ -2346,7 +2315,7 @@ def forward(
use_cache=False, # no causal mask
output_hidden_states=output_hidden_states,
output_attentions=output_attentions,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
sequence_output = reformer_outputs[0]
@@ -2357,7 +2326,7 @@ def forward(
loss_fct = CrossEntropyLoss() # -100 index = padding token
masked_lm_loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1))
- if return_tuple:
+ if not return_dict:
output = (logits,) + reformer_outputs[1:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
@@ -2408,7 +2377,7 @@ def forward(
labels=None,
output_hidden_states=None,
output_attentions=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -2427,7 +2396,7 @@ def forward(
num_hashes=num_hashes,
output_hidden_states=output_hidden_states,
output_attentions=output_attentions,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
sequence_output = outputs[0]
@@ -2443,7 +2412,7 @@ def forward(
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
- if return_tuple:
+ if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
@@ -2511,7 +2480,7 @@ def forward(
end_positions=None,
output_hidden_states=None,
output_attentions=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -2523,7 +2492,7 @@ def forward(
Positions are clamped to the length of the sequence (`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss.
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
reformer_outputs = self.reformer(
input_ids,
@@ -2535,7 +2504,7 @@ def forward(
use_cache=False, # no causal mask
output_hidden_states=output_hidden_states,
output_attentions=output_attentions,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
sequence_output = reformer_outputs[0]
@@ -2562,7 +2531,7 @@ def forward(
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
- if return_tuple:
+ if not return_dict:
output = (start_logits, end_logits) + reformer_outputs[1:]
return ((total_loss,) + output) if total_loss is not None else output
diff --git a/src/transformers/modeling_roberta.py b/src/transformers/modeling_roberta.py
index 00a0ecc397c7..7779e81eceef 100644
--- a/src/transformers/modeling_roberta.py
+++ b/src/transformers/modeling_roberta.py
@@ -143,8 +143,9 @@ def create_position_ids_from_inputs_embeds(self, inputs_embeds):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
- return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
- If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
+ return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
+ If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
+ plain tuple.
"""
@@ -208,7 +209,7 @@ def forward(
labels=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
**kwargs
):
r"""
@@ -227,7 +228,7 @@ def forward(
)
labels = kwargs.pop("masked_lm_labels")
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.roberta(
input_ids,
@@ -238,7 +239,7 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
sequence_output = outputs[0]
prediction_scores = self.lm_head(sequence_output)
@@ -248,7 +249,7 @@ def forward(
loss_fct = CrossEntropyLoss()
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
- if return_tuple:
+ if not return_dict:
output = (prediction_scores,) + outputs[2:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
@@ -321,7 +322,7 @@ def forward(
labels=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -330,7 +331,7 @@ def forward(
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.roberta(
input_ids,
@@ -341,7 +342,7 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.classifier(sequence_output)
@@ -356,7 +357,7 @@ def forward(
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
- if return_tuple:
+ if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
@@ -401,7 +402,7 @@ def forward(
inputs_embeds=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -409,7 +410,7 @@ def forward(
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above)
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
@@ -431,7 +432,7 @@ def forward(
inputs_embeds=flat_inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
pooled_output = outputs[1]
@@ -444,7 +445,7 @@ def forward(
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
- if return_tuple:
+ if not return_dict:
output = (reshaped_logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
@@ -490,14 +491,14 @@ def forward(
labels=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.roberta(
input_ids,
@@ -508,7 +509,7 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
sequence_output = outputs[0]
@@ -530,7 +531,7 @@ def forward(
else:
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
- if return_tuple:
+ if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
@@ -595,7 +596,7 @@ def forward(
end_positions=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -607,7 +608,7 @@ def forward(
Positions are clamped to the length of the sequence (`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss.
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.roberta(
input_ids,
@@ -618,7 +619,7 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
sequence_output = outputs[0]
@@ -645,7 +646,7 @@ def forward(
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
- if return_tuple:
+ if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
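
Also visible in the hunks above: the masked-LM heads still accept the deprecated `masked_lm_labels` kwarg and funnel it into `labels` with a FutureWarning. A sketch of both spellings, assuming the roberta-base checkpoint is available:

```python
import warnings

from transformers import RobertaForMaskedLM, RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaForMaskedLM.from_pretrained("roberta-base", return_dict=True)

inputs = tokenizer("The capital of France is <mask>.", return_tensors="pt")

# Preferred spelling: `labels=`; the loss comes back as a named field.
out = model(**inputs, labels=inputs["input_ids"])
print(float(out.loss))

# Deprecated spelling still works for now but raises a FutureWarning.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", FutureWarning)
    legacy = model(**inputs, masked_lm_labels=inputs["input_ids"])
```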
diff --git a/src/transformers/modeling_t5.py b/src/transformers/modeling_t5.py
index 925ab53e3695..d7665ba2017d 100644
--- a/src/transformers/modeling_t5.py
+++ b/src/transformers/modeling_t5.py
@@ -675,7 +675,7 @@ def forward(
use_cache=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
use_cache = use_cache if use_cache is not None else self.config.use_cache
@@ -683,7 +683,7 @@ def forward(
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -787,7 +787,7 @@ def forward(
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
- if return_tuple:
+ if not return_dict:
return tuple(
v
for v in [hidden_states, present_key_value_states, all_hidden_states, all_attentions]
@@ -868,8 +868,9 @@ def forward(
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
- return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
- If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
+ return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
+ If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
+ plain tuple.
"""
@@ -930,7 +931,7 @@ def forward(
head_mask=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
**kwargs,
):
r"""
@@ -957,7 +958,7 @@ def forward(
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
use_cache = use_cache if use_cache is not None else self.config.use_cache
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Encode if needed (training, first prediction pass)
if encoder_outputs is None:
@@ -968,9 +969,9 @@ def forward(
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
- elif not return_tuple and not isinstance(encoder_outputs, BaseModelOutput):
+ elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
encoder_outputs = BaseModelOutput(
last_hidden_state=encoder_outputs[0],
hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
@@ -1005,11 +1006,11 @@ def forward(
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
past = (encoder_outputs, decoder_outputs[1]) if use_cache is True else None
- if return_tuple:
+ if not return_dict:
if past is not None:
decoder_outputs = decoder_outputs[:1] + (past,) + decoder_outputs[2:]
return decoder_outputs + encoder_outputs
@@ -1081,7 +1082,7 @@ def forward(
head_mask=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
**kwargs,
):
r"""
@@ -1100,13 +1101,14 @@ def forward(
>>> from transformers import T5Tokenizer, T5ForConditionalGeneration
>>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
- >>> model = T5ForConditionalGeneration.from_pretrained('t5-small')
+ >>> model = T5ForConditionalGeneration.from_pretrained('t5-small', return_dict=True)
>>> input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt") # Batch size 1
>>> outputs = model(input_ids=input_ids, labels=input_ids)
- >>> loss, prediction_scores = outputs[:2]
+ >>> loss = outputs.loss
+ >>> logits = outputs.logits
>>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
- >>> model = T5ForConditionalGeneration.from_pretrained('t5-small')
+ >>> model = T5ForConditionalGeneration.from_pretrained('t5-small', return_dict=True)
>>> input_ids = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="pt") # Batch size 1
>>> outputs = model.generate(input_ids)
"""
@@ -1126,7 +1128,7 @@ def forward(
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
use_cache = use_cache if use_cache is not None else self.config.use_cache
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Encode if needed (training, first prediction pass)
if encoder_outputs is None:
@@ -1138,9 +1140,9 @@ def forward(
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
- elif not return_tuple and not isinstance(encoder_outputs, BaseModelOutput):
+ elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
encoder_outputs = BaseModelOutput(
last_hidden_state=encoder_outputs[0],
hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
@@ -1174,7 +1176,7 @@ def forward(
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
sequence_output = decoder_outputs[0]
@@ -1190,7 +1192,7 @@ def forward(
# TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666
past = (encoder_outputs, decoder_outputs[1]) if use_cache is True else None
- if return_tuple:
+ if not return_dict:
if past is not None:
decoder_outputs = decoder_outputs[:1] + (past,) + decoder_outputs[2:]
output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs
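
The encoder_outputs branch in T5 (keyed on `return_dict` so that a legacy tuple is promoted only when a ModelOutput return is requested) can be exercised standalone. A sketch using the real BaseModelOutput class:

```python
import torch
from transformers.modeling_outputs import BaseModelOutput


def wrap_encoder_outputs(encoder_outputs, return_dict):
    # Mirrors the T5 branch: promote a legacy tuple to a BaseModelOutput
    # only when the caller asked for ModelOutput-style returns.
    if return_dict and not isinstance(encoder_outputs, BaseModelOutput):
        encoder_outputs = BaseModelOutput(
            last_hidden_state=encoder_outputs[0],
            hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
            attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
        )
    return encoder_outputs


legacy = (torch.zeros(2, 7, 512),)  # tuple produced by old-style calling code
print(type(wrap_encoder_outputs(legacy, return_dict=True)).__name__)   # BaseModelOutput
print(type(wrap_encoder_outputs(legacy, return_dict=False)).__name__)  # tuple
```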
diff --git a/src/transformers/modeling_transfo_xl.py b/src/transformers/modeling_transfo_xl.py
index ca98fe5abc5d..bdad2f406d93 100644
--- a/src/transformers/modeling_transfo_xl.py
+++ b/src/transformers/modeling_transfo_xl.py
@@ -618,7 +618,7 @@ class TransfoXLModelOutput(ModelOutput):
"""
last_hidden_state: torch.FloatTensor
- mems: List[torch.FloatTensor]
+ mems: List[torch.FloatTensor] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@@ -650,9 +650,9 @@ class TransfoXLLMHeadModelOutput(ModelOutput):
heads.
"""
- losses: Optional[torch.FloatTensor]
- prediction_scores: torch.FloatTensor
- mems: List[torch.FloatTensor]
+ losses: Optional[torch.FloatTensor] = None
+ prediction_scores: torch.FloatTensor = None
+ mems: List[torch.FloatTensor] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@@ -695,8 +695,9 @@ class TransfoXLLMHeadModelOutput(ModelOutput):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
- return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
- If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
+ return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
+ If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
+ plain tuple.
"""
@@ -836,13 +837,13 @@ def forward(
inputs_embeds=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library
# so we transpose here from shape [bsz, len] to shape [len, bsz]
@@ -941,7 +942,7 @@ def forward(
# We transpose back here to shape [bsz, len, hidden_dim]
core_out = core_out.transpose(0, 1).contiguous()
- if return_tuple:
+ if not return_dict:
return tuple(v for v in [core_out, new_mems, hids, attentions] if v is not None)
return TransfoXLModelOutput(
@@ -1013,7 +1014,7 @@ def forward(
labels=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
@@ -1023,7 +1024,7 @@ def forward(
All labels set to ``-100`` are ignored (masked), the loss is only
computed for labels in ``[0, ..., config.vocab_size]``
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None:
bsz, tgt_len = input_ids.size(0), input_ids.size(1)
elif inputs_embeds is not None:
@@ -1038,7 +1039,7 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
last_hidden = transformer_outputs[0]
@@ -1048,7 +1049,7 @@ def forward(
prediction_scores = softmax_output.view(bsz, tgt_len, -1) if labels is None else ()
loss = softmax_output.view(bsz, tgt_len - 1) if labels is not None else None
- if return_tuple:
+ if not return_dict:
output = (prediction_scores,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 7296ba4ac4d7..d1d548ffbd78 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -1167,7 +1167,7 @@ def forward(
cls_index: Optional[torch.LongTensor] = None,
is_impossible: Optional[torch.LongTensor] = None,
p_mask: Optional[torch.FloatTensor] = None,
- return_tuple: bool = False,
+ return_dict: bool = False,
) -> Union[SquadHeadOutput, Tuple[torch.FloatTensor]]:
"""
Args:
@@ -1184,8 +1184,8 @@ def forward(
p_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len)`, `optional`):
Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS).
1.0 means token should be masked.
- return_tuple (:obj:`bool`, `optional`, defaults to :obj:`False`):
- Whether or not to return a plain tuple instead of a :class:`~transformers.file_utils.ModelOuput`.
+ return_dict (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
Returns:
"""
@@ -1214,7 +1214,7 @@ def forward(
# note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss
total_loss += cls_loss * 0.5
- return (total_loss,) if return_tuple else SquadHeadOutput(loss=total_loss)
+ return SquadHeadOutput(loss=total_loss) if return_dict else (total_loss,)
else:
# during inference, compute the end logits based on beam search
@@ -1244,7 +1244,7 @@ def forward(
start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs)
cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index)
- if return_tuple:
+ if not return_dict:
return (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits)
else:
return SquadHeadOutput(
diff --git a/src/transformers/modeling_xlm.py b/src/transformers/modeling_xlm.py
index 9a366cee6bdd..27a8ed21aa86 100644
--- a/src/transformers/modeling_xlm.py
+++ b/src/transformers/modeling_xlm.py
@@ -367,8 +367,9 @@ class XLMForQuestionAnsweringOutput(ModelOutput):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
- return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
- If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
+ return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
+ If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
+ plain tuple.
"""
@@ -482,13 +483,13 @@ def forward(
inputs_embeds=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None:
bs, slen = input_ids.size()
@@ -595,7 +596,7 @@ def forward(
# move back sequence length to dimension 0
# tensor = tensor.transpose(0, 1)
- if return_tuple:
+ if not return_dict:
return tuple(v for v in [tensor, hidden_states, attentions] if v is not None)
return BaseModelOutput(last_hidden_state=tensor, hidden_states=hidden_states, attentions=attentions)
@@ -693,7 +694,7 @@ def forward(
labels=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
@@ -703,7 +704,7 @@ def forward(
All labels set to ``-100`` are ignored (masked), the loss is only
computed for labels in ``[0, ..., config.vocab_size]``
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
transformer_outputs = self.transformer(
input_ids,
@@ -717,13 +718,13 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
output = transformer_outputs[0]
outputs = self.pred_layer(output, labels) # (loss, logits) or (logits,) depending on if labels are provided.
- if return_tuple:
+ if not return_dict:
return outputs + transformer_outputs[1:]
return MaskedLMOutput(
@@ -770,7 +771,7 @@ def forward(
labels=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -779,7 +780,7 @@ def forward(
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
transformer_outputs = self.transformer(
input_ids,
@@ -793,7 +794,7 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
output = transformer_outputs[0]
@@ -809,7 +810,7 @@ def forward(
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
- if return_tuple:
+ if not return_dict:
output = (logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
@@ -857,7 +858,7 @@ def forward(
end_positions=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -869,7 +870,7 @@ def forward(
Positions are clamped to the length of the sequence (`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss.
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
transformer_outputs = self.transformer(
input_ids,
@@ -883,7 +884,7 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
sequence_output = transformer_outputs[0]
@@ -910,7 +911,7 @@ def forward(
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
- if return_tuple:
+ if not return_dict:
output = (start_logits, end_logits) + transformer_outputs[1:]
return ((total_loss,) + output) if total_loss is not None else output
@@ -957,7 +958,7 @@ def forward(
p_mask=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -984,7 +985,7 @@ def forward(
>>> import torch
>>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
- >>> model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048')
+ >>> model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048', return_dict=True)
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
>>> start_positions = torch.tensor([1])
@@ -993,7 +994,7 @@ def forward(
>>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
>>> loss = outputs.loss
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
transformer_outputs = self.transformer(
input_ids,
@@ -1007,7 +1008,7 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
output = transformer_outputs[0]
@@ -1019,10 +1020,10 @@ def forward(
cls_index=cls_index,
is_impossible=is_impossible,
p_mask=p_mask,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
- if return_tuple:
+ if not return_dict:
return outputs + transformer_outputs[1:]
return XLMForQuestionAnsweringOutput(
@@ -1074,14 +1075,14 @@ def forward(
labels=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.transformer(
input_ids,
@@ -1095,7 +1096,7 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
sequence_output = outputs[0]
@@ -1117,7 +1118,7 @@ def forward(
else:
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
- if return_tuple:
+ if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
@@ -1162,7 +1163,7 @@ def forward(
labels=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
labels (:obj:`torch.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -1170,7 +1171,7 @@ def forward(
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above)
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
@@ -1204,7 +1205,7 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
output = transformer_outputs[0]
logits = self.sequence_summary(output)
@@ -1216,7 +1217,7 @@ def forward(
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
- if return_tuple:
+ if not return_dict:
output = (reshaped_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
diff --git a/src/transformers/modeling_xlm_roberta.py b/src/transformers/modeling_xlm_roberta.py
index 775e3451c428..b76d9744407b 100644
--- a/src/transformers/modeling_xlm_roberta.py
+++ b/src/transformers/modeling_xlm_roberta.py
@@ -53,12 +53,6 @@
config (:class:`~transformers.XLMRobertaConfig`): Model configuration class with all the parameters of the
model. Initializing with a config file does not load the weights associated with the model, only the configuration.
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
- output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
- If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
- output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
- If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
- return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
- If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
"""
diff --git a/src/transformers/modeling_xlnet.py b/src/transformers/modeling_xlnet.py
index 4448313817e0..e0892661ffbb 100644
--- a/src/transformers/modeling_xlnet.py
+++ b/src/transformers/modeling_xlnet.py
@@ -627,8 +627,8 @@ class XLNetLMHeadModelOutput(ModelOutput):
heads.
"""
- loss: Optional[torch.FloatTensor]
- logits: torch.FloatTensor
+ loss: Optional[torch.FloatTensor] = None
+ logits: torch.FloatTensor = None
mems: Optional[List[torch.FloatTensor]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@@ -661,8 +661,8 @@ class XLNetForSequenceClassificationOutput(ModelOutput):
heads.
"""
- loss: Optional[torch.FloatTensor]
- logits: torch.FloatTensor
+ loss: Optional[torch.FloatTensor] = None
+ logits: torch.FloatTensor = None
mems: Optional[List[torch.FloatTensor]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@@ -695,8 +695,8 @@ class XLNetForTokenClassificationOutput(ModelOutput):
heads.
"""
- loss: Optional[torch.FloatTensor]
- logits: torch.FloatTensor
+ loss: Optional[torch.FloatTensor] = None
+ logits: torch.FloatTensor = None
mems: Optional[List[torch.FloatTensor]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@@ -731,8 +731,8 @@ class XLNetForMultipleChoiceOutput(ModelOutput):
heads.
"""
- loss: Optional[torch.FloatTensor]
- logits: torch.FloatTensor
+ loss: Optional[torch.FloatTensor] = None
+ logits: torch.FloatTensor = None
mems: Optional[List[torch.FloatTensor]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@@ -767,9 +767,9 @@ class XLNetForQuestionAnsweringSimpleOutput(ModelOutput):
heads.
"""
- loss: Optional[torch.FloatTensor]
- start_logits: torch.FloatTensor
- end_logits: torch.FloatTensor
+ loss: Optional[torch.FloatTensor] = None
+ start_logits: torch.FloatTensor = None
+ end_logits: torch.FloatTensor = None
mems: Optional[List[torch.FloatTensor]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@@ -891,8 +891,9 @@ class XLNetForQuestionAnsweringOutput(ModelOutput):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
- return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
- If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
+ return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
+ If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
+ plain tuple.
"""
@@ -1051,13 +1052,13 @@ def forward(
use_cache=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)
# the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end
@@ -1239,7 +1240,7 @@ def forward(
else:
attentions = tuple(t.permute(2, 3, 0, 1).contiguous() for t in attentions)
- if return_tuple:
+ if not return_dict:
return tuple(v for v in [output, new_mems, hidden_states, attentions] if v is not None)
return XLNetModelOutput(
@@ -1325,7 +1326,7 @@ def forward(
use_cache=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_predict)`, `optional`, defaults to :obj:`None`):
@@ -1344,7 +1345,7 @@ def forward(
import torch
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
- model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased')
+ model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased', return_dict=True)
# We show how to setup inputs to predict a next token using a bi-directional context.
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very ", add_special_tokens=False)).unsqueeze(0) # We will predict the masked token
@@ -1369,7 +1370,7 @@ def forward(
loss = outputs.loss
next_token_logits = outputs.logits # Logits have shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)
transformer_outputs = self.transformer(
@@ -1385,7 +1386,7 @@ def forward(
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
logits = self.lm_loss(transformer_outputs[0])
@@ -1396,7 +1397,7 @@ def forward(
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
- if return_tuple:
+ if not return_dict:
output = (logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
@@ -1447,7 +1448,7 @@ def forward(
use_cache=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`)
@@ -1456,7 +1457,7 @@ def forward(
If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)
transformer_outputs = self.transformer(
@@ -1472,7 +1473,7 @@ def forward(
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
output = transformer_outputs[0]
@@ -1489,7 +1490,7 @@ def forward(
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
- if return_tuple:
+ if not return_dict:
output = (logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
@@ -1539,7 +1540,7 @@ def forward(
use_cache=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -1547,7 +1548,7 @@ def forward(
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above)
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)
outputs = self.transformer(
@@ -1563,7 +1564,7 @@ def forward(
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
sequence_output = outputs[0]
@@ -1584,7 +1585,7 @@ def forward(
else:
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
- if return_tuple:
+ if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
@@ -1634,7 +1635,7 @@ def forward(
use_cache=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -1642,7 +1643,7 @@ def forward(
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above)
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
@@ -1669,7 +1670,7 @@ def forward(
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
output = transformer_outputs[0]
@@ -1683,7 +1684,7 @@ def forward(
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels.view(-1))
- if return_tuple:
+ if not return_dict:
output = (reshaped_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
@@ -1734,7 +1735,7 @@ def forward(
use_cache=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -1746,7 +1747,7 @@ def forward(
Positions are clamped to the length of the sequence (`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss.
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)
outputs = self.transformer(
@@ -1762,7 +1763,7 @@ def forward(
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
sequence_output = outputs[0]
@@ -1789,7 +1790,7 @@ def forward(
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
- if return_tuple:
+ if not return_dict:
output = (start_logits, end_logits) + outputs[1:]
return ((total_loss,) + output) if total_loss is not None else output
@@ -1842,7 +1843,7 @@ def forward(
use_cache=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -1869,7 +1870,7 @@ def forward(
>>> import torch
>>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
- >>> model = XLNetForQuestionAnswering.from_pretrained('xlnet-base-cased')
+ >>> model = XLNetForQuestionAnswering.from_pretrained('xlnet-base-cased', return_dict=True)
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
>>> start_positions = torch.tensor([1])
@@ -1878,7 +1879,7 @@ def forward(
>>> loss = outputs.loss
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)
transformer_outputs = self.transformer(
@@ -1894,7 +1895,7 @@ def forward(
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
hidden_states = transformer_outputs[0]
start_logits = self.start_logits(hidden_states, p_mask=p_mask)
@@ -1924,7 +1925,7 @@ def forward(
# note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss
total_loss += cls_loss * 0.5
- if return_tuple:
+ if not return_dict:
return (total_loss,) + transformer_outputs[1:]
else:
return XLNetForQuestionAnsweringOutput(
@@ -1966,7 +1967,7 @@ def forward(
hidden_states, start_states=start_states, cls_index=cls_index
) # Shape (batch size,): one single `cls_logits` for each sample
- if return_tuple:
+ if not return_dict:
outputs = (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits)
return outputs + transformer_outputs[1:]
else:
diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py
index 2e6b1f4917a7..8eba3c8e9c91 100755
--- a/src/transformers/pipelines.py
+++ b/src/transformers/pipelines.py
@@ -2122,6 +2122,6 @@ def pipeline(
"Model might be a PyTorch model (ending with `.bin`) but PyTorch is not available. "
"Trying to load the model with Tensorflow."
)
- model = model_class.from_pretrained(model, config=config, return_tuple=True, **model_kwargs)
+ model = model_class.from_pretrained(model, config=config, **model_kwargs)
return task_class(model=model, tokenizer=tokenizer, modelcard=modelcard, framework=framework, task=task, **kwargs)
diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 2df9113e1e27..e2be9f5a7fcc 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -661,9 +661,7 @@ def _prepare_inputs(
if self.args.past_index >= 0 and self._past is not None:
inputs["mems"] = self._past
- # Our model outputs do not work with DataParallel, so forcing return tuple.
- if isinstance(model, nn.DataParallel):
- inputs["return_tuple"] = True
+
return inputs
def training_step(
diff --git a/templates/adding_a_new_model/modeling_xxx.py b/templates/adding_a_new_model/modeling_xxx.py
index f1e031bc3228..73676ed249fe 100644
--- a/templates/adding_a_new_model/modeling_xxx.py
+++ b/templates/adding_a_new_model/modeling_xxx.py
@@ -260,8 +260,9 @@ def _init_weights(self, module):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
- return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
- If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
+ return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
+ If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
+ plain tuple.
"""
@@ -310,13 +311,13 @@ def forward(
inputs_embeds=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -351,7 +352,7 @@ def forward(
sequence_output = encoder_outputs[0]
pooled_output = self.pooler(sequence_output)
- if return_tuple:
+ if not return_dict:
return (sequence_output, pooled_output) + encoder_outputs[1:]
return BaseModelOutputWithPooling(
@@ -393,7 +394,7 @@ def forward(
labels=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
@@ -402,7 +403,7 @@ def forward(
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
in ``[0, ..., config.vocab_size]``
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.transformer(
input_ids,
@@ -413,7 +414,7 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
sequence_output = outputs[0]
@@ -424,7 +425,7 @@ def forward(
loss_fct = CrossEntropyLoss() # -100 index = padding token
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
- if return_tuple:
+ if not return_dict:
output = (prediction_scores,) + outputs[2:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
@@ -470,7 +471,7 @@ def forward(
labels=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -479,7 +480,7 @@ def forward(
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.transformer(
input_ids,
@@ -490,7 +491,7 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
pooled_output = outputs[1]
@@ -508,7 +509,7 @@ def forward(
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
- if return_tuple:
+ if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
@@ -550,7 +551,7 @@ def forward(
labels=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -558,7 +559,7 @@ def forward(
Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above)
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
@@ -580,7 +581,7 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
pooled_output = outputs[1]
@@ -594,7 +595,7 @@ def forward(
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
- if return_tuple:
+ if not return_dict:
output = (reshaped_logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
@@ -637,14 +638,14 @@ def forward(
labels=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.transformer(
input_ids,
@@ -655,7 +656,7 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
sequence_output = outputs[0]
@@ -677,7 +678,7 @@ def forward(
else:
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
- if return_tuple:
+ if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
@@ -720,7 +721,7 @@ def forward(
end_positions=None,
output_attentions=None,
output_hidden_states=None,
- return_tuple=None,
+ return_dict=None,
):
r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -732,7 +733,7 @@ def forward(
Positions are clamped to the length of the sequence (`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss.
"""
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.transformer(
input_ids,
@@ -743,7 +744,7 @@ def forward(
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
- return_tuple=return_tuple,
+ return_dict=return_dict,
)
sequence_output = outputs[0]
@@ -770,7 +771,7 @@ def forward(
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
- if return_tuple:
+ if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index f6841cb84465..8207f70f3190 100644
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -74,6 +74,7 @@ def _prepare_for_class(self, inputs_dict, model_class):
def test_save_load(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+ config.return_dict = True
for model_class in self.all_model_classes:
model = model_class(config)
@@ -803,8 +804,6 @@ def test_multigpu_data_parallel_forward(self):
# Wrap model in nn.DataParallel
model = torch.nn.DataParallel(model)
- # Our model outputs do not work with DataParallel, so forcing return tuple.
- inputs_dict["return_tuple"] = True
with torch.no_grad():
_ = model(**self._prepare_for_class(inputs_dict, model_class))
diff --git a/tests/test_modeling_t5.py b/tests/test_modeling_t5.py
index 9c5cd9dbce82..09949f07b248 100644
--- a/tests/test_modeling_t5.py
+++ b/tests/test_modeling_t5.py
@@ -329,7 +329,6 @@ def test_export_to_onnx(self):
import tempfile
config_and_inputs = self.model_tester.prepare_config_and_inputs()
- config_and_inputs[0].return_tuple = True
model = T5Model(config_and_inputs[0]).to(torch_device)
with tempfile.TemporaryDirectory() as tmpdirname:
torch.onnx.export(
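As a quick illustration of what this rename means for callers, here is a minimal sketch (not part of the patch) contrasting the old plain-tuple output with the new ``ModelOutput`` objects. The checkpoint name and input text are placeholders; the only API assumed is the ``return_dict`` flag documented above.

```python
# Minimal sketch (not part of the patch): opting into ModelOutput objects
# now that `return_tuple` has been replaced by `return_dict`.
# The checkpoint and input text below are illustrative placeholders.
import torch
from transformers import BertForSequenceClassification, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", return_dict=True)

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
labels = torch.tensor([1])

# With return_dict=True the forward pass returns a ModelOutput with named fields.
outputs = model(**inputs, labels=labels)
loss, logits = outputs.loss, outputs.logits

# Passing return_dict=False (or leaving the config default) keeps the old
# plain-tuple behaviour, so positional indexing still works.
tuple_outputs = model(**inputs, labels=labels, return_dict=False)
loss, logits = tuple_outputs[0], tuple_outputs[1]
```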
From ec0267475c16a1913e64cb4f81fd54d153e3d815 Mon Sep 17 00:00:00 2001
From: Lysandre Debut
Date: Thu, 30 Jul 2020 11:11:48 -0400
Subject: [PATCH 062/127] Fix FlauBERT GPU test (#6142)
* Fix GPU test
* Remove legacy constructor
---
src/transformers/modeling_flaubert.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/src/transformers/modeling_flaubert.py b/src/transformers/modeling_flaubert.py
index 61ef9d8fc7fe..a778c2f1e368 100644
--- a/src/transformers/modeling_flaubert.py
+++ b/src/transformers/modeling_flaubert.py
@@ -163,11 +163,13 @@ def forward(
else:
bs, slen = inputs_embeds.size()[:-1]
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
+
if lengths is None:
if input_ids is not None:
lengths = (input_ids != self.pad_index).sum(dim=1).long()
else:
- lengths = torch.LongTensor([slen] * bs)
+ lengths = torch.tensor([slen] * bs, device=device)
# mask = input_ids != self.pad_index
# check inputs
@@ -184,8 +186,6 @@ def forward(
# if self.is_decoder and src_enc is not None:
# src_mask = torch.arange(src_len.max(), dtype=torch.long, device=lengths.device) < src_len[:, None]
- device = input_ids.device if input_ids is not None else inputs_embeds.device
-
# position_ids
if position_ids is None:
position_ids = torch.arange(slen, dtype=torch.long, device=device)
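For context, the sketch below (not part of the patch) shows the general failure mode this change avoids: the old code always built ``lengths`` on the CPU, so downstream mask computations mixed CPU and CUDA tensors when the model ran on a GPU. Shapes and values are arbitrary placeholders.

```python
# Illustrative sketch (not part of the patch) of the device mismatch the
# Flaubert fix avoids. Shapes and values are placeholders.
import torch

if torch.cuda.is_available():
    inputs_embeds = torch.randn(2, 5, 8, device="cuda")  # bs=2, slen=5 on the GPU

    # Old behaviour: lengths always lived on the CPU.
    lengths_cpu = torch.LongTensor([5, 5])

    # Comparing a CUDA arange against the CPU lengths raises a
    # "tensors on different devices" RuntimeError:
    # mask = torch.arange(5, device=inputs_embeds.device) < lengths_cpu[:, None]

    # New behaviour: create lengths on the same device as the inputs.
    device = inputs_embeds.device
    lengths = torch.tensor([5, 5], device=device)
    mask = torch.arange(5, device=device) < lengths[:, None]
```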
From e642c7890875974963db09c09a393f9fbf65b48a Mon Sep 17 00:00:00 2001
From: guillaume-be
Date: Thu, 30 Jul 2020 20:11:39 +0200
Subject: [PATCH 063/127] Addition of a DialoguePipeline (#5516)
* initial commit for pipeline implementation
Addition of input processing and history concatenation
* Conversation pipeline tested and working for single & multiple conversation inputs
* Added docstrings for dialogue pipeline
* Addition of dialogue pipeline integration tests
* Delete test_t5.py
* Fixed max code length
* Updated styling
* Fixed test broken by formatting tools
* Removed unused import
* Added unit test for DialoguePipeline
* Fixed Tensorflow compatibility
* Fixed multi-framework support using framework flag
* - Fixed docstring
- Added `min_length_for_response` as an initialization parameter
- Renamed `*args` to `conversations`, `conversations` being a `Conversation` or a `List[Conversation]`
- Updated truncation to truncate entire segments of conversations, instead of cutting in the middle of a user/bot input
* - renamed pipeline name from dialogue to conversational
- removed hardcoded default value of 1000 and use config.max_length instead
- added `append_response` and `set_history` method to the Conversation class to avoid direct fields mutation
- fixed bug in history truncation method
* - Updated ConversationalPipeline to accept only active conversations (otherwise a ValueError is raised)
* - Simplified input tensor conversion
* - Updated attention_mask value for Tensorflow compatibility
* - Updated last dialogue reference to conversational & fixed integration tests
* Fixed conflict with master
* Updates following review comments
* Updated formatting
* Added Conversation and ConversationalPipeline to the library __init__, addition of docstrings for Conversation, added both to the docs
* Update src/transformers/pipelines.py
Updated docsting following review
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
---
docs/source/main_classes/pipelines.rst | 8 +
src/transformers/__init__.py | 2 +
src/transformers/pipelines.py | 326 ++++++++++++++++++++++++-
tests/test_pipelines.py | 95 ++++++-
4 files changed, 428 insertions(+), 3 deletions(-)
diff --git a/docs/source/main_classes/pipelines.rst b/docs/source/main_classes/pipelines.rst
index ea51feb7ca00..214858fb5abe 100644
--- a/docs/source/main_classes/pipelines.rst
+++ b/docs/source/main_classes/pipelines.rst
@@ -71,3 +71,11 @@ TextGenerationPipeline
==========================================
.. autoclass:: transformers.TextGenerationPipeline
+
+
+ConversationalPipeline
+==========================================
+
+.. autoclass:: transformers.Conversation
+
+.. autoclass:: transformers.ConversationalPipeline
\ No newline at end of file
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index a0fc396e5114..18f6d72cefa3 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -104,6 +104,8 @@
# Pipelines
from .pipelines import (
+ Conversation,
+ ConversationalPipeline,
CsvPipelineDataFormat,
FeatureExtractionPipeline,
FillMaskPipeline,
diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py
index 8eba3c8e9c91..b40f734ef2b6 100755
--- a/src/transformers/pipelines.py
+++ b/src/transformers/pipelines.py
@@ -20,11 +20,13 @@
import os
import pickle
import sys
+import uuid
from abc import ABC, abstractmethod
from contextlib import contextmanager
from itertools import chain
from os.path import abspath, exists
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Sequence, Tuple, Union
+from uuid import UUID
import numpy as np
@@ -36,7 +38,7 @@
from .tokenization_auto import AutoTokenizer
from .tokenization_bert import BasicTokenizer
from .tokenization_utils import PreTrainedTokenizer
-from .tokenization_utils_base import PaddingStrategy
+from .tokenization_utils_base import BatchEncoding, PaddingStrategy
if is_tf_available():
@@ -51,6 +53,7 @@
TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING,
+ TFAutoModelForCausalLM,
)
if is_torch_available():
@@ -1895,6 +1898,321 @@ def __call__(
return results
+class Conversation:
+ """
+ Utility class containing a conversation and its history. This class is meant to be used as an input to the
+ :obj:`~transformers.ConversationalPipeline`. The conversation contains a number of utility functions to manage the addition of new
+ user input and generated model responses. A conversation needs to contain an unprocessed user input before being
+ passed to the :obj:`~transformers.ConversationalPipeline`. This user input is either created when the class is instantiated, or by calling
+ `add_user_input("input")` after a conversation turn.
+
+ Usage::
+
+ conversation = Conversation("Going to the movies tonight - any suggestions?")
+
+ # Steps usually performed by the model when generating a response:
+ # 1. Mark the user input as processed (moved to the history)
+ conversation.mark_processed()
+ # 2. Append a model response
+ conversation.append_response("The Big lebowski.")
+
+ conversation.add_user_input("Is it good?")
+
+ Arguments:
+ text (:obj:`str`, `optional`, defaults to :obj:`None`):
+ The initial user input to start the conversation.
+ If :obj:`None`, a user input needs to be provided manually using `add_user_input` before the conversation can begin.
+ conversation_id (:obj:`uuid.UUID`, `optional`, defaults to :obj:`None`):
+ Unique identifier for the conversation
+ If :obj:`None`, a random UUID4 id will be assigned to the conversation.
+ """
+
+ def __init__(self, text: str = None, conversation_id: UUID = None):
+ if not conversation_id:
+ conversation_id = uuid.uuid4()
+ self.uuid: UUID = conversation_id
+ self.past_user_inputs: List[str] = []
+ self.generated_responses: List[str] = []
+ self.history: List[int] = []
+ self.new_user_input: Optional[str] = text
+
+ def add_user_input(self, text: str, overwrite: bool = False):
+ """
+ Add a user input to the conversation for the next round. This populates the internal `new_user_input` field.
+
+ Args:
+ text: str, the user input for the next conversation round
+ overwrite: bool, flag indicating if existing and unprocessed user input should be overwritten when this function is called
+
+ """
+ if self.new_user_input:
+ if overwrite:
+ logger.warning(
+ 'User input added while unprocessed input was existing: "{}" was overwritten with: "{}".'.format(
+ self.new_user_input, text
+ )
+ )
+ self.new_user_input = text
+ else:
+ logger.warning(
+ 'User input added while unprocessed input was existing: "{}" new input ignored: "{}". '
+ "Set `overwrite` to True to overwrite unprocessed user input".format(self.new_user_input, text)
+ )
+ else:
+ self.new_user_input = text
+
+ def mark_processed(self):
+ """
+ Mark the conversation as processed (moves the content of `new_user_input` to `past_user_inputs`) and empties the
+ `new_user_input` field.
+ """
+ if self.new_user_input:
+ self.past_user_inputs.append(self.new_user_input)
+ self.new_user_input = None
+
+ def append_response(self, response: str):
+ """
+ Append a response to the list of generated responses.
+
+ Args:
+ response: str, the model generated response
+ """
+ self.generated_responses.append(response)
+
+ def set_history(self, history: List[int]):
+ """
+ Updates the value of the history of the conversation. The history is represented by a list of `token_ids`. The
+ history is used by the model to generate responses based on the previous conversation turns.
+
+ Args:
+ history: (list of int), history of tokens provided and generated for this conversation
+ """
+ self.history = history
+
+ def __repr__(self):
+ """
+ Generates a string representation of the conversation.
+
+ Return:
+ :obj:`str`:
+
+ Example:
+ Conversation id: 7d15686b-dc94-49f2-9c4b-c9eac6a1f114
+ user >> Going to the movies tonight - any suggestions?
+ bot >> The Big Lebowski
+ """
+ output = "Conversation id: {} \n".format(self.uuid)
+ for user_input, generated_response in zip(self.past_user_inputs, self.generated_responses):
+ output += "user >> {} \n".format(user_input)
+ output += "bot >> {} \n".format(generated_response)
+ if self.new_user_input is not None:
+ output += "user >> {} \n".format(self.new_user_input)
+ return output
+
+
+class ConversationalPipeline(Pipeline):
+ """
+ Multi-turn conversational pipeline.
+
+ Usage::
+
+ conversational_pipeline = pipeline("conversational")
+
+ conversation_1 = Conversation("Going to the movies tonight - any suggestions?")
+ conversation_2 = Conversation("What's the last book you have read?")
+
+ conversational_pipeline([conversation_1, conversation_2])
+
+ conversation_1.add_user_input("Is it an action movie?")
+ conversation_2.add_user_input("What is the genre of this book?")
+
+ conversational_pipeline([conversation_1, conversation_2])
+
+ The models that this pipeline can use are models that have been fine-tuned on a multi-turn conversational task,
+ currently: "microsoft/DialoGPT-small", "microsoft/DialoGPT-medium", "microsoft/DialoGPT-large"
+ See the up-to-date list of available models on
+ `huggingface.co/models <https://huggingface.co/models>`__.
+
+ Arguments:
+ model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
+ The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string
+ checkpoint identifier or an actual pre-trained model inheriting from
+ :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
+ TensorFlow.
+ If :obj:`None`, the default of the pipeline will be loaded.
+ tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
+ The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
+ a string checkpoint identifier or an actual pre-trained tokenizer inheriting from
+ :class:`~transformers.PreTrainedTokenizer`.
+ If :obj:`None`, the default of the pipeline will be loaded.
+ modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
+ Model card attributed to the model for this pipeline.
+ framework (:obj:`str`, `optional`, defaults to :obj:`None`):
+ The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
+ installed.
+ If no framework is specified, will default to the one currently installed. If no framework is specified
+ and both frameworks are installed, will default to PyTorch.
+ args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
+ Reference to the object in charge of parsing supplied pipeline parameters.
+ device (:obj:`int`, `optional`, defaults to :obj:`-1`):
+ Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
+ on the associated CUDA device id.
+ """
+
+ def __init__(self, min_length_for_response=32, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ assert self.tokenizer.eos_token_id is not None, "DialoguePipeline tokenizer should have an EOS token set"
+ if self.tokenizer.pad_token_id is not None:
+ self.pad_token_id = self.tokenizer.pad_token_id
+ else:
+ self.pad_token_id = self.tokenizer.eos_token_id
+ self.min_length_for_response = min_length_for_response
+
+ def __call__(
+ self,
+ conversations: Union[Conversation, List[Conversation]],
+ clean_up_tokenization_spaces=True,
+ **generate_kwargs
+ ):
+ r"""
+ Args:
+ conversations: (list of :class:`~transformers.pipelines.Conversation`) Conversations to generate responses for
+ **generate_kwargs: extra kwargs passed to `self.model.generate`_
+
+ Returns:
+ list of conversations with updated generated responses for those containing a new user input
+ """
+
+ # Input validation
+ if isinstance(conversations, list):
+ for conversation in conversations:
+ assert isinstance(
+ conversation, Conversation
+ ), "DialoguePipeline expects a Conversation or list of Conversations as an input"
+ if conversation.new_user_input is None:
+ raise ValueError(
+ "Conversation with UUID {} does not contain new user input to process. "
+ "Add user inputs with the conversation's `add_user_input` method".format(
+ conversation.uuid
+ )
+ )
+ assert (
+ self.tokenizer.pad_token_id is not None or self.tokenizer.eos_token_id is not None
+ ), "Please make sure that the tokenizer has a pad_token_id or eos_token_id when using a batch input"
+ elif isinstance(conversations, Conversation):
+ conversations = [conversations]
+ else:
+ raise ValueError("DialoguePipeline expects a Conversation or list of Conversations as an input")
+
+ with self.device_placement():
+
+ inputs = self._parse_and_tokenize([conversation.new_user_input for conversation in conversations])
+ histories = [conversation.history for conversation in conversations]
+ max_length = generate_kwargs.get("max_length", self.model.config.max_length)
+ inputs = self._concat_inputs_history(inputs, histories, max_length)
+
+ if self.framework == "pt":
+ inputs = self.ensure_tensor_on_device(**inputs)
+ input_length = inputs["input_ids"].shape[-1]
+
+ elif self.framework == "tf":
+ input_length = tf.shape(inputs["input_ids"])[-1].numpy()
+
+ if input_length > 0.9 * max_length:
+ logger.warning(
+ "Longest conversation length: {} is bigger than 0.9 * max_length: {}. "
+ "You might consider trimming the early phase of the conversation".format(input_length, max_length)
+ )
+ generated_responses = self.model.generate(
+ inputs["input_ids"], attention_mask=inputs["attention_mask"], **generate_kwargs,
+ )
+
+ cleaned_history = self._clean_padding_history(generated_responses)
+ output = []
+ for conversation_index, conversation in enumerate(conversations):
+ conversation.mark_processed()
+ conversation.generated_responses.append(
+ self.tokenizer.decode(
+ cleaned_history[conversation_index][input_length:],
+ skip_special_tokens=True,
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+ )
+ )
+ conversation.set_history(cleaned_history[conversation_index])
+ output.append(conversation)
+ if len(output) == 1:
+ return output[0]
+ else:
+ return output
+
+ def _parse_and_tokenize(self, *args, **kwargs):
+ """
+ Parse arguments and tokenize, adding an EOS token at the end of the user input
+ """
+ # Parse arguments
+ inputs = self._args_parser(*args, **kwargs)
+ inputs = self.tokenizer.batch_encode_plus(inputs, add_special_tokens=False, padding=False).get("input_ids", [])
+ for input in inputs:
+ input.append(self.tokenizer.eos_token_id)
+ return inputs
+
+ def _clean_padding_history(self, generated_tensor) -> List[List[int]]:
+ """
+ Cleans the padding history. Padding may be generated in two places when multiple conversations are provided as
+ an input:
+ - at the end of the concatenated history and new user input, so that all input to the model have the same
+ length
+ - at the end of the generated response, as some responses will be longer than others
+ This method cleans up these padding tokens so that the history for each conversation is not impacted by the
+ batching process.
+ """
+ outputs = []
+ for sequence in generated_tensor:
+ sequence_tokens = []
+ is_previous_pad = False
+ for token in sequence:
+ if token == self.pad_token_id:
+ if is_previous_pad:
+ continue
+ else:
+ is_previous_pad = True
+ else:
+ is_previous_pad = False
+ if self.framework == "pt":
+ sequence_tokens.append(token.item())
+ else:
+ sequence_tokens.append(int(token.numpy()))
+
+ outputs.append(sequence_tokens)
+ return outputs
+
+ def _concat_inputs_history(self, inputs: List[List[int]], histories: List[Optional[List[int]]], max_length: int):
+ """
+ Builds an input prepended by the history for this conversation, allowing multi-turn conversation with context
+ """
+ outputs = []
+ for new_input, history in zip(inputs, histories):
+ if history is not None:
+ new_input = history + new_input
+ if len(new_input) > max_length - self.min_length_for_response:
+ cutoff_eos_index = 0
+ while len(new_input) - cutoff_eos_index > max_length - self.min_length_for_response:
+ if cutoff_eos_index >= len(new_input):
+ break
+ cutoff_eos_index = new_input[cutoff_eos_index:].index(self.tokenizer.eos_token_id)
+ if cutoff_eos_index == 0 or cutoff_eos_index == len(new_input) - 1:
+ break
+ else:
+ new_input = new_input[cutoff_eos_index + 1 :]
+ outputs.append(new_input)
+ max_len = max([len(item) for item in outputs])
+ outputs = [output + [self.pad_token_id] * (max_len - len(output)) for output in outputs]
+ outputs = BatchEncoding(
+ {"input_ids": outputs, "attention_mask": [1] * len(outputs)}, tensor_type=self.framework
+ )
+ return outputs
+
+
# Register all the supported tasks here
SUPPORTED_TASKS = {
"feature-extraction": {
@@ -1979,6 +2297,12 @@ def __call__(
"tokenizer": {"pt": "facebook/bart-large-mnli", "tf": "roberta-large-mnli"},
},
},
+ "conversational": {
+ "impl": ConversationalPipeline,
+ "tf": TFAutoModelForCausalLM if is_tf_available() else None,
+ "pt": AutoModelForCausalLM if is_torch_available() else None,
+ "default": {"model": {"pt": "microsoft/DialoGPT-medium", "tf": "microsoft/DialoGPT-medium"}},
+ },
}
diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index 3f2dd55afbfd..cd11fcfb1c0b 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -2,7 +2,7 @@
from typing import Iterable, List, Optional
from transformers import pipeline
-from transformers.pipelines import SUPPORTED_TASKS, DefaultArgumentHandler, Pipeline
+from transformers.pipelines import SUPPORTED_TASKS, Conversation, DefaultArgumentHandler, Pipeline
from transformers.testing_utils import require_tf, require_torch, slow, torch_device
@@ -28,6 +28,8 @@
]
TF_TRANSLATION_FINETUNED_MODELS = [("patrickvonplaten/t5-tiny-random", "translation_en_to_fr")]
+DIALOGUE_FINETUNED_MODELS = ["microsoft/DialoGPT-medium"]
+
expected_fill_mask_result = [
[
{"sequence": "My name is John", "score": 0.00782308354973793, "token": 610, "token_str": "Ä John"},
@@ -314,6 +316,64 @@ def test_tf_text_generation(self):
nlp = pipeline(task="text-generation", model=model_name, tokenizer=model_name, framework="tf")
self._test_mono_column_pipeline(nlp, VALID_INPUTS, {})
+ @slow
+ @require_torch
+ def test_integration_torch_conversation(self):
+ # When
+ nlp = pipeline(task="conversational", device=DEFAULT_DEVICE_NUM)
+ conversation_1 = Conversation("Going to the movies tonight - any suggestions?")
+ conversation_2 = Conversation("What's the last book you have read?")
+ # Then
+ self.assertEqual(len(conversation_1.past_user_inputs), 0)
+ self.assertEqual(len(conversation_2.past_user_inputs), 0)
+ # When
+ result = nlp([conversation_1, conversation_2], do_sample=False, max_length=1000)
+ # Then
+ self.assertEqual(result, [conversation_1, conversation_2])
+ self.assertEqual(len(result[0].past_user_inputs), 1)
+ self.assertEqual(len(result[1].past_user_inputs), 1)
+ self.assertEqual(len(result[0].generated_responses), 1)
+ self.assertEqual(len(result[1].generated_responses), 1)
+ self.assertEqual(result[0].past_user_inputs[0], "Going to the movies tonight - any suggestions?")
+ self.assertEqual(result[0].generated_responses[0], "The Big Lebowski")
+ self.assertEqual(result[1].past_user_inputs[0], "What's the last book you have read?")
+ self.assertEqual(result[1].generated_responses[0], "The Last Question")
+ # When
+ conversation_2.add_user_input("Why do you recommend it?")
+ result = nlp(conversation_2, do_sample=False, max_length=1000)
+ # Then
+ self.assertEqual(result, conversation_2)
+ self.assertEqual(len(result.past_user_inputs), 2)
+ self.assertEqual(len(result.generated_responses), 2)
+ self.assertEqual(result.past_user_inputs[1], "Why do you recommend it?")
+ self.assertEqual(result.generated_responses[1], "It's a good book.")
+
+ @slow
+ @require_torch
+ def test_integration_torch_conversation_truncated_history(self):
+ # When
+ nlp = pipeline(task="conversational", min_length_for_response=24, device=DEFAULT_DEVICE_NUM)
+ conversation_1 = Conversation("Going to the movies tonight - any suggestions?")
+ # Then
+ self.assertEqual(len(conversation_1.past_user_inputs), 0)
+ # When
+ result = nlp(conversation_1, do_sample=False, max_length=36)
+ # Then
+ self.assertEqual(result, conversation_1)
+ self.assertEqual(len(result.past_user_inputs), 1)
+ self.assertEqual(len(result.generated_responses), 1)
+ self.assertEqual(result.past_user_inputs[0], "Going to the movies tonight - any suggestions?")
+ self.assertEqual(result.generated_responses[0], "The Big Lebowski")
+ # When
+ conversation_1.add_user_input("Is it an action movie?")
+ result = nlp(conversation_1, do_sample=False, max_length=36)
+ # Then
+ self.assertEqual(result, conversation_1)
+ self.assertEqual(len(result.past_user_inputs), 2)
+ self.assertEqual(len(result.generated_responses), 2)
+ self.assertEqual(result.past_user_inputs[1], "Is it an action movie?")
+ self.assertEqual(result.generated_responses[1], "It's a comedy.")
+
QA_FINETUNED_MODELS = ["sshleifer/tiny-distilbert-base-cased-distilled-squad"]
@@ -450,6 +510,38 @@ def test_tf_zero_shot_outputs(self):
self._test_zero_shot_pipeline_outputs(nlp)
+class DialoguePipelineTests(unittest.TestCase):
+ def _test_conversation_pipeline(self, nlp):
+ valid_inputs = [Conversation("Hi there!"), [Conversation("Hi there!"), Conversation("How are you?")]]
+ invalid_inputs = ["Hi there!", Conversation()]
+ self.assertIsNotNone(nlp)
+
+ mono_result = nlp(valid_inputs[0])
+ self.assertIsInstance(mono_result, Conversation)
+
+ multi_result = nlp(valid_inputs[1])
+ self.assertIsInstance(multi_result, list)
+ self.assertIsInstance(multi_result[0], Conversation)
+ # Inactive conversations passed to the pipeline raise a ValueError
+ self.assertRaises(ValueError, nlp, valid_inputs[1])
+
+ for bad_input in invalid_inputs:
+ self.assertRaises(Exception, nlp, bad_input)
+ self.assertRaises(Exception, nlp, invalid_inputs)
+
+ @require_torch
+ def test_torch_conversation(self):
+ for model_name in DIALOGUE_FINETUNED_MODELS:
+ nlp = pipeline(task="conversational", model=model_name, tokenizer=model_name)
+ self._test_conversation_pipeline(nlp)
+
+ @require_tf
+ def test_tf_conversation(self):
+ for model_name in DIALOGUE_FINETUNED_MODELS:
+ nlp = pipeline(task="conversational", model=model_name, tokenizer=model_name, framework="tf")
+ self._test_conversation_pipeline(nlp)
+
+
class QAPipelineTests(unittest.TestCase):
def _test_qa_pipeline(self, nlp):
output_keys = {"score", "answer", "start", "end"}
@@ -593,7 +685,6 @@ def test_tf_ner_grouped(self):
class PipelineCommonTests(unittest.TestCase):
-
pipelines = SUPPORTED_TASKS.keys()
@slow
From f3065abdb8805f5beaed9ff1e92ce874e655f5c9 Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Date: Thu, 30 Jul 2020 14:51:19 -0400
Subject: [PATCH 064/127] Doc tokenizer (#6110)
* Start doc tokenizers
* Tokenizer documentation
* Start doc tokenizers
* Tokenizer documentation
* Formatting after rebase
* Formatting after merge
* Update docs/source/main_classes/tokenizer.rst
Co-authored-by: Lysandre Debut
* Address comment
* Update src/transformers/tokenization_utils_base.py
Co-authored-by: Thomas Wolf
* Address Thom's comments
Co-authored-by: Lysandre Debut
Co-authored-by: Thomas Wolf
---
docs/source/index.rst | 1 +
docs/source/internal/tokenization_utils.rst | 38 +
docs/source/main_classes/tokenizer.rst | 51 +-
src/transformers/modeling_utils.py | 2 +-
src/transformers/tokenization_utils.py | 262 +++--
src/transformers/tokenization_utils_base.py | 1081 ++++++++++++-------
src/transformers/tokenization_utils_fast.py | 208 ++--
7 files changed, 1086 insertions(+), 557 deletions(-)
create mode 100644 docs/source/internal/tokenization_utils.rst
diff --git a/docs/source/index.rst b/docs/source/index.rst
index a9e27953ca28..26e950875ef9 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -206,3 +206,4 @@ conversion utilities for the following models:
model_doc/mobilebert
model_doc/dpr
internal/modeling_utils
+ internal/tokenization_utils
diff --git a/docs/source/internal/tokenization_utils.rst b/docs/source/internal/tokenization_utils.rst
new file mode 100644
index 000000000000..48752c8de261
--- /dev/null
+++ b/docs/source/internal/tokenization_utils.rst
@@ -0,0 +1,38 @@
+Utilities for Tokenizers
+------------------------
+
+This page lists all the utility functions used by the tokenizers, mainly the class
+:class:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase` that implements the common methods between
+:class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast` and the mixin
+:class:`~transformers.tokenization_utils_base.SpecialTokensMixin`.
+
+Most of those are only useful if you are studying the code of the tokenizers in the library.
+
+``PreTrainedTokenizerBase``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.tokenization_utils_base.PreTrainedTokenizerBase
+ :special-members: __call__
+ :members:
+
+
+``SpecialTokensMixin``
+~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.tokenization_utils_base.SpecialTokensMixin
+ :members:
+
+
+Enums and namedtuples
+~~~~~~~~~~~~~~~~~~~~~
+.. autoclass:: transformers.tokenization_utils_base.ExplicitEnum
+
+.. autoclass:: transformers.tokenization_utils_base.PaddingStrategy
+
+.. autoclass:: transformers.tokenization_utils_base.TensorType
+
+.. autoclass:: transformers.tokenization_utils_base.TruncationStrategy
+
+.. autoclass:: transformers.tokenization_utils_base.CharSpan
+
+.. autoclass:: transformers.tokenization_utils_base.TokenSpan
diff --git a/docs/source/main_classes/tokenizer.rst b/docs/source/main_classes/tokenizer.rst
index ee12da184753..a15e516df3e3 100644
--- a/docs/source/main_classes/tokenizer.rst
+++ b/docs/source/main_classes/tokenizer.rst
@@ -1,17 +1,40 @@
Tokenizer
----------------------------------------------------
-A tokenizer is in charge of preparing the inputs for a model. The library comprise tokenizers for all the models. Most of the tokenizers are available in two flavors: a full python implementation and a "Fast" implementation based on the Rust library `tokenizers`. The "Fast" implementations allows (1) a significant speed-up in particular when doing batched tokenization and (2) additional methods to map between the original string (character and words) and the token space (e.g. getting the index of the token comprising a given character or the span of characters corresponding to a given token). Currently no "Fast" implementation is available for the SentencePiece-based tokenizers (for T5, ALBERT, CamemBERT, XLMRoBERTa and XLNet models).
+A tokenizer is in charge of preparing the inputs for a model. The library contains tokenizers for all the models. Most
+of the tokenizers are available in two flavors: a full python implementation and a "Fast" implementation based on the
+Rust library `tokenizers `__. The "Fast" implementations allow:
+
+1. a significant speed-up in particular when doing batched tokenization and
+2. additional methods to map between the original string (character and words) and the token space (e.g. getting the
+ index of the token comprising a given character or the span of characters corresponding to a given token).
+
+Currently no "Fast" implementation is available for the SentencePiece-based tokenizers (for T5, ALBERT, CamemBERT,
+XLMRoBERTa and XLNet models).
+
+The base classes :class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast`
+implement the common methods for encoding string inputs in model inputs (see below) and instantiating/saving python and
+"Fast" tokenizers either from a local file or directory or from a pretrained tokenizer provided by the library
+(downloaded from HuggingFace's AWS S3 repository). They both rely on
+:class:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase` that contains the common methods, and
+:class:`~transformers.tokenization_utils_base.SpecialTokensMixin`.
+
+:class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast` thus implement the main
+methods for using all the tokenizers:
+
+- Tokenizing (splitting strings into sub-word token strings), converting token strings to ids and back, and
+ encoding/decoding (i.e., tokenizing and converting to integers).
+- Adding new tokens to the vocabulary in a way that is independent of the underlying structure (BPE, SentencePiece...).
+- Managing special tokens (like mask, beginning-of-sentence, etc.): adding them, assigning them to attributes in the
+ tokenizer for easy access and making sure they are not split during tokenization.
+
+:class:`~transformers.BatchEncoding` holds the output of the tokenizer's encoding methods (``__call__``,
+``encode_plus`` and ``batch_encode_plus``) and is derived from a Python dictionary. When the tokenizer is a pure python
+tokenizer, this class behaves just like a standard python dictionary and holds the various model inputs computed by these
+methods (``input_ids``, ``attention_mask``...). When the tokenizer is a "Fast" tokenizer (i.e., backed by HuggingFace
+`tokenizers library `__), this class provides in addition several advanced
+alignment methods which can be used to map between the original string (character and words) and the token space (e.g.,
+getting the index of the token comprising a given character or the span of characters corresponding to a given token).
-The base classes ``PreTrainedTokenizer`` and ``PreTrainedTokenizerFast`` implements the common methods for encoding string inputs in model inputs (see below) and instantiating/saving python and "Fast" tokenizers either from a local file or directory or from a pretrained tokenizer provided by the library (downloaded from HuggingFace's AWS S3 repository).
-
-``PreTrainedTokenizer`` and ``PreTrainedTokenizerFast`` thus implements the main methods for using all the tokenizers:
-
-- tokenizing (spliting strings in sub-word token strings), converting tokens strings to ids and back, and encoding/decoding (i.e. tokenizing + convert to integers),
-- adding new tokens to the vocabulary in a way that is independant of the underlying structure (BPE, SentencePiece...),
-- managing special tokens like mask, beginning-of-sentence, etc tokens (adding them, assigning them to attributes in the tokenizer for easy access and making sure they are not split during tokenization)
-
-``BatchEncoding`` holds the output of the tokenizer's encoding methods (``__call__``, ``encode_plus`` and ``batch_encode_plus``) and is derived from a Python dictionary. When the tokenizer is a pure python tokenizer, this class behave just like a standard python dictionary and hold the various model inputs computed by these methodes (``input_ids``, ``attention_mask``...). When the tokenizer is a "Fast" tokenizer (i.e. backed by HuggingFace tokenizers library), this class provides in addition several advanced alignement methods which can be used to map between the original string (character and words) and the token space (e.g. getting the index of the token comprising a given character or the span of characters corresponding to a given token).
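As an illustration of the dictionary behaviour and alignment helpers described above, here is a minimal sketch assuming the `bert-base-uncased` checkpoint and its "Fast" tokenizer are available (an assumption for the example, not part of this patch):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
encoding = tokenizer("Hello world!")

# BatchEncoding behaves like a plain dict of model inputs...
print(encoding["input_ids"])
print(encoding["attention_mask"])

# ...and, with a fast tokenizer, also exposes alignment helpers.
print(encoding.tokens())          # sub-word tokens, e.g. ['[CLS]', 'hello', 'world', '!', '[SEP]']
print(encoding.char_to_token(6))  # index of the token covering the character at position 6
```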
``PreTrainedTokenizer``
~~~~~~~~~~~~~~~~~~~~~~~~
@@ -20,6 +43,7 @@ The base classes ``PreTrainedTokenizer`` and ``PreTrainedTokenizerFast`` impleme
:special-members: __call__
:members:
+
``PreTrainedTokenizerFast``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -27,14 +51,9 @@ The base classes ``PreTrainedTokenizer`` and ``PreTrainedTokenizerFast`` impleme
:special-members: __call__
:members:
+
``BatchEncoding``
~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BatchEncoding
:members:
-
-``SpecialTokensMixin``
-~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.SpecialTokensMixin
- :members:
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index d1d548ffbd78..b06a904c9ed4 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -646,7 +646,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to delete incompletely received files. Will attempt to resume the download if such a
file exists.
- proxies: (:obj:`Dict[str, str], `optional`):
+ proxies (:obj:`Dict[str, str], `optional`):
A dictionary of proxy servers to use by protocol or endpoint, e.g.,
:obj:`{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each
request.
diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py
index d878210f407d..cbe9b34beeff 100644
--- a/src/transformers/tokenization_utils.py
+++ b/src/transformers/tokenization_utils.py
@@ -20,12 +20,13 @@
import logging
import re
import unicodedata
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
from .file_utils import add_end_docstrings
from .tokenization_utils_base import (
ENCODE_KWARGS_DOCSTRING,
ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING,
+ INIT_TOKENIZER_DOCSTRING,
AddedToken,
BatchEncoding,
EncodedInput,
@@ -45,7 +46,7 @@
def _is_whitespace(char):
- """Checks whether `chars` is a whitespace character."""
+ """Checks whether `char` is a whitespace character."""
# \t, \n, and \r are technically contorl characters but we treat them
# as whitespace since they are generally considered as such.
if char == " " or char == "\t" or char == "\n" or char == "\r":
@@ -57,7 +58,7 @@ def _is_whitespace(char):
def _is_control(char):
- """Checks whether `chars` is a control character."""
+ """Checks whether `char` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == "\t" or char == "\n" or char == "\r":
@@ -69,7 +70,7 @@ def _is_control(char):
def _is_punctuation(char):
- """Checks whether `chars` is a punctuation character."""
+ """Checks whether `char` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
@@ -95,8 +96,12 @@ def _is_start_of_word(text):
return bool(_is_control(first_char) | _is_punctuation(first_char) | _is_whitespace(first_char))
+@add_end_docstrings(INIT_TOKENIZER_DOCSTRING, """ .. automethod:: __call__""")
class PreTrainedTokenizer(PreTrainedTokenizerBase):
- """ Base class for all slow tokenizers.
+ """
+ Base class for all slow tokenizers.
+
+ Inherits from :class:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase`.
Handle all the shared methods for tokenization and special tokens as well as methods
downloading/caching/loading pretrained tokenizers as well as adding tokens to the vocabulary.
@@ -104,53 +109,6 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
This class also contain the added tokens in a unified way on top of all tokenizers so we don't
have to handle the specific vocabulary augmentation methods of the various underlying
dictionary structures (BPE, sentencepiece...).
-
- Class attributes (overridden by derived classes):
-
- - ``vocab_files_names``: a python ``dict`` with, as keys, the ``__init__`` keyword name of each vocabulary file
- required by the model, and as associated values, the filename for saving the associated file (string).
- - ``pretrained_vocab_files_map``: a python ``dict of dict`` the high-level keys
- being the ``__init__`` keyword name of each vocabulary file required by the model, the low-level being the
- `short-cut-names` (string) of the pretrained models with, as associated values, the `url` (string) to the
- associated pretrained vocabulary file.
- - ``max_model_input_sizes``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained
- models, and as associated values, the maximum length of the sequence inputs of this model, or None if the
- model has no maximum input size.
- - ``pretrained_init_configuration``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the
- pretrained models, and as associated values, a dictionnary of specific arguments to pass to the
- ``__init__``method of the tokenizer class for this pretrained model when loading the tokenizer with the
- ``from_pretrained()`` method.
-
- Args:
- - ``model_max_length``: (`Optional`) int: the maximum length in number of tokens for the inputs to the transformer model.
- When the tokenizer is loaded with `from_pretrained`, this will be set to the value stored for the associated
- model in ``max_model_input_sizes`` (see above). If no value is provided, will default to VERY_LARGE_INTEGER (`int(1e30)`).
- no associated max_length can be found in ``max_model_input_sizes``.
- - ``padding_side``: (`Optional`) string: the side on which the model should have padding applied.
- Should be selected between ['right', 'left']
- - ``model_input_names``: (`Optional`) List[string]: the list of the forward pass inputs accepted by the
- model ("token_type_ids", "attention_mask"...).
- - ``bos_token``: (`Optional`) string: a beginning of sentence token.
- Will be associated to ``self.bos_token`` and ``self.bos_token_id``
- - ``eos_token``: (`Optional`) string: an end of sentence token.
- Will be associated to ``self.eos_token`` and ``self.eos_token_id``
- - ``unk_token``: (`Optional`) string: an unknown token.
- Will be associated to ``self.unk_token`` and ``self.unk_token_id``
- - ``sep_token``: (`Optional`) string: a separation token (e.g. to separate context and query in an input sequence).
- Will be associated to ``self.sep_token`` and ``self.sep_token_id``
- - ``pad_token``: (`Optional`) string: a padding token.
- Will be associated to ``self.pad_token`` and ``self.pad_token_id``
- - ``cls_token``: (`Optional`) string: a classification token (e.g. to extract a summary of an input sequence
- leveraging self-attention along the full depth of the model).
- Will be associated to ``self.cls_token`` and ``self.cls_token_id``
- - ``mask_token``: (`Optional`) string: a masking token (e.g. when training a model with masked-language
- modeling). Will be associated to ``self.mask_token`` and ``self.mask_token_id``
- - ``additional_special_tokens``: (`Optional`) list: a list of additional special tokens.
- Adding all special tokens here ensure they won't be split by the tokenization process.
- Will be associated to ``self.additional_special_tokens`` and ``self.additional_special_tokens_ids``
-
-
- .. automethod:: __call__
"""
def __init__(self, **kwargs):
@@ -168,31 +126,52 @@ def is_fast(self) -> bool:
@property
def vocab_size(self) -> int:
- """ Size of the base vocabulary (without the added tokens) """
+ """
+ :obj:`int`: Size of the base vocabulary (without the added tokens).
+ """
raise NotImplementedError
- def get_vocab(self):
- """ Returns the vocabulary as a dict of {token: index} pairs. `tokenizer.get_vocab()[token]` is equivalent to `tokenizer.convert_tokens_to_ids(token)` when `token` is in the vocab. """
+ def get_vocab(self) -> Dict[str, int]:
+ """
+ Returns the vocabulary as a dictionary of token to index.
+
+ :obj:`tokenizer.get_vocab()[token]` is equivalent to :obj:`tokenizer.convert_tokens_to_ids(token)` when
+ :obj:`token` is in the vocab.
+
+ Returns:
+ :obj:`Dict[str, int]`: The vocabulary.
+ """
raise NotImplementedError()
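A minimal illustration of the equivalence mentioned in the docstring, assuming `bert-base-uncased` (an illustrative checkpoint choice):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
vocab = tokenizer.get_vocab()
# get_vocab()[token] and convert_tokens_to_ids(token) agree for in-vocabulary tokens.
assert vocab["hello"] == tokenizer.convert_tokens_to_ids("hello")
```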
def get_added_vocab(self) -> Dict[str, int]:
+ """
+ Returns the added tokens in the vocabulary as a dictionary of token to index.
+
+ Returns:
+ :obj:`Dict[str, int]`: The added tokens.
+ """
return self.added_tokens_encoder
def __len__(self):
- """ Size of the full vocabulary with the added tokens """
+ """
+ Size of the full vocabulary with the added tokens.
+ """
return self.vocab_size + len(self.added_tokens_encoder)
- def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens=False) -> int:
+ def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
"""
Add a list of new tokens to the tokenizer class. If the new tokens are not in the
vocabulary, they are added to it with indices starting from length of the current vocabulary.
Args:
- new_tokens: string or list of string. Each string is a token to add. Tokens are only added if they are not
- already in the vocabulary (tested by checking if the tokenizer assign the index of the ``unk_token`` to them).
+ new_tokens (:obj:`List[str]` or :obj:`List[tokenizers.AddedToken]`):
+ Token(s) to add to the vocabulary. A token is only added if it's not already in the vocabulary (tested by
+ checking if the tokenizer assigns the index of the ``unk_token`` to it).
+ special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ Whether or not the tokens should be added as special tokens.
Returns:
- Number of tokens added to the vocabulary.
+ :obj:`int`: The number of tokens actually added to the vocabulary.
Examples::
@@ -202,7 +181,8 @@ def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_to
num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2'])
print('We have added', num_added_toks, 'tokens')
- model.resize_token_embeddings(len(tokenizer)) # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
+ # Notice: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e., the length of the tokenizer.
+ model.resize_token_embeddings(len(tokenizer))
"""
new_tokens = [str(tok) for tok in new_tokens]
@@ -234,35 +214,41 @@ def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_to
return len(tokens_to_add)
- def num_special_tokens_to_add(self, pair=False):
+ def num_special_tokens_to_add(self, pair: bool = False) -> int:
"""
Returns the number of added tokens when encoding a sequence with special tokens.
- Note:
- This encodes inputs and checks the number of added tokens, and is therefore not efficient. Do not put this
- inside your training loop.
+ .. note::
+ This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not
+ put this inside your training loop.
Args:
- pair: Returns the number of added tokens in the case of a sequence pair if set to True, returns the
- number of added tokens in the case of a single sequence if set to False.
+ pair (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ Whether the number of added tokens should be computed in the case of a sequence pair or a single
+ sequence.
Returns:
- Number of tokens added to sequences
+ :obj:`int`: Number of special tokens added to sequences.
"""
token_ids_0 = []
token_ids_1 = []
return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None))
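Hedged example of what this count looks like in practice: BERT wraps a single sequence in [CLS] ... [SEP] and a pair in [CLS] ... [SEP] ... [SEP], assuming `bert-base-uncased` (an illustrative checkpoint, not part of this patch):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
print(tokenizer.num_special_tokens_to_add())           # 2 -> [CLS] ... [SEP]
print(tokenizer.num_special_tokens_to_add(pair=True))  # 3 -> [CLS] ... [SEP] ... [SEP]
```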
- def tokenize(self, text: TextInput, **kwargs):
- """ Converts a string in a sequence of tokens (string), using the tokenizer.
- Split in words for word-based vocabulary or sub-words for sub-word-based
- vocabularies (BPE/SentencePieces/WordPieces).
+ def tokenize(self, text: TextInput, **kwargs) -> List[str]:
+ """
+ Converts a string into a sequence of tokens, using the tokenizer.
- Take care of added tokens.
+ Split into words for word-based vocabularies or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
+ Takes care of added tokens.
- Args:
- text (:obj:`string`): The sequence to be encoded.
- **kwargs (:obj: `dict`): Arguments passed to the model-specific `prepare_for_tokenization` preprocessing method.
+ Args:
+ text (:obj:`str`):
+ The sequence to be encoded.
+ **kwargs (additional keyword arguments):
+ Passed along to the model-specific ``prepare_for_tokenization`` preprocessing method.
+
+ Returns:
+ :obj:`List[str]`: The list of tokens.
"""
# Simple mapping string => AddedToken for special tokens with specific tokenization behaviors
all_special_tokens_extended = dict(
@@ -365,17 +351,25 @@ def split_on_tokens(tok_list, text):
return tokenized_text
def _tokenize(self, text, **kwargs):
- """ Converts a string in a sequence of tokens (string), using the tokenizer.
- Split in words for word-based vocabulary or sub-words for sub-word-based
- vocabularies (BPE/SentencePieces/WordPieces).
+ """
+ Converts a string into a sequence of tokens (string), using the tokenizer.
+ Split into words for word-based vocabularies or sub-words for sub-word-based vocabularies
+ (BPE/SentencePieces/WordPieces).
- Do NOT take care of added tokens.
+ Do NOT take care of added tokens.
"""
raise NotImplementedError
- def convert_tokens_to_ids(self, tokens):
- """ Converts a token string (or a sequence of tokens) in a single integer id
- (or a sequence of ids), using the vocabulary.
+ def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
+ """
+ Converts a token string (or a sequence of tokens) into a single integer id (or a sequence of ids), using the
+ vocabulary.
+
+ Args:
+ tokens (:obj:`str` or :obj:`List[str]`): One or several token(s) to convert to token id(s).
+
+ Returns:
+ :obj:`int` or :obj:`List[int]`: The token id or list of token ids.
"""
if tokens is None:
return None
@@ -574,7 +568,8 @@ def _batch_prepare_for_model(
return_length: bool = False,
verbose: bool = True,
) -> BatchEncoding:
- """ Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model.
+ """
+ Prepares a sequence of input ids, or a pair of sequences of input ids, so that it can be used by the model.
It adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
manages a moving window (with user defined stride) for overflowing tokens
@@ -620,11 +615,25 @@ def _batch_prepare_for_model(
return batch_outputs
- def prepare_for_tokenization(self, text: str, is_pretokenized=False, **kwargs) -> (str, dict):
- """ Performs any necessary transformations before tokenization.
+ def prepare_for_tokenization(
+ self, text: str, is_pretokenized: bool = False, **kwargs
+ ) -> Tuple[str, Dict[str, Any]]:
+ """
+ Performs any necessary transformations before tokenization.
+
+ This method should pop the arguments from kwargs and return the remaining :obj:`kwargs` as well.
+ We test the :obj:`kwargs` at the end of the encoding process to be sure all the arguments have been used.
+
+ Args:
+ text (:obj:`str`):
+ The text to prepare.
+ is_pretokenized (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ Whether or not the text has been pretokenized.
+ kwargs:
+ Keyword arguments to use for the tokenization.
- This method should pop the arguments from kwargs and return kwargs as well.
- We test kwargs at the end of the encoding process to be sure all the arguments have been used.
+ Returns:
+ :obj:`Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs.
"""
return (text, kwargs)
@@ -633,14 +642,15 @@ def get_special_tokens_mask(
) -> List[int]:
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
- special tokens using the tokenizer ``prepare_for_model`` method.
+ special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
Args:
- token_ids_0: list of ids (must not contain special tokens)
- token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
- for sequence pairs
- already_has_special_tokens: (default False) Set to True if the token list is already formated with
- special tokens for the model
+ token_ids_0 (:obj:`List[int]`):
+ List of ids of the first sequence.
+ token_ids_1 (:obj:`List[int]`, `optional`):
+ List of ids of the second sequence.
+ already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ Whether or not the token list is already formatted with special tokens for the model.
Returns:
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
@@ -650,11 +660,18 @@ def get_special_tokens_mask(
def convert_ids_to_tokens(
self, ids: Union[int, List[int]], skip_special_tokens: bool = False
) -> Union[str, List[str]]:
- """ Converts a single index or a sequence of indices (integers) in a token "
- (resp.) a sequence of tokens (str), using the vocabulary and added tokens.
+ """
+ Converts a single index or a sequence of indices into a token or a sequence of tokens, using the vocabulary
+ and added tokens.
- Args:
- skip_special_tokens: Don't decode special tokens (self.all_special_tokens). Default: False
+ Args:
+ ids (:obj:`int` or :obj:`List[int]`):
+ The token id (or token ids) to convert to tokens.
+ skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ Whether or not to remove special tokens in the decoding.
+
+ Returns:
+ :obj:`str` or :obj:`List[str]`: The decoded token(s).
"""
if isinstance(ids, int):
if ids in self.added_tokens_decoder:
@@ -676,15 +693,39 @@ def _convert_id_to_token(self, index: int) -> str:
raise NotImplementedError
def convert_tokens_to_string(self, tokens: List[str]) -> str:
- """ Converts a sequence of tokens (string) in a single string.
- The most simple way to do it is ' '.join(self.convert_ids_to_tokens(token_ids))
- but we often want to remove sub-word tokenization artifacts at the same time.
"""
- return " ".join(self.convert_ids_to_tokens(tokens))
+ Converts a sequence of tokens (string) into a single string.
+
+ The simplest way to do it is ``" ".join(tokens)``, but we often want to remove
+ sub-word tokenization artifacts at the same time.
+
+ Args:
+ tokens (:obj:`List[str]`): The tokens to join into a string.
+
+ Returns:
+ :obj:`str`: The joined string.
+ """
+ return " ".join(tokens)
def decode(
self, token_ids: List[int], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True
) -> str:
+ """
+ Converts a sequence of ids into a string, using the tokenizer and vocabulary
+ with options to remove special tokens and clean up tokenization spaces.
+
+ Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.
+
+ Args:
+ token_ids (:obj:`List[int]`):
+ List of tokenized input ids. Can be obtained using the ``__call__`` method.
+ skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ Whether or not to remove special tokens in the decoding.
+ clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ Whether or not to clean up the tokenization spaces.
+
+ Returns:
+ :obj:`str`: The decoded sentence.
+ """
filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
# To avoid mixing byte-level and unicode for byte-level BPT
@@ -713,11 +754,18 @@ def decode(
return text
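A hedged decode example (again assuming `bert-base-uncased` for illustration): encode a string, then decode it back while dropping the special tokens that were added around the input.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
token_ids = tokenizer.encode("Hello world!")
print(token_ids)
print(tokenizer.decode(token_ids, skip_special_tokens=True))  # e.g. 'hello world!'
```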
def save_vocabulary(self, save_directory) -> Tuple[str]:
- """ Save the tokenizer vocabulary to a directory. This method does *NOT* save added tokens
- and special token mappings.
+ """
+ Save the tokenizer vocabulary to a directory. This method does *NOT* save added tokens
+ and special token mappings.
+
+ .. warning::
+ Please use :meth:`~transformers.PreTrainedTokenizer.save_pretrained` to save the full tokenizer state if
+ you want to reload it using the :meth:`~transformers.PreTrainedTokenizer.from_pretrained` class method.
- Please use :func:`~transformers.PreTrainedTokenizer.save_pretrained` `()` to save the full
- Tokenizer state if you want to reload it using the :func:`~transformers.PreTrainedTokenizer.from_pretrained`
- class method.
+ Args:
+ save_directory (:obj:`str`): The path to a directory where the tokenizer will be saved.
+
+ Returns:
+ A tuple of :obj:`str`: The files saved.
"""
raise NotImplementedError
diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index 6b424606f7ff..d63c4bb5545a 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -72,7 +72,8 @@
class ExplicitEnum(Enum):
- """ Enum with more explicit error message for missing values.
+ """
+ Enum with more explicit error message for missing values.
"""
@classmethod
@@ -84,6 +85,11 @@ def _missing_(cls, value):
class TruncationStrategy(ExplicitEnum):
+ """
+ Possible values for the ``truncation`` argument in :meth:`PreTrainedTokenizerBase.__call__`.
+ Useful for tab-completion in an IDE.
+ """
+
ONLY_FIRST = "only_first"
ONLY_SECOND = "only_second"
LONGEST_FIRST = "longest_first"
@@ -91,23 +97,34 @@ class TruncationStrategy(ExplicitEnum):
class PaddingStrategy(ExplicitEnum):
+ """
+ Possible values for the ``padding`` argument in :meth:`PreTrainedTokenizerBase.__call__`.
+ Useful for tab-completion in an IDE.
+ """
+
LONGEST = "longest"
MAX_LENGTH = "max_length"
DO_NOT_PAD = "do_not_pad"
class TensorType(ExplicitEnum):
+ """
+ Possible values for the ``return_tensors`` argument in :meth:`PreTrainedTokenizerBase.__call__`.
+ Useful for tab-completion in an IDE.
+ """
+
PYTORCH = "pt"
TENSORFLOW = "tf"
NUMPY = "np"
class CharSpan(NamedTuple):
- """ Character span in the original string
+ """
+ Character span in the original string.
- Args:
- start: index of the first character in the original string
- end: index of the character following the last character in the original string
+ Args:
+ start (:obj:`int`): Index of the first character in the original string.
+ end (:obj:`int`): Index of the character following the last character in the original string.
"""
start: int
@@ -115,11 +132,12 @@ class CharSpan(NamedTuple):
class TokenSpan(NamedTuple):
- """ Token span in an encoded string (list of tokens)
+ """
+ Token span in an encoded string (list of tokens).
- Args:
- start: index of the first token in the span
- end: index of the token following the last token in the span
+ Args:
+ start (:obj:`int`): Index of the first token in the span.
+ end (:obj:`int`): Index of the token following the last token in the span.
"""
start: int
@@ -127,19 +145,27 @@ class TokenSpan(NamedTuple):
class BatchEncoding(UserDict):
- """ BatchEncoding hold the output of the encode and batch_encode methods (tokens, attention_masks, etc).
- This class is derived from a python Dictionary and can be used as a dictionnary.
- In addition, this class expose utility methods to map from word/char space to token space.
-
- Args:
- data (:obj:`dict`): Dictionary of lists/arrays returned by the encode/batch_encode methods ('input_ids', 'attention_mask'...)
- encoding (:obj:`EncodingFast`, :obj:`list(EncodingFast)`, `optional`, defaults to :obj:`None`):
- If the tokenizer is a fast tokenizer which outputs additional informations like mapping from word/char space to token space
- the `EncodingFast` instance or list of instance (for batches) hold these informations.
- tensor_type (:obj:`Union[None, str, TensorType]`, `optional`, defaults to :obj:`None`):
- You can give a tensor_type here to convert the lists of integers in PyTorch/TF/Numpy Tensors at initialization
- prepend_batch_axis (:obj:`bool`, `optional`, defaults to :obj:`False`):
- Set to True to add a batch axis when converting in Tensors (see :obj:`tensor_type` above)
+ """
+ Holds the output of the :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.encode_plus`
+ and :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.batch_encode` methods (tokens,
+ attention_masks, etc).
+
+ This class is derived from a python dictionary and can be used as a dictionary. In addition, this class exposes
+ utility methods to map from word/character space to token space.
+
+ Args:
+ data (:obj:`dict`):
+ Dictionary of lists/arrays/tensors returned by the encode/batch_encode methods ('input_ids',
+ 'attention_mask', etc.).
+ encoding (:obj:`tokenizers.Encoding` or :obj:`Sequence[tokenizers.Encoding]`, `optional`):
+ If the tokenizer is a fast tokenizer which outputs additional information like the mapping from word/character
+ space to token space, the :obj:`tokenizers.Encoding` instance or list of instances (for batches) holds this
+ information.
+ tensor_type (:obj:`Union[None, str, TensorType]`, `optional`):
+ You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at
+ initialization.
+ prepend_batch_axis (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ Whether or not to add a batch axis when converting to tensors (see :obj:`tensor_type` above).
"""
def __init__(
@@ -159,16 +185,19 @@ def __init__(
self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis)
@property
- def is_fast(self):
+ def is_fast(self) -> bool:
"""
- Indicate if this BatchEncoding was generated from the result of a PreTrainedTokenizerFast
- Returns: True if generated from subclasses of PreTrainedTokenizerFast, else otherwise
+ :obj:`bool`: Indicates whether this :class:`~transformers.BatchEncoding` was generated from the result of a
+ :class:`~transformers.PreTrainedTokenizerFast` or not.
"""
return self._encodings is not None
- def __getitem__(self, item: Union[int, str]) -> EncodingFast:
- """ If the key is a string, get the value of the dict associated to `key` ('input_ids', 'attention_mask'...)
- If the key is an integer, get the EncodingFast for batch item with index `key`
+ def __getitem__(self, item: Union[int, str]) -> Union[Any, EncodingFast]:
+ """
+ If the key is a string, returns the value of the dict associated to :obj:`key` ('input_ids',
+ 'attention_mask', etc.).
+
+ If the key is an integer, get the :obj:`tokenizers.Encoding` for batch item with index :obj:`key`.
"""
if isinstance(item, str):
return self.data[item]
@@ -212,20 +241,40 @@ def items(self):
@property
def encodings(self) -> Optional[List[EncodingFast]]:
"""
- Return the list all encoding from the tokenization process
-
- Returns: List[EncodingFast] or None if input was tokenized through Python (i.e. not fast) tokenizer
+ :obj:`Optional[List[tokenizers.Encoding]]`: The list of all encodings from the tokenization process.
+ Returns :obj:`None` if the input was tokenized by a Python (i.e., not fast) tokenizer.
"""
return self._encodings
def tokens(self, batch_index: int = 0) -> List[str]:
+ """
+ Return the list of tokens (sub-parts of the input strings after word/subword splitting and before conversion
+ to integer indices) at a given batch index (only works for the output of a fast tokenizer).
+
+ Args:
+ batch_index (:obj:`int`, `optional`, defaults to 0): The index to access in the batch.
+
+ Returns:
+ :obj:`List[str]`: The list of tokens at that index.
+ """
if not self._encodings:
- raise ValueError("tokens() is not available when using Python based tokenizers")
+ raise ValueError("tokens() is not available when using Python-based tokenizers")
return self._encodings[batch_index].tokens
def words(self, batch_index: int = 0) -> List[Optional[int]]:
+ """
+ Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer.
+
+ Args:
+ batch_index (:obj:`int`, `optional`, defaults to 0): The index to access in the batch.
+
+ Returns:
+ :obj:`List[Optional[int]]`: A list indicating the word corresponding to each token. Special tokens added by
+ the tokenizer are mapped to :obj:`None` and other tokens are mapped to the index of their corresponding
+ word (several tokens will be mapped to the same word index if they are parts of that word).
+ """
if not self._encodings:
- raise ValueError("words() is not available when using Python based tokenizers")
+ raise ValueError("words() is not available when using Python-based tokenizers")
return self._encodings[batch_index].words
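A small sketch of `words()` in action (fast tokenizers only, assuming `bert-base-uncased`); the exact split and indices in the comments are indicative, since they depend on the tokenizer's vocabulary:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
encoding = tokenizer("Tokenizers are great")
print(encoding.tokens())  # e.g. ['[CLS]', 'token', '##izer', '##s', 'are', 'great', '[SEP]']
print(encoding.words())   # e.g. [None, 0, 0, 0, 1, 2, None]  (special tokens map to None)
```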
def token_to_word(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int:
@@ -239,21 +288,19 @@ def token_to_word(self, batch_or_token_index: int, token_index: Optional[int] =
- ``self.token_to_word(batch_index, token_index)`` if batch size is greater than 1
This method is particularly suited when the input sequences are provided as
- pre-tokenized sequences (i.e. words are defined by the user). In this case it allows
+ pre-tokenized sequences (i.e., words are defined by the user). In this case it allows
to easily associate encoded tokens with provided tokenized words.
Args:
batch_or_token_index (:obj:`int`):
Index of the sequence in the batch. If the batch only comprise one sequence,
- this can be the index of the token in the sequence
+ this can be the index of the token in the sequence.
token_index (:obj:`int`, `optional`):
If a batch index is provided in `batch_or_token_index`, this can be the index
of the token in the sequence.
Returns:
- :obj:`int`:
- index of the word in the input sequence.
-
+ :obj:`int`: Index of the word in the input sequence.
"""
if not self._encodings:
@@ -273,10 +320,10 @@ def word_to_tokens(self, batch_or_word_index: int, word_index: Optional[int] = N
"""
Get the encoded token span corresponding to a word in the sequence of the batch.
- Token spans are returned as a TokenSpan NamedTuple with:
+ Token spans are returned as a :class:`~transformers.tokenization_utils_base.TokenSpan` with:
- - start: index of the first token
- - end: index of the token following the last token
+ - **start** -- Index of the first token.
+ - **end** -- Index of the token following the last token.
Can be called as:
@@ -290,19 +337,14 @@ def word_to_tokens(self, batch_or_word_index: int, word_index: Optional[int] = N
Args:
batch_or_word_index (:obj:`int`):
Index of the sequence in the batch. If the batch only comprises one sequence,
- this can be the index of the word in the sequence
+ this can be the index of the word in the sequence.
word_index (:obj:`int`, `optional`):
If a batch index is provided in `batch_or_token_index`, this can be the index
of the word in the sequence.
Returns:
- :obj:`TokenSpan`:
- Span of tokens in the encoded sequence.
-
- :obj:`TokenSpan` are NamedTuple with:
-
- - start: index of the first token
- - end: index of the token following the last token
+ :class:`~transformers.tokenization_utils_base.TokenSpan`:
+ Span of tokens in the encoded sequence.
"""
if not self._encodings:
@@ -322,10 +364,11 @@ def token_to_chars(self, batch_or_token_index: int, token_index: Optional[int] =
"""
Get the character span corresponding to an encoded token in a sequence of the batch.
- Character spans are returned as a CharSpan NamedTuple with:
+ Character spans are returned as a :class:`~transformers.tokenization_utils_base.CharSpan` with:
- - start: index of the first character in the original string associated to the token
- - end: index of the character following the last character in the original string associated to the token
+ - **start** -- Index of the first character in the original string associated to the token.
+ - **end** -- Index of the character following the last character in the original string associated to the
+ token.
Can be called as:
@@ -335,19 +378,14 @@ def token_to_chars(self, batch_or_token_index: int, token_index: Optional[int] =
Args:
batch_or_token_index (:obj:`int`):
Index of the sequence in the batch. If the batch only comprise one sequence,
- this can be the index of the token in the sequence
+ this can be the index of the token in the sequence.
token_index (:obj:`int`, `optional`):
If a batch index is provided in `batch_or_token_index`, this can be the index
of the token or tokens in the sequence.
Returns:
- :obj:`CharSpan`:
- Span of characters in the original string.
-
- :obj:`CharSpan` are NamedTuple with:
-
- - start: index of the first character in the original string
- - end: index of the character following the last character in the original string
+ :class:`~transformers.tokenization_utils_base.CharSpan`:
+ Span of characters in the original string.
"""
if not self._encodings:
@@ -473,7 +511,19 @@ def char_to_word(self, batch_or_char_index: int, char_index: Optional[int] = Non
char_index = batch_or_char_index
return self._encodings[batch_index].char_to_word(char_index)
- def convert_to_tensors(self, tensor_type: Union[None, str, TensorType], prepend_batch_axis: bool = False):
+ def convert_to_tensors(
+ self, tensor_type: Optional[Union[str, TensorType]] = None, prepend_batch_axis: bool = False
+ ):
+ """
+ Convert the inner content to tensors.
+
+ Args:
+ tensor_type (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`):
+ The type of tensors to use. If :obj:`str`, should be one of the values of the enum
+ :class:`~transformers.tokenization_utils_base.TensorType`. If :obj:`None`, no modification is done.
+ prepend_batch_axis (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ Whether or not to add the batch dimension during the conversion.
+ """
if tensor_type is None:
return self
@@ -524,8 +574,17 @@ def convert_to_tensors(self, tensor_type: Union[None, str, TensorType], prepend_
return self
@torch_required
- def to(self, device: str):
- """Send all values to device by calling v.to(device)"""
+ def to(self, device: str) -> "BatchEncoding":
+ """
+ Send all values to device by calling :obj:`v.to(device)` (PyTorch only).
+
+ Args:
+ device (:obj:`str` or :obj:`torch.device`): The device to put the tensors on.
+
+ Returns:
+ :class:`~transformers.BatchEncoding`:
+ The same instance of :class:`~transformers.BatchEncoding` after modification.
+ """
self.data = {k: v.to(device) for k, v in self.data.items()}
return self
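A hedged sketch of moving a `BatchEncoding` to an accelerator (PyTorch only); the device string depends on the local setup, so it is chosen at runtime here:

```python
import torch
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
encoding = tokenizer("Hello world!", return_tensors="pt")
device = "cuda" if torch.cuda.is_available() else "cpu"
encoding = encoding.to(device)
print(encoding["input_ids"].device)
```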
@@ -568,10 +627,31 @@ def to(self, device: str):
class SpecialTokensMixin:
- """ SpecialTokensMixin is derived by ``PreTrainedTokenizer`` and ``PreTrainedTokenizerFast`` and
- handles specific behaviors related to special tokens. In particular, this class hold the
- attributes which can be used to directly access to these special tokens in a
- model-independant manner and allow to set and update the special tokens.
+ """
+ A mixin derived by :class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast`
+ to handle specific behaviors related to special tokens. In particular, this class holds the attributes which can be
+ used to directly access these special tokens in a model-independent manner and allows setting and updating the
+ special tokens.
+
+ Args:
+ bos_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
+ A special token representing the beginning of a sentence.
+ eos_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
+ A special token representing the end of a sentence.
+ unk_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
+ A special token representing an out-of-vocabulary token.
+ sep_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
+ A special token separating two different sentences in the same input (used by BERT for instance).
+ pad_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
+ A special token used to make arrays of tokens the same size for batching purposes. Will then be ignored by
+ attention mechanisms or loss computation.
+ cls_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
+ A special token representing the class of the input (used by BERT for instance).
+ mask_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
+ A special token representing a masked token (used by masked-language modeling pretraining objectives, like
+ BERT).
+ additional_special_tokens (tuple or list of :obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
+ A tuple or a list of additional special tokens.
"""
SPECIAL_TOKENS_ATTRIBUTES = [
@@ -613,36 +693,44 @@ def __init__(self, verbose=True, **kwargs):
)
def sanitize_special_tokens(self) -> int:
- """ Make sure that all the special tokens attributes of the tokenizer (tokenizer.mask_token, tokenizer.cls_token, ...)
- are in the vocabulary. Add the missing ones to the vocabulary if needed.
+ """
+ Make sure that all the special tokens attributes of the tokenizer (:obj:`tokenizer.mask_token`,
+ :obj:`tokenizer.cls_token`, etc.) are in the vocabulary.
- Return:
- Number of tokens added in the vocaulary during the operation.
+ Add the missing ones to the vocabulary if needed.
+
+ Return:
+ :obj:`int`: The number of tokens added to the vocabulary during the operation.
"""
return self.add_tokens(self.all_special_tokens_extended, special_tokens=True)
def add_special_tokens(self, special_tokens_dict: Dict[str, Union[str, AddedToken]]) -> int:
"""
- Add a dictionary of special tokens (eos, pad, cls...) to the encoder and link them
- to class attributes. If special tokens are NOT in the vocabulary, they are added
- to it (indexed starting from the last index of the current vocabulary).
+ Add a dictionary of special tokens (eos, pad, cls, etc.) to the encoder and link them to class attributes. If
+ special tokens are NOT in the vocabulary, they are added to it (indexed starting from the last index of the
+ current vocabulary).
- Using `add_special_tokens` will ensure your special tokens can be used in several ways:
+ Using :obj:`add_special_tokens` will ensure your special tokens can be used in several ways:
- - special tokens are carefully handled by the tokenizer (they are never split)
- - you can easily refer to special tokens using tokenizer class attributes like `tokenizer.cls_token`. This makes it easy to develop model-agnostic training and fine-tuning scripts.
+ - Special tokens are carefully handled by the tokenizer (they are never split).
+ - You can easily refer to special tokens using tokenizer class attributes like :obj:`tokenizer.cls_token`. This
+ makes it easy to develop model-agnostic training and fine-tuning scripts.
- When possible, special tokens are already registered for provided pretrained models (ex: BertTokenizer cls_token is already registered to be '[CLS]' and XLM's one is also registered to be '')
+ When possible, special tokens are already registered for provided pretrained models (for instance
+ :class:`~transformers.BertTokenizer` :obj:`cls_token` is already registered to be :obj:`'[CLS]'` and XLM's one
+ is also registered to be :obj:`''`).
Args:
- special_tokens_dict: dict of string. Keys should be in the list of predefined special attributes:
- [``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``,
+ special_tokens_dict (dictionary `str` to `str` or :obj:`tokenizers.AddedToken`):
+ Keys should be in the list of predefined special attributes: [``bos_token``, ``eos_token``,
+ ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``,
``additional_special_tokens``].
- Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assign the index of the ``unk_token`` to them).
+ Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer
+ assigns the index of the ``unk_token`` to them).
Returns:
- Number of tokens added to the vocabulary.
+ :obj:`int`: Number of tokens added to the vocabulary.
Examples::
@@ -654,7 +742,8 @@ def add_special_tokens(self, special_tokens_dict: Dict[str, Union[str, AddedToke
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
print('We have added', num_added_toks, 'tokens')
- model.resize_token_embeddings(len(tokenizer)) # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
+ # Notice: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e., the length of the tokenizer.
+ model.resize_token_embeddings(len(tokenizer))
assert tokenizer.cls_token == ''
"""
@@ -682,24 +771,27 @@ def add_special_tokens(self, special_tokens_dict: Dict[str, Union[str, AddedToke
return added_tokens
- def add_tokens(self, new_tokens: Union[str, AddedToken, List[str], List[AddedToken]], special_tokens=False) -> int:
+ def add_tokens(
+ self, new_tokens: Union[str, AddedToken, List[Union[str, AddedToken]]], special_tokens: bool = False
+ ) -> int:
"""
- Add a list of new tokens to the tokenizer class. If the new tokens are not in the
- vocabulary, they are added to it with indices starting from length of the current vocabulary.
+ Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to
+ it with indices starting from length of the current vocabulary.
Args:
- new_tokens: string or list of string or :class:`~transformers.AddedToken`. Each string is a token to add.
- Tokens are only added if they are not already in the vocabulary. AddedToken wrap a string token to
- let you personnalize it's behavior (Whether this token should only match against single word, whether
- this token should strip all potential whitespaces on the left side, Whether this token should strip
- all potential whitespaces on the right side...).
- special_token: can be used to specify if the token is a special token. This mostly change the normalization
- behavior (special tokens like CLS or [MASK] are usually not lower-cased for instance)
+ new_tokens (:obj:`str`, :obj:`tokenizers.AddedToken` or a list of `str` or :obj:`tokenizers.AddedToken`):
+ Tokens are only added if they are not already in the vocabulary. :obj:`tokenizers.AddedToken` wraps a
+ string token to let you personalize its behavior: whether this token should only match against a single
+ word, whether this token should strip all potential whitespaces on the left side, whether this token
+ should strip all potential whitespaces on the right side, etc.
+ special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ Can be used to specify if the token is a special token. This mostly changes the normalization behavior
+ (special tokens like CLS or [MASK] are usually not lower-cased for instance).
- See details for :class:`~transformers.AddedToken` in HuggingFace tokenizers library.
+ See details for :obj:`tokenizers.AddedToken` in HuggingFace tokenizers library.
Returns:
- Number of tokens added to the vocabulary.
+ :obj:`int`: Number of tokens added to the vocabulary.
Examples::
@@ -709,7 +801,8 @@ def add_tokens(self, new_tokens: Union[str, AddedToken, List[str], List[AddedTok
num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2'])
print('We have added', num_added_toks, 'tokens')
- model.resize_token_embeddings(len(tokenizer)) # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
+ # Notice: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e., the length of the tokenizer.
+ model.resize_token_embeddings(len(tokenizer))
"""
if not new_tokens:
return 0
@@ -720,64 +813,84 @@ def add_tokens(self, new_tokens: Union[str, AddedToken, List[str], List[AddedTok
return self._add_tokens(new_tokens, special_tokens=special_tokens)
@property
- def bos_token(self):
- """ Beginning of sentence token (string). Log an error if used while not having been set. """
+ def bos_token(self) -> str:
+ """
+ :obj:`str`: Beginning of sentence token. Log an error if used while not having been set.
+ """
if self._bos_token is None and self.verbose:
logger.error("Using bos_token, but it is not set yet.")
return None
return str(self._bos_token)
@property
- def eos_token(self):
- """ End of sentence token (string). Log an error if used while not having been set. """
+ def eos_token(self) -> str:
+ """
+ :obj:`str`: End of sentence token. Log an error if used while not having been set.
+ """
if self._eos_token is None and self.verbose:
logger.error("Using eos_token, but it is not set yet.")
return None
return str(self._eos_token)
@property
- def unk_token(self):
- """ Unknown token (string). Log an error if used while not having been set. """
+ def unk_token(self) -> str:
+ """
+ :obj:`str`: Unknown token. Log an error if used while not having been set.
+ """
if self._unk_token is None and self.verbose:
logger.error("Using unk_token, but it is not set yet.")
return None
return str(self._unk_token)
@property
- def sep_token(self):
- """ Separation token (string). E.g. separate context and query in an input sequence. Log an error if used while not having been set. """
+ def sep_token(self) -> str:
+ """
+ :obj:`str`: Separation token, to separate context and query in an input sequence.
+ Log an error if used while not having been set.
+ """
if self._sep_token is None and self.verbose:
logger.error("Using sep_token, but it is not set yet.")
return None
return str(self._sep_token)
@property
- def pad_token(self):
- """ Padding token (string). Log an error if used while not having been set. """
+ def pad_token(self) -> str:
+ """
+ :obj:`str`: Padding token. Log an error if used while not having been set.
+ """
if self._pad_token is None and self.verbose:
logger.error("Using pad_token, but it is not set yet.")
return None
return str(self._pad_token)
@property
- def cls_token(self):
- """ Classification token (string). E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """
+ def cls_token(self) -> str:
+ """
+ :obj:`str`: Classification token, to extract a summary of an input sequence leveraging self-attention along
+ the full depth of the model. Log an error if used while not having been set.
+ """
if self._cls_token is None and self.verbose:
logger.error("Using cls_token, but it is not set yet.")
return None
return str(self._cls_token)
@property
- def mask_token(self):
- """ Mask token (string). E.g. when training a model with masked-language modeling. Log an error if used while not having been set. """
+ def mask_token(self) -> str:
+ """
+ :obj:`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while
+ not having been set.
+ """
if self._mask_token is None and self.verbose:
logger.error("Using mask_token, but it is not set yet.")
return None
return str(self._mask_token)
@property
- def additional_special_tokens(self):
- """ All the additional special tokens you may want to use (list of strings). Log an error if used while not having been set. """
+ def additional_special_tokens(self) -> List[str]:
+ """
+ :obj:`List[str]`: All the additional special tokens you may want to use. Log an error if used while not having
+ been set.
+ """
if self._additional_special_tokens is None and self.verbose:
logger.error("Using additional_special_tokens, but it is not set yet.")
return None
@@ -816,70 +929,99 @@ def additional_special_tokens(self, value):
self._additional_special_tokens = value
@property
- def bos_token_id(self):
- """ Id of the beginning of sentence token in the vocabulary. Log an error if used while not having been set. """
+ def bos_token_id(self) -> Optional[int]:
+ """
+ :obj:`Optional[int]`: Id of the beginning of sentence token in the vocabulary. Returns :obj:`None` if the token
+ has not been set.
+ """
if self._bos_token is None:
return None
return self.convert_tokens_to_ids(self.bos_token)
@property
- def eos_token_id(self):
- """ Id of the end of sentence token in the vocabulary. Log an error if used while not having been set. """
+ def eos_token_id(self) -> Optional[int]:
+ """
+ :obj:`Optional[int]`: Id of the end of sentence token in the vocabulary. Returns :obj:`None` if the token has
+ not been set.
+ """
if self._eos_token is None:
return None
return self.convert_tokens_to_ids(self.eos_token)
@property
- def unk_token_id(self):
- """ Id of the unknown token in the vocabulary. Log an error if used while not having been set. """
+ def unk_token_id(self) -> Optional[int]:
+ """
+ :obj:`Optional[int]`: Id of the unknown token in the vocabulary. Returns :obj:`None` if the token has not been
+ set.
+ """
if self._unk_token is None:
return None
return self.convert_tokens_to_ids(self.unk_token)
@property
- def sep_token_id(self):
- """ Id of the separation token in the vocabulary. E.g. separate context and query in an input sequence. Log an error if used while not having been set. """
+ def sep_token_id(self) -> Optional[int]:
+ """
+ :obj:`Optional[int]`: Id of the separation token in the vocabulary, to separate context and query in an input
+ sequence. Returns :obj:`None` if the token has not been set.
+ """
if self._sep_token is None:
return None
return self.convert_tokens_to_ids(self.sep_token)
@property
- def pad_token_id(self):
- """ Id of the padding token in the vocabulary. Log an error if used while not having been set. """
+ def pad_token_id(self) -> Optional[int]:
+ """
+ :obj:`Optional[int]`: Id of the padding token in the vocabulary. Returns :obj:`None` if the token has not been
+ set.
+ """
if self._pad_token is None:
return None
return self.convert_tokens_to_ids(self.pad_token)
@property
- def pad_token_type_id(self):
- """ Id of the padding token type in the vocabulary."""
+ def pad_token_type_id(self) -> int:
+ """
+ :obj:`int`: Id of the padding token type in the vocabulary.
+ """
return self._pad_token_type_id
@property
- def cls_token_id(self):
- """ Id of the classification token in the vocabulary. E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """
+ def cls_token_id(self) -> Optional[int]:
+ """
+ :obj:`Optional[int]`: Id of the classification token in the vocabulary, to extract a summary of an input
+ sequence leveraging self-attention along the full depth of the model.
+
+ Returns :obj:`None` if the token has not been set.
+ """
if self._cls_token is None:
return None
return self.convert_tokens_to_ids(self.cls_token)
@property
- def mask_token_id(self):
- """ Id of the mask token in the vocabulary. E.g. when training a model with masked-language modeling. Log an error if used while not having been set. """
+ def mask_token_id(self) -> Optional[int]:
+ """
+ :obj:`Optional[int]`: Id of the mask token in the vocabulary, used when training a model with masked-language
+ modeling. Returns :obj:`None` if the token has not been set.
+ """
if self._mask_token is None:
return None
return self.convert_tokens_to_ids(self.mask_token)
@property
- def additional_special_tokens_ids(self):
- """ Ids of all the additional special tokens in the vocabulary (list of integers). Log an error if used while not having been set. """
+ def additional_special_tokens_ids(self) -> List[int]:
+ """
+ :obj:`List[int]`: Ids of all the additional special tokens in the vocabulary.
+ Log an error if used while not having been set.
+ """
return self.convert_tokens_to_ids(self.additional_special_tokens)
@property
- def special_tokens_map(self):
- """ A dictionary mapping special token class attribute (cls_token, unk_token...) to their
- values ('', ''...)
- Convert tokens of AddedToken type in string.
- All returned tokens are strings
+ def special_tokens_map(self) -> Dict[str, Union[str, List[str]]]:
+ """
+ :obj:`Dict[str, Union[str, List[str]]]`: A dictionary mapping special token class attributes
+        (:obj:`cls_token`, :obj:`unk_token`, etc.) to their values (:obj:`'<unk>'`, :obj:`'<cls>'`, etc.).
+
+ Convert potential tokens of :obj:`tokenizers.AddedToken` type to string.
"""
set_attr = {}
for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
@@ -889,12 +1031,14 @@ def special_tokens_map(self):
return set_attr
@property
- def special_tokens_map_extended(self):
- """ A dictionary mapping special token class attribute (cls_token, unk_token...) to their
- values ('', ''...)
- Keep the tokens as AddedToken if they are of this type.
+ def special_tokens_map_extended(self) -> Dict[str, Union[str, AddedToken, List[Union[str, AddedToken]]]]:
+ """
+ :obj:`Dict[str, Union[str, tokenizers.AddedToken, List[Union[str, tokenizers.AddedToken]]]]`: A dictionary
+ mapping special token class attributes (:obj:`cls_token`, :obj:`unk_token`, etc.) to their values
+        (:obj:`'<unk>'`, :obj:`'<cls>'`, etc.).
- AddedToken can be used to control more finely how special tokens are tokenized.
+ Don't convert tokens of :obj:`tokenizers.AddedToken` type to string so they can be used to control more finely
+ how special tokens are tokenized.
"""
set_attr = {}
for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
@@ -904,21 +1048,23 @@ def special_tokens_map_extended(self):
return set_attr
@property
- def all_special_tokens(self):
- """ List all the special tokens ('', ''...) mapped to class attributes
- Convert tokens of AddedToken type in string.
- All returned tokens are strings
- (cls_token, unk_token...).
+ def all_special_tokens(self) -> List[str]:
+ """
+        :obj:`List[str]`: All the special tokens (:obj:`'<unk>'`, :obj:`'<cls>'`, etc.) mapped to class attributes.
+
+ Convert tokens of :obj:`tokenizers.AddedToken` type to string.
"""
all_toks = [str(s) for s in self.all_special_tokens_extended]
return all_toks
@property
- def all_special_tokens_extended(self):
- """ List all the special tokens ('', ''...) mapped to class attributes
- Keep the tokens as AddedToken if they are of this type.
+ def all_special_tokens_extended(self) -> List[Union[str, AddedToken]]:
+ """
+        :obj:`List[Union[str, tokenizers.AddedToken]]`: All the special tokens (:obj:`'<unk>'`, :obj:`'<cls>'`, etc.)
+ mapped to class attributes.
- AddedToken can be used to control more finely how special tokens are tokenized.
+ Don't convert tokens of :obj:`tokenizers.AddedToken` type to string so they can be used to control more finely
+ how special tokens are tokenized.
"""
all_toks = []
set_attr = self.special_tokens_map_extended
@@ -928,9 +1074,10 @@ def all_special_tokens_extended(self):
return all_toks
@property
- def all_special_ids(self):
- """ List the vocabulary indices of the special tokens ('', ''...) mapped to
- class attributes (cls_token, unk_token...).
+ def all_special_ids(self) -> List[int]:
+ """
+        :obj:`List[int]`: The ids of the special tokens (:obj:`'<unk>'`, :obj:`'<cls>'`, etc.) mapped to class
+        attributes.
"""
all_toks = self.all_special_tokens
all_ids = self.convert_tokens_to_ids(all_toks)
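As a quick, hedged illustration of the special-token properties documented above (it assumes a standard BERT checkpoint with its usual special tokens):

```python
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

print(tokenizer.cls_token, tokenizer.cls_token_id)  # the classification token and its id
print(tokenizer.special_tokens_map)                 # attribute name -> token string
print(tokenizer.all_special_tokens)                 # every special token, as strings
print(tokenizer.all_special_ids)                    # the matching vocabulary ids
```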
@@ -939,96 +1086,181 @@ class attributes (cls_token, unk_token...).
ENCODE_KWARGS_DOCSTRING = r"""
add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`):
- If set to ``True``, the sequences will be encoded with the special tokens relative
- to their model.
- `padding` (:obj:`Union[bool, str]`, `optional`, defaults to :obj:`False`):
- Activate and control padding. Accepts the following values:
-
- * `True` or `'longest'`: pad to the longest sequence in the batch (or no padding if only a single sequence if provided),
- * `'max_length'`: pad to a max length specified in `max_length` or to the max acceptable input length for the model if no length is provided (`max_length=None`)
- * `False` or `'do_not_pad'` (default): No padding (i.e. can output batch with sequences of uneven lengths)
- `truncation` (:obj:`Union[bool, str]`, `optional`, defaults to :obj:`False`):
- Activate and control truncation. Accepts the following values:
-
- * `True` or `'longest_first'`: truncate to a max length specified in `max_length` or to the max acceptable input length for the model if no length is provided (`max_length=None`). This will truncate token by token, removing a token from the longest sequence in the pair if a pair of sequences (or a batch of pairs) is provided,
- * `'only_first'`: truncate to a max length specified in `max_length` or to the max acceptable input length for the model if no length is provided (`max_length=None`). This will only truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided,
- * `'only_second'`: truncate to a max length specified in `max_length` or to the max acceptable input length for the model if no length is provided (`max_length=None`). This will only truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided,
- * `False` or `'do_not_truncate'` (default): No truncation (i.e. can output batch with sequences length greater than the model max admissible input size)
- `max_length` (:obj:`Union[int, None]`, `optional`, defaults to :obj:`None`):
- Control the length for padding/truncation. Accepts the following values
-
- * `None` (default): This will use the predefined model max length if required by one of the truncation/padding parameters. If the model has no specific max input length (e.g. XLNet) truncation/padding to max length is deactivated.
- * `any integer value` (e.g. `42`): Use this specific maximum length value if required by one of the truncation/padding parameters.
- stride (:obj:`int`, `optional`, defaults to ``0``):
- If set to a number along with max_length, the overflowing tokens returned when `return_overflowing_tokens=True`
- will contain some tokens from the end of the truncated sequence returned to provide some overlap between truncated and overflow ing sequences.
- The value of this argument defines the number of overlapping tokens.
- is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
- Set to True to indicate the input is already tokenized
- pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
- This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
- >= 7.5 (Volta).
- return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`):
- Can be set to 'tf', 'pt' or 'np' to return respectively TensorFlow :obj:`tf.constant`,
- PyTorch :obj:`torch.Tensor` or Numpy :obj: `np.ndarray` instead of a list of python integers.
+ Whether or not to encode the sequences with the special tokens relative to their model.
+ padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`):
+ Activates and controls padding. Accepts the following values:
+
+ * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
+                  single sequence is provided).
+ * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
+ maximum acceptable input length for the model if that argument is not provided.
+ * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
+ different lengths).
+ truncation (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`False`):
+ Activates and controls truncation. Accepts the following values:
+
+ * :obj:`True` or :obj:`'longest_first'`: Truncate to a maximum length specified with the argument
+ :obj:`max_length` or to the maximum acceptable input length for the model if that argument is not
+ provided. This will truncate token by token, removing a token from the longest sequence in the pair
+ if a pair of sequences (or a batch of pairs) is provided.
+ * :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to
+ the maximum acceptable input length for the model if that argument is not provided. This will only
+ truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
+ * :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or
+ to the maximum acceptable input length for the model if that argument is not provided. This will only
+ truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
+ * :obj:`False` or :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with
+ sequence lengths greater than the model maximum admissible input size).
+ max_length (:obj:`int`, `optional`):
+ Controls the maximum length to use by one of the truncation/padding parameters.
+
+ If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum
+ length is required by one of the truncation/padding parameters. If the model has no specific maximum
+ input length (like XLNet) truncation/padding to a maximum length will be deactivated.
+ stride (:obj:`int`, `optional`, defaults to 0):
+ If set to a number along with :obj:`max_length`, the overflowing tokens returned when
+ :obj:`return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence
+ returned to provide some overlap between truncated and overflowing sequences. The value of this
+ argument defines the number of overlapping tokens.
+ is_pretokenized (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ Whether or not the input is already tokenized.
+ pad_to_multiple_of (:obj:`int`, `optional`):
+                If set, will pad the sequence to a multiple of the provided value. This is especially useful to enable
+ the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
+ return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`):
+ If set, will return tensors instead of list of python integers. Acceptable values are:
+
+ * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
+ * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
+ * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
"""
ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
- return_token_type_ids (:obj:`bool`, `optional`, defaults to :obj:`None`):
+ return_token_type_ids (:obj:`bool`, `optional`):
Whether to return token type IDs. If left to the default, will return the token type IDs according
to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
- `What are token type IDs? <../glossary.html#token-type-ids>`_
- return_attention_mask (:obj:`bool`, `optional`, defaults to :obj:`none`):
+ `What are token type IDs? <../glossary.html#token-type-ids>`__
+ return_attention_mask (:obj:`bool`, `optional`):
Whether to return the attention mask. If left to the default, will return the attention mask according
to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
`What are attention masks? <../glossary.html#attention-mask>`__
return_overflowing_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
- Set to True to return overflowing token sequences (default False).
+ Whether or not to return overflowing token sequences.
return_special_tokens_mask (:obj:`bool`, `optional`, defaults to :obj:`False`):
- Set to True to return special tokens mask information (default False).
+                Whether or not to return special tokens mask information.
return_offsets_mapping (:obj:`bool`, `optional`, defaults to :obj:`False`):
- Set to True to return (char_start, char_end) for each token (default False).
- If using Python's tokenizer, this method will raise NotImplementedError.
- This one is only available on fast tokenizers inheriting from PreTrainedTokenizerFast.
- **kwargs: passed to the `self.tokenize()` method
+ Whether or not to return :obj:`(char_start, char_end)` for each token.
- Return:
- A Dictionary of shape::
-
- {
- input_ids: list[int],
- token_type_ids: list[int] if return_token_type_ids is True (default)
- attention_mask: list[int] if return_attention_mask is True (default)
- overflowing_tokens: list[int] if the tokenizer is a slow tokenize, else a List[List[int]] if a ``max_length`` is specified and ``return_overflowing_tokens=True``
- special_tokens_mask: list[int] if ``add_special_tokens`` if set to ``True``
- and return_special_tokens_mask is True
- }
+ This is only available on fast tokenizers inheriting from
+ :class:`~transformers.PreTrainedTokenizerFast`, if using Python's tokenizer, this method will raise
+ :obj:`NotImplementedError`.
+ return_length (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ Whether or not to return the lengths of the encoded inputs.
+ verbose (:obj:`bool`, `optional`, defaults to :obj:`True`):
+                Whether or not to print information and warnings.
+ **kwargs: passed to the :obj:`self.tokenize()` method
- With the fields:
+ Return:
+ :class:`~transformers.BatchEncoding`: A :class:`~transformers.BatchEncoding` with the following fields:
+
+ - **input_ids** -- List of token ids to be fed to a model.
+
+ `What are input IDs? <../glossary.html#input-ids>`__
+ - **token_type_ids** -- List of token type ids to be fed to a model (when :obj:`return_token_type_ids=True`
+ or if `"token_type_ids"` is in :obj:`self.model_input_names`).
+
+ `What are token type IDs? <../glossary.html#token-type-ids>`__
+ - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+ :obj:`return_attention_mask=True` or if `"attention_mask"` is in :obj:`self.model_input_names`).
+
+ `What are attention masks? <../glossary.html#attention-mask>`__
+ - **overflowing_tokens** -- List of overflowing tokens sequences (when a :obj:`max_length` is specified and
+ :obj:`return_overflowing_tokens=True`).
+ - **num_truncated_tokens** -- Number of tokens truncated (when a :obj:`max_length` is specified and
+ :obj:`return_overflowing_tokens=True`).
+            - **special_tokens_mask** -- List of 0s and 1s, with 1 specifying added special tokens and 0 specifying
+              regular sequence tokens (when :obj:`add_special_tokens=True` and :obj:`return_special_tokens_mask=True`).
+ - **length** -- The length of the inputs (when :obj:`return_length=True`)
+"""
- - ``input_ids``: list of token ids to be fed to a model
- - ``token_type_ids``: list of token type ids to be fed to a model
- - ``attention_mask``: list of indices specifying which tokens should be attended to by the model
- - ``overflowing_tokens``: list of overflowing tokens sequences if a max length is specified and ``return_overflowing_tokens=True``.
- - ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added
- tokens and 1 specifying sequence tokens.
+INIT_TOKENIZER_DOCSTRING = r"""
+ Class attributes (overridden by derived classes)
+    - **vocab_files_names** (:obj:`Dict[str, str]`) -- A dictionary with, as keys, the ``__init__`` keyword name of
+ each vocabulary file required by the model, and as associated values, the filename for saving the associated
+ file (string).
+ - **pretrained_vocab_files_map** (:obj:`Dict[str, Dict[str, str]]`) -- A dictionary of dictionaries, with the
+ high-level keys being the ``__init__`` keyword name of each vocabulary file required by the model, the
+      low-level keys being the :obj:`short-cut-names` of the pretrained models with, as associated values, the
+ :obj:`url` to the associated pretrained vocabulary file.
+    - **max_model_input_sizes** (:obj:`Dict[str, Optional[int]]`) -- A dictionary with, as keys, the
+ :obj:`short-cut-names` of the pretrained models, and as associated values, the maximum length of the sequence
+ inputs of this model, or :obj:`None` if the model has no maximum input size.
+ - **pretrained_init_configuration** (:obj:`Dict[str, Dict[str, Any]]`) -- A dictionary with, as keys, the
+      :obj:`short-cut-names` of the pretrained models, and as associated values, a dictionary of specific
+ arguments to pass to the ``__init__`` method of the tokenizer class for this pretrained model when loading the
+ tokenizer with the :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`
+ method.
+ - **model_input_names** (:obj:`List[str]`) -- A list of inputs expected in the forward pass of the model.
+ - **padding_side** (:obj:`str`) -- The default value for the side on which the model should have padding
+ applied. Should be :obj:`'right'` or :obj:`'left'`.
+
+ Args:
+ model_max_length (:obj:`int`, `optional`):
+ The maximum length (in number of tokens) for the inputs to the transformer model.
+ When the tokenizer is loaded with
+ :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`, this will be set to
+ the value stored for the associated model in ``max_model_input_sizes`` (see above). If no value is
+ provided, will default to VERY_LARGE_INTEGER (:obj:`int(1e30)`).
+        padding_side (:obj:`str`, `optional`):
+            The side on which the model should have padding applied. Should be :obj:`'right'` or :obj:`'left'`.
+ Default value is picked from the class attribute of the same name.
+        model_input_names (:obj:`List[str]`, `optional`):
+ The list of inputs accepted by the forward pass of the model (like :obj:`"token_type_ids"` or
+ :obj:`"attention_mask"`). Default value is picked from the class attribute of the same name.
+ bos_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
+ A special token representing the beginning of a sentence. Will be associated to ``self.bos_token`` and
+ ``self.bos_token_id``.
+ eos_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
+ A special token representing the end of a sentence. Will be associated to ``self.eos_token`` and
+ ``self.eos_token_id``.
+ unk_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
+ A special token representing an out-of-vocabulary token. Will be associated to ``self.unk_token`` and
+ ``self.unk_token_id``.
+ sep_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
+ A special token separating two different sentences in the same input (used by BERT for instance). Will be
+ associated to ``self.sep_token`` and ``self.sep_token_id``.
+ pad_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
+ A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by
+ attention mechanisms or loss computation. Will be associated to ``self.pad_token`` and
+ ``self.pad_token_id``.
+ cls_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
+ A special token representing the class of the input (used by BERT for instance). Will be associated to
+ ``self.cls_token`` and ``self.cls_token_id``.
+ mask_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
+ A special token representing a masked token (used by masked-language modeling pretraining objectives, like
+ BERT). Will be associated to ``self.mask_token`` and ``self.mask_token_id``.
+ additional_special_tokens (tuple or list of :obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
+ A tuple or a list of additional special tokens. Add them here to ensure they won't be split by the
+ tokenization process. Will be associated to ``self.additional_special_tokens`` and
+ ``self.additional_special_tokens_ids``.
"""
+@add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
class PreTrainedTokenizerBase(SpecialTokensMixin):
- """ Base class for slow and fast tokenizers.
+ """
+ Base class for :class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast`.
- Handle shared (mostly boiler plate) methods for slow and fast tokenizers.
+    Handles shared (mostly boilerplate) methods for those two classes.
"""
vocab_files_names: Dict[str, str] = {}
pretrained_vocab_files_map: Dict[str, Dict[str, str]] = {}
pretrained_init_configuration: Dict[str, Dict[str, Any]] = {}
- max_model_input_sizes: Dict[str, int] = {}
+ max_model_input_sizes: Dict[str, Optional[int]] = {}
model_input_names: List[str] = ["token_type_ids", "attention_mask"]
-
padding_side: str = "right"
def __init__(self, **kwargs):
@@ -1052,22 +1284,33 @@ def __init__(self, **kwargs):
@property
def max_len(self) -> int:
- """ Kept here for backward compatibility.
- Now renamed to `model_max_length` to avoid ambiguity.
"""
+ :obj:`int`: **Deprecated** Kept here for backward compatibility. Now renamed to :obj:`model_max_length` to
+ avoid ambiguity.
+ """
+ warnings.warn(
+ "The `max_len` attribute has been deprecated and will be removed in a future version, use `model_max_length` instead.",
+ FutureWarning,
+ )
return self.model_max_length
@property
def max_len_single_sentence(self) -> int:
+ """
+ :obj:`int`: The maximum length of a sentence that can be fed to the model.
+ """
return self.model_max_length - self.num_special_tokens_to_add(pair=False)
@property
def max_len_sentences_pair(self) -> int:
+ """
+ :obj:`int`: The maximum combined length of a pair of sentences that can be fed to the model.
+ """
return self.model_max_length - self.num_special_tokens_to_add(pair=True)
@max_len_single_sentence.setter
def max_len_single_sentence(self, value) -> int:
- """ For backward compatibility, allow to try to setup 'max_len_single_sentence' """
+        # For backward compatibility, allow trying to set up 'max_len_single_sentence'.
if value == self.model_max_length - self.num_special_tokens_to_add(pair=False) and self.verbose:
logger.warning(
"Setting 'max_len_single_sentence' is now deprecated. " "This value is automatically set up."
@@ -1079,7 +1322,7 @@ def max_len_single_sentence(self, value) -> int:
@max_len_sentences_pair.setter
def max_len_sentences_pair(self, value) -> int:
- """ For backward compatibility, allow to try to setup 'max_len_sentences_pair' """
+        # For backward compatibility, allow trying to set up 'max_len_sentences_pair'.
if value == self.model_max_length - self.num_special_tokens_to_add(pair=True) and self.verbose:
logger.warning(
"Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up."
@@ -1092,37 +1335,46 @@ def max_len_sentences_pair(self, value) -> int:
@classmethod
def from_pretrained(cls, *inputs, **kwargs):
r"""
- Instantiate a :class:`~transformers.PreTrainedTokenizer` (or a derived class) from a predefined tokenizer.
+ Instantiate a :class:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase` (or a derived class) from
+ a predefined tokenizer.
Args:
- pretrained_model_name_or_path: either:
-
- - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
- - a string with the `identifier name` of a predefined tokenizer that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
- - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
- - (not applicable to all derived classes, deprecated) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``.
-
- cache_dir: (`optional`) string:
- Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used.
-
- force_download: (`optional`) boolean, default False:
- Force to (re-)download the vocabulary files and override the cached versions if they exists.
-
- resume_download: (`optional`) boolean, default False:
- Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.
-
- proxies: (`optional`) dict, default None:
- A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
- The proxies are used on each request.
-
- inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method.
-
- kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~transformers.PreTrainedTokenizer` for details.
+ pretrained_model_name_or_path (:obj:`str`):
+ Can be either:
+
+ - A string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.,
+ ``bert-base-uncased``.
+ - A string with the `identifier name` of a predefined tokenizer that was user-uploaded to our S3, e.g.,
+ ``dbmdz/bert-base-german-cased``.
+ - A path to a `directory` containing vocabulary files required by the tokenizer, for instance saved
+ using the :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.save_pretrained`
+ method, e.g., ``./my_model_directory/``.
+ - (**Deprecated**, not applicable to all derived classes) A path or url to a single saved vocabulary
+ file (if and only if the tokenizer only requires a single vocabulary file like Bert or XLNet), e.g.,
+ ``./my_model_directory/vocab.txt``.
+ cache_dir (:obj:`str`, `optional`):
+                Path to a directory in which the downloaded predefined tokenizer vocabulary files should be cached if the
+ standard cache should not be used.
+ force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Whether or not to force the (re-)download of the vocabulary files and override the cached versions if they
+ exist.
+ resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Whether or not to attempt to resume the download of incompletely received files, instead of deleting
+                them and downloading again from scratch.
+            proxies (:obj:`Dict[str, str]`, `optional`):
+ A dictionary of proxy servers to use by protocol or endpoint, e.g.,
+ :obj:`{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each
+ request.
+ inputs (additional positional arguments, `optional`):
+ Will be passed along to the Tokenizer ``__init__`` method.
+ kwargs (additional keyword arguments, `optional`):
+ Will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like
+ ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``,
+ ``mask_token``, ``additional_special_tokens``. See parameters in the ``__init__`` for more details.
Examples::
- # We can't instantiate directly the base class `PreTrainedTokenizer` so let's show our examples on a derived class: BertTokenizer
-
+    # We can't directly instantiate the base class `PreTrainedTokenizerBase`, so let's show our examples on a derived class: BertTokenizer
# Download vocabulary from S3 and cache.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
@@ -1336,17 +1588,26 @@ def _from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs)
return tokenizer
- def save_pretrained(self, save_directory) -> Tuple[str]:
- """ Save the tokenizer vocabulary files together with:
- - added tokens,
- - special-tokens-to-class-attributes-mapping,
- - tokenizer instantiation positional and keywords inputs (e.g. do_lower_case for Bert).
+ def save_pretrained(self, save_directory: str) -> Tuple[str]:
+ """
+ Save the tokenizer vocabulary files together with:
+
+ - added tokens,
+ - special tokens to class attributes mapping,
+          - tokenizer instantiation positional and keyword inputs (e.g. do_lower_case for Bert).
- Warning: This won't save modifications you may have applied to the tokenizer after the instantiation
- (e.g. modifying tokenizer.do_lower_case after creation).
+        This method makes sure the full tokenizer can then be re-loaded using the
+ :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained` class method.
+
+ .. Warning::
+ This won't save modifications you may have applied to the tokenizer after the instantiation (for instance,
+ modifying :obj:`tokenizer.do_lower_case` after creation).
+
+ Args:
+            save_directory (:obj:`str`): The path to a directory where the tokenizer will be saved.
- This method make sure the full tokenizer can then be re-loaded using the
- :func:`~transformers.PreTrainedTokenizer.from_pretrained` class method.
+ Returns:
+ A tuple of :obj:`str`: The files saved.
"""
if os.path.isfile(save_directory):
logger.error("Provided path ({}) should be a directory, not a file".format(save_directory))
@@ -1388,35 +1649,40 @@ def save_pretrained(self, save_directory) -> Tuple[str]:
@add_end_docstrings(
ENCODE_KWARGS_DOCSTRING,
"""
- **kwargs: passed to the `self.tokenize()` method.
- """,
+ **kwargs: Passed along to the `.tokenize()` method.
+ """,
+ """
+ Returns:
+ :obj:`List[int]`, :obj:`torch.Tensor`, :obj:`tf.Tensor` or :obj:`np.ndarray`:
+ The tokenized ids of the text.
+ """,
)
def encode(
self,
text: Union[TextInput, PreTokenizedInput, EncodedInput],
text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
add_special_tokens: bool = True,
- padding: Union[bool, str] = False,
- truncation: Union[bool, str] = False,
+ padding: Union[bool, str, PaddingStrategy] = False,
+ truncation: Union[bool, str, TruncationStrategy] = False,
max_length: Optional[int] = None,
stride: int = 0,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs
- ):
+ ) -> List[int]:
"""
- Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary.
+ Converts a string to a sequence of ids (integer), using the tokenizer and vocabulary.
Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``.
Args:
text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`):
The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
- the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
- method)
- text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`, defaults to :obj:`None`):
+ the ``tokenize`` method) or a list of integers (tokenized string ids using the
+ ``convert_tokens_to_ids`` method).
+ text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`):
Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
- string using the `tokenize` method) or a list of integers (tokenized string ids using the
- `convert_tokens_to_ids` method)
+ string using the ``tokenize`` method) or a list of integers (tokenized string ids using the
+ ``convert_tokens_to_ids`` method).
"""
encoded_inputs = self.encode_plus(
text,
@@ -1438,8 +1704,9 @@ def num_special_tokens_to_add(self, pair: bool = False) -> int:
def _get_padding_truncation_strategies(
self, padding=False, truncation=False, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs
):
- """ Find the correct padding/truncation strategy with backward compatibility
- for old arguments (truncation_strategy and pad_to_max_length) and behaviors.
+ """
+ Find the correct padding/truncation strategy with backward compatibility
+ for old arguments (truncation_strategy and pad_to_max_length) and behaviors.
"""
old_truncation_strategy = kwargs.pop("truncation_strategy", "do_not_truncate")
old_pad_to_max_length = kwargs.pop("pad_to_max_length", False)
@@ -1558,8 +1825,8 @@ def __call__(
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
add_special_tokens: bool = True,
- padding: Union[bool, str] = False,
- truncation: Union[bool, str] = False,
+ padding: Union[bool, str, PaddingStrategy] = False,
+ truncation: Union[bool, str, TruncationStrategy] = False,
max_length: Optional[int] = None,
stride: int = 0,
is_pretokenized: bool = False,
@@ -1575,20 +1842,20 @@ def __call__(
**kwargs
) -> BatchEncoding:
"""
- Returns a dictionary containing the encoded sequence or sequence pair and additional information:
- the mask for sequence classification and the overflowing elements if a ``max_length`` is specified.
+ Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of
+ sequences.
Args:
- text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]``):
+ text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
The sequence or batch of sequences to be encoded.
- Each sequence can be a string or a list of strings (pre-tokenized string).
- If the sequences are provided as list of strings (pretokenized), you must set `is_pretokenized=True`
- (to lift the ambiguity with a batch of sequences)
- text_pair (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]``):
+ Each sequence can be a string or a list of strings (pretokenized string).
+ If the sequences are provided as list of strings (pretokenized), you must set
+ :obj:`is_pretokenized=True` (to lift the ambiguity with a batch of sequences).
+ text_pair (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
The sequence or batch of sequences to be encoded.
- Each sequence can be a string or a list of strings (pre-tokenized string).
- If the sequences are provided as list of strings (pretokenized), you must set `is_pretokenized=True`
- (to lift the ambiguity with a batch of sequences)
+ Each sequence can be a string or a list of strings (pretokenized string).
+ If the sequences are provided as list of strings (pretokenized), you must set
+ :obj:`is_pretokenized=True` (to lift the ambiguity with a batch of sequences).
"""
# Input type checking for clearer error
assert isinstance(text, str) or (
@@ -1680,8 +1947,8 @@ def encode_plus(
text: Union[TextInput, PreTokenizedInput, EncodedInput],
text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
add_special_tokens: bool = True,
- padding: Union[bool, str] = False,
- truncation: Union[bool, str] = False,
+ padding: Union[bool, str, PaddingStrategy] = False,
+ truncation: Union[bool, str, TruncationStrategy] = False,
max_length: Optional[int] = None,
stride: int = 0,
is_pretokenized: bool = False,
@@ -1697,18 +1964,20 @@ def encode_plus(
**kwargs
) -> BatchEncoding:
"""
- Returns a dictionary containing the encoded sequence or sequence pair and additional information:
- the mask for sequence classification and the overflowing elements if a ``max_length`` is specified.
+ Tokenize and prepare for the model a sequence or a pair of sequences.
+
+ .. warning::
+ This method is deprecated, ``__call__`` should be used instead.
Args:
- text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]` (the later only for not-fast tokenizers)):
+ text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]` (the latter only for not-fast tokenizers)):
The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
- the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
- method)
- text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`, defaults to :obj:`None`):
+ the ``tokenize`` method) or a list of integers (tokenized string ids using the
+ ``convert_tokens_to_ids`` method).
+ text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`):
Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
- string using the `tokenize` method) or a list of integers (tokenized string ids using the
- `convert_tokens_to_ids` method)
+ string using the ``tokenize`` method) or a list of integers (tokenized string ids using the
+ ``convert_tokens_to_ids`` method).
"""
# Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
@@ -1777,8 +2046,8 @@ def batch_encode_plus(
List[EncodedInputPair],
],
add_special_tokens: bool = True,
- padding: Union[bool, str] = False,
- truncation: Union[bool, str] = False,
+ padding: Union[bool, str, PaddingStrategy] = False,
+ truncation: Union[bool, str, TruncationStrategy] = False,
max_length: Optional[int] = None,
stride: int = 0,
is_pretokenized: bool = False,
@@ -1794,17 +2063,16 @@ def batch_encode_plus(
**kwargs
) -> BatchEncoding:
"""
- Returns a dictionary containing the encoded sequence or sequence pair and additional information:
- the mask for sequence classification and the overflowing elements if a ``max_length`` is specified.
+ Tokenize and prepare for the model a list of sequences or a list of pairs of sequences.
+
+ .. warning::
+ This method is deprecated, ``__call__`` should be used instead.
Args:
- batch_text_or_text_pairs (:obj:`List[str]`, :obj:`List[Tuple[str, str]]`,
- :obj:`List[List[str]]`, :obj:`List[Tuple[List[str], List[str]]]`,
- and for not-fast tokenizers, also:
- :obj:`List[List[int]]`, :obj:`List[Tuple[List[int], List[int]]]`):
+ batch_text_or_text_pairs (:obj:`List[str]`, :obj:`List[Tuple[str, str]]`, :obj:`List[List[str]]`, :obj:`List[Tuple[List[str], List[str]]]`, and for not-fast tokenizers, also :obj:`List[List[int]]`, :obj:`List[Tuple[List[int], List[int]]]`):
Batch of sequences or pair of sequences to be encoded.
This can be a list of string/string-sequences/int-sequences or a list of pair of
- string/string-sequences/int-sequence (see details in encode_plus)
+ string/string-sequences/int-sequence (see details in ``encode_plus``).
"""
# Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
@@ -1875,39 +2143,56 @@ def pad(
Dict[str, List[EncodedInput]],
List[Dict[str, EncodedInput]],
],
- padding: Union[bool, str] = True,
+ padding: Union[bool, str, PaddingStrategy] = True,
max_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
verbose: bool = True,
) -> BatchEncoding:
- """ Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length in the batch.
+ """
+ Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length
+ in the batch.
- Padding side (left/right) padding token ids are defined at the tokenizer level
- (with ``self.padding_side``, ``self.pad_token_id`` and ``self.pad_token_type_id``)
+        The padding side (left/right) and padding token ids are defined at the tokenizer level
+        (with ``self.padding_side``, ``self.pad_token_id`` and ``self.pad_token_type_id``).
Args:
- encoded_inputs: Dictionary of tokenized inputs (`Dict[str, List[int]]`) or batch of tokenized inputs.
- Batch of tokenized inputs can be given as dicts of lists or lists of dicts, both work so you can
- use ``tokenizer.pad()`` during pre-processing as well as in a PyTorch Dataloader collate function.
- (`Dict[str, List[List[int]]]` or `List[Dict[str, List[int]]]`).
- padding: Boolean or specific strategy to use for padding.
- Select a strategy to pad the returned sequences (according to the model's padding side and padding index) among:
- - 'longest' (or `True`) Pad to the longest sequence in the batch
- - 'max_length': Pad to the max length (default)
- - 'do_not_pad' (or `False`): Do not pad
- max_length: maximum length of the returned list and optionally padding length (see below).
- Will truncate by taking into account the special tokens.
- pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
- This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
+ encoded_inputs (:class:`~transformers.BatchEncoding`, list of :class:`~transformers.BatchEncoding`, :obj:`Dict[str, List[int]]`, :obj:`Dict[str, List[List[int]]` or :obj:`List[Dict[str, List[int]]]`):
+ Tokenized inputs. Can represent one input (:class:`~transformers.BatchEncoding` or
+ :obj:`Dict[str, List[int]]`) or a batch of tokenized inputs (list of
+                :class:`~transformers.BatchEncoding`, :obj:`Dict[str, List[List[int]]]` or
+                :obj:`List[Dict[str, List[int]]]`) so you can use this method during preprocessing as well as in a
+                PyTorch DataLoader collate function.
+            padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
+                Select a strategy to pad the returned sequences (according to the model's padding side and padding
+                index) among:
+
+                * :obj:`True` or :obj:`'longest'` (default): Pad to the longest sequence in the batch (or no padding
+                  if only a single sequence is provided).
+                * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
+                  maximum acceptable input length for the model if that argument is not provided.
+                * :obj:`False` or :obj:`'do_not_pad'`: No padding (i.e., can output a batch with sequences of
+                  different lengths).
+ max_length (:obj:`int`, `optional`):
+ Maximum length of the returned list and optionally padding length (see above).
+ pad_to_multiple_of (:obj:`int`, `optional`):
+                If set, will pad the sequence to a multiple of the provided value.
+
+ This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
>= 7.5 (Volta).
- return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics)
- return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`):
- Can be set to 'tf', 'pt' or 'np' to return respectively TensorFlow :obj:`tf.constant`,
- PyTorch :obj:`torch.Tensor` or Numpy :obj: `np.ndarray` instead of a list of python integers.
+ return_attention_mask (:obj:`bool`, `optional`):
+ Whether to return the attention mask. If left to the default, will return the attention mask according
+ to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
+
+ `What are attention masks? <../glossary.html#attention-mask>`__
+ return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`):
+ If set, will return tensors instead of list of python integers. Acceptable values are:
+
+ * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
+ * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
+ * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
verbose (:obj:`bool`, `optional`, defaults to :obj:`True`):
- Set to ``False`` to avoid printing infos and warnings.
+            Whether or not to print information and warnings.
"""
# If we have a list of dicts, let's convert it in a dict of lists
if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], (dict, BatchEncoding)):
@@ -1966,15 +2251,41 @@ def pad(
return BatchEncoding(batch_outputs, tensor_type=return_tensors)
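As a hedged sketch of the use case mentioned in the docstring above, `pad` can serve as a DataLoader collate function because it accepts a list of already-encoded examples; the checkpoint and example texts are placeholders and PyTorch is assumed to be installed.

```python
from torch.utils.data import DataLoader
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
features = [tokenizer("first example"), tokenizer("a much longer second example")]

def collate_fn(batch):
    # Pad every entry to the longest sequence in the batch and return tensors.
    return tokenizer.pad(batch, padding=True, return_tensors="pt")

loader = DataLoader(features, batch_size=2, collate_fn=collate_fn)
print(next(iter(loader))["input_ids"].shape)
```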
- def create_token_type_ids_from_sequences(self, token_ids_0: List, token_ids_1: Optional[List] = None) -> List[int]:
+ def create_token_type_ids_from_sequences(
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+ ) -> List[int]:
+ """
+ Create the token type IDs corresponding to the sequences passed.
+ `What are token type IDs? <../glossary.html#token-type-ids>`__
+
+        Should be overridden in a subclass if the model has a special way of building those.
+
+ Args:
+ token_ids_0 (:obj:`List[int]`): The first tokenized sequence.
+ token_ids_1 (:obj:`List[int]`, `optional`): The second tokenized sequence.
+
+ Returns:
+ :obj:`List[int]`: The token type ids.
+ """
if token_ids_1 is None:
return len(token_ids_0) * [0]
return [0] * len(token_ids_0) + [1] * len(token_ids_1)
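The default logic above is small enough to restate on its own (model-specific tokenizers such as BERT override it to account for their special tokens); the id values below are made up.

```python
def default_token_type_ids(token_ids_0, token_ids_1=None):
    # First sequence -> 0s, optional second sequence -> 1s (no special tokens added).
    if token_ids_1 is None:
        return len(token_ids_0) * [0]
    return [0] * len(token_ids_0) + [1] * len(token_ids_1)

print(default_token_type_ids([7, 8, 9]))          # [0, 0, 0]
print(default_token_type_ids([7, 8, 9], [4, 5]))  # [0, 0, 0, 1, 1]
```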
- def build_inputs_with_special_tokens(self, token_ids_0: List, token_ids_1: Optional[List] = None) -> List:
+ def build_inputs_with_special_tokens(
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+ ) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
- by concatenating and adding special tokens. This implementation does not add special tokens.
+ by concatenating and adding special tokens.
+
+        This implementation does not add special tokens and this method should be overridden in a subclass.
+
+ Args:
+ token_ids_0 (:obj:`List[int]`): The first tokenized sequence.
+ token_ids_1 (:obj:`List[int]`, `optional`): The second tokenized sequence.
+
+ Returns:
+ :obj:`List[int]`: The model input with special tokens.
"""
if token_ids_1 is None:
return token_ids_0
@@ -1986,8 +2297,8 @@ def prepare_for_model(
ids: List[int],
pair_ids: Optional[List[int]] = None,
add_special_tokens: bool = True,
- padding: Union[bool, str] = False,
- truncation: Union[bool, str] = False,
+ padding: Union[bool, str, PaddingStrategy] = False,
+ truncation: Union[bool, str, TruncationStrategy] = False,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
@@ -2002,15 +2313,18 @@ def prepare_for_model(
prepend_batch_axis: bool = False,
**kwargs
) -> BatchEncoding:
- """ Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model.
+ """
+        Prepares a sequence of input ids, or a pair of sequences of input ids so that it can be used by the model.
It adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
manages a moving window (with user defined stride) for overflowing tokens
Args:
- ids: list of tokenized input ids. Can be obtained from a string by chaining the
- `tokenize` and `convert_tokens_to_ids` methods.
- pair_ids: Optional second list of input ids. Can be obtained from a string by chaining the
- `tokenize` and `convert_tokens_to_ids` methods.
+ ids (:obj:`List[int]`):
+ Tokenized input ids of the first sequence. Can be obtained from a string by chaining the
+ ``tokenize`` and ``convert_tokens_to_ids`` methods.
+ pair_ids (:obj:`List[int]`, `optional`):
+ Tokenized input ids of the second sequence. Can be obtained from a string by chaining the
+ ``tokenize`` and ``convert_tokens_to_ids`` methods.
"""
if "return_lengths" in kwargs:
@@ -2113,27 +2427,46 @@ def truncate_sequences(
truncation_strategy: Union[str, TruncationStrategy] = "longest_first",
stride: int = 0,
) -> Tuple[List[int], List[int], List[int]]:
- """ Truncates a sequence pair in place to the maximum length.
+ """
+ Truncates a sequence pair in-place following the strategy.
Args:
- ids: list of tokenized input ids. Can be obtained from a string by chaining the
- `tokenize` and `convert_tokens_to_ids` methods.
- pair_ids: Optional second list of input ids. Can be obtained from a string by chaining the
- `tokenize` and `convert_tokens_to_ids` methods.
- num_tokens_to_remove (:obj:`int`, `optional`, defaults to ``0``):
- number of tokens to remove using the truncation strategy
- truncation_strategy (:obj:`string`, `optional`, defaults to "longest_first"):
- String selected in the following options:
-
- - 'longest_first' (default): Iteratively reduce the inputs sequence until the input is under max_length
- starting from the longest one at each token (when there is a pair of input sequences).
- Overflowing tokens only contains overflow from the first sequence.
- - 'only_first': Only truncate the first sequence. raise an error if the first sequence is shorter or equal to than num_tokens_to_remove.
- - 'only_second': Only truncate the second sequence
- - 'do_not_truncate'
- stride (:obj:`int`, `optional`, defaults to ``0``):
- If set to a number along with max_length, the overflowing tokens returned will contain some tokens
+ ids (:obj:`List[int]`):
+ Tokenized input ids of the first sequence. Can be obtained from a string by chaining the
+ ``tokenize`` and ``convert_tokens_to_ids`` methods.
+ pair_ids (:obj:`List[int]`, `optional`):
+ Tokenized input ids of the second sequence. Can be obtained from a string by chaining the
+ ``tokenize`` and ``convert_tokens_to_ids`` methods.
+ num_tokens_to_remove (:obj:`int`, `optional`, defaults to 0):
+ Number of tokens to remove using the truncation strategy.
+            truncation_strategy (:obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`'longest_first'`):
+ The strategy to follow for truncation. Can be:
+
+ * :obj:`'longest_first'`: Truncate to a maximum length specified with the argument
+ :obj:`max_length` or to the maximum acceptable input length for the model if that argument is not
+ provided. This will truncate token by token, removing a token from the longest sequence in the pair
+ if a pair of sequences (or a batch of pairs) is provided.
+ * :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to
+ the maximum acceptable input length for the model if that argument is not provided. This will only
+ truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
+ * :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or
+ to the maximum acceptable input length for the model if that argument is not provided. This will only
+ truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
+                * :obj:`'do_not_truncate'`: No truncation (i.e., can output batch with
+ sequence lengths greater than the model maximum admissible input size).
+ max_length (:obj:`int`, `optional`):
+ Controls the maximum length to use by one of the truncation/padding parameters.
+
+ If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum
+ length is required by one of the truncation/padding parameters. If the model has no specific maximum
+ input length (like XLNet) truncation/padding to a maximum length will be deactivated.
+ stride (:obj:`int`, `optional`, defaults to 0):
+ If set to a positive number, the overflowing tokens returned will contain some tokens
from the main sequence returned. The value of this argument defines the number of additional tokens.
+
+ Returns:
+ :obj:`Tuple[List[int], List[int], List[int]]`:
+ The truncated ``ids``, the truncated ``pair_ids`` and the list of overflowing tokens.
"""
if num_tokens_to_remove <= 0:
return ids, pair_ids, []
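A hedged sketch of calling `truncate_sequences` with the `'longest_first'` strategy described above; the id values are arbitrary and a BERT checkpoint is only loaded to obtain a tokenizer instance.

```python
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

ids, pair_ids = [1, 2, 3, 4, 5, 6], [7, 8, 9]
ids, pair_ids, overflowing = tokenizer.truncate_sequences(
    ids,
    pair_ids=pair_ids,
    num_tokens_to_remove=2,
    truncation_strategy="longest_first",  # drop tokens from the longer sequence first
)
print(ids, pair_ids)  # the two removed tokens came from the longer first sequence
```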
@@ -2193,7 +2526,8 @@ def _pad(
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
- """ Pad encoded inputs (on left/right and up to predefined legnth or max length in the batch)
+ """
+        Pad encoded inputs (on left/right and up to predefined length or max length in the batch).
Args:
encoded_inputs: Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
@@ -2262,9 +2596,15 @@ def batch_decode(
Convert a list of lists of token ids into a list of strings by calling decode.
Args:
- token_ids: list of tokenized input ids. Can be obtained using the `encode` or `encode_plus` methods.
- skip_special_tokens: if set to True, will replace special tokens.
- clean_up_tokenization_spaces: if set to True, will clean up the tokenization spaces.
+ sequences (:obj:`List[List[int]]`):
+ List of tokenized input ids. Can be obtained using the ``__call__`` method.
+ skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ Whether or not to remove special tokens in the decoding.
+ clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ Whether or not to clean up the tokenization spaces.
+
+ Returns:
+ :obj:`List[str]`: The list of decoded sentences.
"""
return [
self.decode(
@@ -2277,30 +2617,38 @@ def decode(
self, token_ids: List[int], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True
) -> str:
"""
- Converts a sequence of ids (integer) in a string, using the tokenizer and vocabulary
+ Converts a sequence of ids to a string, using the tokenizer and vocabulary
with options to remove special tokens and clean up tokenization spaces.
+
Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.
Args:
- token_ids: list of tokenized input ids. Can be obtained using the `encode` or `encode_plus` methods.
- skip_special_tokens: if set to True, will replace special tokens.
- clean_up_tokenization_spaces: if set to True, will clean up the tokenization spaces.
+ token_ids (:obj:`List[int]`):
+ List of tokenized input ids. Can be obtained using the ``__call__`` method.
+ skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ Whether or not to remove special tokens in the decoding.
+ clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ Whether or not to clean up the tokenization spaces.
+
+ Returns:
+ :obj:`str`: The decoded sentence.
"""
raise NotImplementedError
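As a quick illustration of the documented behavior (not part of the patch), decoding round-trips token ids back to text, with ``skip_special_tokens`` dropping markers such as ``[CLS]``/``[SEP]``; the checkpoint name is only illustrative:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # illustrative checkpoint

ids = tokenizer("Hello, world!")["input_ids"]
print(tokenizer.decode(ids, skip_special_tokens=True))

# batch_decode applies the same conversion to a list of sequences.
batch_ids = tokenizer(["first sentence", "second sentence"])["input_ids"]
print(tokenizer.batch_decode(batch_ids, skip_special_tokens=True))
```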
def get_special_tokens_mask(
- self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
Args:
- token_ids_0: list of ids (must not contain special tokens)
- token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
- for sequence pairs
- already_has_special_tokens: (default False) Set to True if the token list is already formated with
- special tokens for the model
+ token_ids_0 (:obj:`List[int]`):
+ List of ids of the first sequence.
+ token_ids_1 (:obj:`List[int]`, `optional`):
+ List of ids of the second sequence.
+ already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ Whether or not the token list is already formatted with special tokens for the model.
Returns:
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
@@ -2320,7 +2668,14 @@ def get_special_tokens_mask(
@staticmethod
def clean_up_tokenization(out_string: str) -> str:
- """ Clean up a list of simple English tokenization artifacts like spaces before punctuations and abreviated forms.
+ """
+ Clean up a list of simple English tokenization artifacts like spaces before punctuation and abbreviated forms.
+
+ Args:
+ out_string (:obj:`str`): The text to clean up.
+
+ Returns:
+ :obj:`str`: The cleaned-up string.
"""
out_string = (
out_string.replace(" .", ".")
diff --git a/src/transformers/tokenization_utils_fast.py b/src/transformers/tokenization_utils_fast.py
index 60dc5c9a3a20..fd1778590d48 100644
--- a/src/transformers/tokenization_utils_fast.py
+++ b/src/transformers/tokenization_utils_fast.py
@@ -25,7 +25,9 @@
from tokenizers.decoders import Decoder as DecoderFast
from tokenizers.implementations import BaseTokenizer as BaseTokenizerFast
+from .file_utils import add_end_docstrings
from .tokenization_utils_base import (
+ INIT_TOKENIZER_DOCSTRING,
AddedToken,
BatchEncoding,
PaddingStrategy,
@@ -41,10 +43,17 @@
logger = logging.getLogger(__name__)
+@add_end_docstrings(
+ INIT_TOKENIZER_DOCSTRING,
+ """
+ .. automethod:: __call__
+ """,
+)
class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
- """ Base class for all fast tokenizers (wrapping HuggingFace tokenizers library).
+ """
+ Base class for all fast tokenizers (wrapping HuggingFace tokenizers library).
- Inherits from PreTrainedTokenizerBase.
+ Inherits from :class:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase`.
Handles all the shared methods for tokenization and special tokens, as well as methods for
downloading/caching/loading pretrained tokenizers, as well as adding tokens to the vocabulary.
@@ -52,54 +61,6 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
This class also contains the added tokens in a unified way on top of all tokenizers so we don't
have to handle the specific vocabulary augmentation methods of the various underlying
dictionary structures (BPE, sentencepiece...).
-
- Class attributes (overridden by derived classes):
-
- - ``vocab_files_names``: a python ``dict`` with, as keys, the ``__init__`` keyword name of each vocabulary file
- required by the model, and as associated values, the filename for saving the associated file (string).
- - ``pretrained_vocab_files_map``: a python ``dict of dict`` the high-level keys
- being the ``__init__`` keyword name of each vocabulary file required by the model, the low-level being the
- `short-cut-names` (string) of the pretrained models with, as associated values, the `url` (string) to the
- associated pretrained vocabulary file.
- - ``max_model_input_sizes``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained
- models, and as associated values, the maximum length of the sequence inputs of this model, or None if the
- model has no maximum input size.
- - ``pretrained_init_configuration``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the
- pretrained models, and as associated values, a dictionnary of specific arguments to pass to the
- ``__init__``method of the tokenizer class for this pretrained model when loading the tokenizer with the
- ``from_pretrained()`` method.
-
- Args:
- - ``tokenizer`` (`BaseTokenizerFast`): A Fast tokenizer from the HuggingFace tokenizer library (in low level Rust language)
- - ``model_max_length``: (`Optional`) int: the maximum length in number of tokens for the inputs to the transformer model.
- When the tokenizer is loaded with `from_pretrained`, this will be set to the value stored for the associated
- model in ``max_model_input_sizes`` (see above). If no value is provided, will default to VERY_LARGE_INTEGER (`int(1e30)`).
- no associated max_length can be found in ``max_model_input_sizes``.
- - ``padding_side``: (`Optional`) string: the side on which the model should have padding applied.
- Should be selected between ['right', 'left']
- - ``model_input_names``: (`Optional`) List[string]: the list of the forward pass inputs accepted by the
- model ("token_type_ids", "attention_mask"...).
- - ``bos_token``: (`Optional`) string: a beginning of sentence token.
- Will be associated to ``self.bos_token`` and ``self.bos_token_id``
- - ``eos_token``: (`Optional`) string: an end of sentence token.
- Will be associated to ``self.eos_token`` and ``self.eos_token_id``
- - ``unk_token``: (`Optional`) string: an unknown token.
- Will be associated to ``self.unk_token`` and ``self.unk_token_id``
- - ``sep_token``: (`Optional`) string: a separation token (e.g. to separate context and query in an input sequence).
- Will be associated to ``self.sep_token`` and ``self.sep_token_id``
- - ``pad_token``: (`Optional`) string: a padding token.
- Will be associated to ``self.pad_token`` and ``self.pad_token_id``
- - ``cls_token``: (`Optional`) string: a classification token (e.g. to extract a summary of an input sequence
- leveraging self-attention along the full depth of the model).
- Will be associated to ``self.cls_token`` and ``self.cls_token_id``
- - ``mask_token``: (`Optional`) string: a masking token (e.g. when training a model with masked-language
- modeling). Will be associated to ``self.mask_token`` and ``self.mask_token_id``
- - ``additional_special_tokens``: (`Optional`) list: a list of additional special tokens.
- Adding all special tokens here to ensure they won't be split by the tokenization process.
- Will be associated to ``self.additional_special_tokens`` and ``self.additional_special_tokens_ids``
-
-
- .. automethod:: __call__
"""
def __init__(self, tokenizer: BaseTokenizerFast, **kwargs):
@@ -118,26 +79,53 @@ def is_fast(self) -> bool:
@property
def vocab_size(self) -> int:
+ """
+ :obj:`int`: Size of the base vocabulary (without the added tokens).
+ """
return self._tokenizer.get_vocab_size(with_added_tokens=False)
def get_vocab(self) -> Dict[str, int]:
+ """
+ Returns the vocabulary as a dictionary of token to index.
+
+ :obj:`tokenizer.get_vocab()[token]` is equivalent to :obj:`tokenizer.convert_tokens_to_ids(token)` when
+ :obj:`token` is in the vocab.
+
+ Returns:
+ :obj:`Dict[str, int]`: The vocabulary.
+ """
return self._tokenizer.get_vocab(with_added_tokens=True)
def get_added_vocab(self) -> Dict[str, int]:
+ """
+ Returns the added tokens in the vocabulary as a dictionary of token to index.
+
+ Returns:
+ :obj:`Dict[str, int]`: The added tokens.
+ """
base_vocab = self._tokenizer.get_vocab(with_added_tokens=False)
full_vocab = self._tokenizer.get_vocab(with_added_tokens=True)
added_vocab = dict((tok, index) for tok, index in full_vocab.items() if tok not in base_vocab)
return added_vocab
def __len__(self) -> int:
+ """
+ Size of the full vocabulary with the added tokens.
+ """
return self._tokenizer.get_vocab_size(with_added_tokens=True)
@property
def backend_tokenizer(self) -> BaseTokenizerFast:
+ """
+ :obj:`tokenizers.implementations.BaseTokenizer`: The Rust tokenizer used as a backend.
+ """
return self._tokenizer
@property
def decoder(self) -> DecoderFast:
+ """
+ :obj:`tokenizers.decoders.Decoder`: The Rust decoder for this tokenizer.
+ """
return self._tokenizer._tokenizer.decoder
def _convert_encoding(
@@ -186,8 +174,15 @@ def _convert_encoding(
return encoding_dict
def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
- """ Converts a token string (or a sequence of tokens) in a single integer id
- (or a sequence of ids), using the vocabulary.
+ """
+ Converts a token string (or a sequence of tokens) to a single integer id (or a sequence of ids), using the
+ vocabulary.
+
+ Args:
+ tokens (:obj:`str` or :obj:`List[str]`): One or several token(s) to convert to token id(s).
+
+ Returns:
+ :obj:`int` or :obj:`List[int]`: The token id or list of token ids.
"""
if tokens is None:
return None
@@ -216,16 +211,38 @@ def _add_tokens(self, new_tokens: List[Union[str, AddedToken]], special_tokens=F
return self._tokenizer.add_tokens(new_tokens)
def num_special_tokens_to_add(self, pair: bool = False) -> int:
+ """
+ Returns the number of added tokens when encoding a sequence with special tokens.
+
+ .. note::
+ This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not
+ put this inside your training loop.
+
+ Args:
+ pair (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ Whether the number of added tokens should be computed in the case of a sequence pair or a single
+ sequence.
+
+ Returns:
+ :obj:`int`: Number of special tokens added to sequences.
+ """
return self._tokenizer.num_special_tokens_to_add(pair)
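A short illustration of the behavior described in the note above, assuming a BERT-style checkpoint that wraps inputs in ``[CLS]``/``[SEP]`` (the checkpoint name is only illustrative):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # illustrative checkpoint

print(tokenizer.num_special_tokens_to_add(pair=False))  # 2 expected: [CLS] ... [SEP]
print(tokenizer.num_special_tokens_to_add(pair=True))   # 3 expected: [CLS] ... [SEP] ... [SEP]
```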
def convert_ids_to_tokens(
self, ids: Union[int, List[int]], skip_special_tokens: bool = False
) -> Union[str, List[str]]:
- """ Converts a single index or a sequence of indices (integers) in a token "
- (resp.) a sequence of tokens (str), using the vocabulary and added tokens.
+ """
+ Converts a single index or a sequence of indices to a token or a sequence of tokens, using the vocabulary
+ and added tokens.
+
+ Args:
+ ids (:obj:`int` or :obj:`List[int]`):
+ The token id (or token ids) to convert to tokens.
+ skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ Whether or not to remove special tokens in the decoding.
- Args:
- skip_special_tokens: Don't decode special tokens (self.all_special_tokens). Default: False
+ Returns:
+ :obj:`str` or :obj:`List[str]`: The decoded token(s).
"""
if isinstance(ids, int):
return self._tokenizer.id_to_token(ids)
@@ -238,6 +255,20 @@ def convert_ids_to_tokens(
return tokens
def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False) -> List[str]:
+ """
+ Converts a string to a sequence of tokens, using the backend Rust tokenizer.
+
+ Args:
+ text (:obj:`str`):
+ The sequence to be encoded.
+ pair (:obj:`str`, `optional`):
+ A second sequence to be encoded with the first.
+ add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ Whether or not to add the special tokens associated with the corresponding model.
+
+ Returns:
+ :obj:`List[str]`: The list of tokens.
+ """
return self._tokenizer.encode(text, pair, add_special_tokens=add_special_tokens).tokens
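For reference, a minimal sketch of the new ``tokenize`` docstring in action on a fast tokenizer; the checkpoint name is only illustrative:

```python
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")  # illustrative checkpoint

print(tokenizer.tokenize("Tokenization is fun"))
# Passing add_special_tokens=True also surfaces [CLS] and [SEP] in the output.
print(tokenizer.tokenize("Tokenization is fun", add_special_tokens=True))
```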
def set_truncation_and_padding(
@@ -248,20 +279,26 @@ def set_truncation_and_padding(
stride: int,
pad_to_multiple_of: Optional[int],
):
- """ Define the truncation and the padding strategies for fast tokenizers
- (provided by HuggingFace tokenizers library) and restore the tokenizer settings afterwards.
-
- The provided tokenizer has no padding / truncation strategy
- before the managed section. If your tokenizer set a padding / truncation strategy before,
- then it will be reset to no padding/truncation when exiting the managed section.
-
- Args:
- padding_strategy (:obj:`PaddingStrategy`): The kind of padding that will be applied to the input
- truncation_strategy (:obj:`TruncationStrategy`): The kind of truncation that will be applied to the input
- max_length (:obj:`int`): The maximum size of the sequence
- stride (:obj:`int`): The stride to use when handling overflow
- pad_to_multiple_of (:obj:`int`, `optional`, defaults to `None`)
-
+ """
+ Define the truncation and the padding strategies for fast tokenizers (provided by HuggingFace tokenizers
+ library) and restore the tokenizer settings afterwards.
+
+ The provided tokenizer has no padding / truncation strategy before the managed section. If your tokenizer had a
+ padding / truncation strategy set before, it will be reset to no padding / truncation when exiting the managed
+ section.
+
+ Args:
+ padding_strategy (:class:`~transformers.tokenization_utils_base.PaddingStrategy`):
+ The kind of padding that will be applied to the input.
+ truncation_strategy (:class:`~transformers.tokenization_utils_base.TruncationStrategy`):
+ The kind of truncation that will be applied to the input.
+ max_length (:obj:`int`):
+ The maximum size of a sequence.
+ stride (:obj:`int`):
+ The stride to use when handling overflow.
+ pad_to_multiple_of (:obj:`int`, `optional`):
+ If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
+ the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
"""
# Set truncation and padding on the backend tokenizer
if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE:
@@ -436,6 +473,23 @@ def _encode_plus(
def decode(
self, token_ids: List[int], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True
) -> str:
+ """
+ Converts a sequence of ids to a string, using the tokenizer and vocabulary
+ with options to remove special tokens and clean up tokenization spaces.
+
+ Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.
+
+ Args:
+ token_ids (:obj:`List[int]`):
+ List of tokenized input ids. Can be obtained using the ``__call__`` method.
+ skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ Whether or not to remove special tokens in the decoding.
+ clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ Whether or not to clean up the tokenization spaces.
+
+ Returns:
+ :obj:`str`: The decoded sentence.
+ """
text = self._tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)
if clean_up_tokenization_spaces:
@@ -445,6 +499,20 @@ def decode(
return text
def save_vocabulary(self, save_directory: str) -> Tuple[str]:
+ """
+ Save the tokenizer vocabulary to a directory. This method does *NOT* save added tokens
+ and special token mappings.
+
+ .. warning::
+ Please use :meth:`~transformers.PreTrainedTokenizer.save_pretrained` to save the full tokenizer state if
+ you want to reload it using the :meth:`~transformers.PreTrainedTokenizer.from_pretrained` class method.
+
+ Args:
+ save_directory (:obj:`str`): The path to a directory where the tokenizer will be saved.
+
+ Returns:
+ A tuple of :obj:`str`: The files saved.
+ """
if os.path.isdir(save_directory):
files = self._tokenizer.save_model(save_directory)
else:
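As the warning in the docstring suggests, the full tokenizer state is normally saved and reloaded with ``save_pretrained``/``from_pretrained``; a hedged sketch with an illustrative checkpoint and a hypothetical target directory:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # illustrative checkpoint
tokenizer.save_pretrained("./my-tokenizer")                     # hypothetical directory
reloaded = AutoTokenizer.from_pretrained("./my-tokenizer")      # restores added and special tokens too
```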
From a2f6d521c1d7ebd7e079bc62bee014c8d00b2547 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Thu, 30 Jul 2020 14:18:27 -0700
Subject: [PATCH 065/127] typos (#6162)
* 2 small typos
* more typos
* correct path
---
src/transformers/benchmark/benchmark_utils.py | 6 +++---
tests/test_benchmark.py | 4 ++--
2 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/src/transformers/benchmark/benchmark_utils.py b/src/transformers/benchmark/benchmark_utils.py
index 757669e2b83c..99a76bd7f9a4 100644
--- a/src/transformers/benchmark/benchmark_utils.py
+++ b/src/transformers/benchmark/benchmark_utils.py
@@ -165,7 +165,7 @@ class MemorySummary(NamedTuple):
- `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace`
by substracting the memory after executing each line from the memory before executing said line.
- `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each line
- obtained by summing repeted memory increase for a line if it's executed several times.
+ obtained by summing repeated memory increase for a line if it's executed several times.
The list is sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory is released)
- `total`: total memory increase during the full tracing as a `Memory` named tuple (see below).
Line with memory release (negative consumption) are ignored if `ignore_released_memory` is `True` (default).
@@ -310,7 +310,7 @@ def start_memory_tracing(
gpus_to_trace: Optional[List[int]] = None,
) -> MemoryTrace:
""" Setup line-by-line tracing to record rss mem (RAM) at each line of a module or sub-module.
- See `../../examples/benchmarks.py for a usage example.
+ See `../../../examples/benchmarking/` for usage examples.
Current memory consumption is returned using psutil and in particular is the RSS memory
"Resident Set Size†(the non-swapped physical memory the process is using).
See https://psutil.readthedocs.io/en/latest/#psutil.Process.memory_info
@@ -468,7 +468,7 @@ def stop_memory_tracing(
- `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace`
by substracting the memory after executing each line from the memory before executing said line.
- `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each line
- obtained by summing repeted memory increase for a line if it's executed several times.
+ obtained by summing repeated memory increase for a line if it's executed several times.
The list is sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory is released)
- `total`: total memory increase during the full tracing as a `Memory` named tuple (see below).
Line with memory release (negative consumption) are ignored if `ignore_released_memory` is `True` (default).
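For context, a rough sketch of how the tracing API described above is typically used; the tiny checkpoint name is taken from the tests below and is only illustrative:

```python
from transformers import AutoModel, AutoTokenizer
from transformers.benchmark.benchmark_utils import start_memory_tracing, stop_memory_tracing

trace = start_memory_tracing("transformers")  # record RSS line by line inside the transformers package

tokenizer = AutoTokenizer.from_pretrained("sshleifer/tiny-gpt2")
model = AutoModel.from_pretrained("sshleifer/tiny-gpt2")
model(**tokenizer("hello", return_tensors="pt"))

summary = stop_memory_tracing(trace)
print(summary.total)  # total memory increase recorded during the traced section
```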
diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
index fb8e3ae24ca1..127068ed90bf 100644
--- a/tests/test_benchmark.py
+++ b/tests/test_benchmark.py
@@ -86,7 +86,7 @@ def test_inference_fp16(self):
self.check_results_dict_not_empty(results.time_inference_result)
self.check_results_dict_not_empty(results.memory_inference_result)
- def test_inference_no_model_no_architecuters(self):
+ def test_inference_no_model_no_architectures(self):
MODEL_ID = "sshleifer/tiny-gpt2"
config = AutoConfig.from_pretrained(MODEL_ID)
# set architectures equal to `None`
@@ -119,7 +119,7 @@ def test_train_no_configs(self):
self.check_results_dict_not_empty(results.time_train_result)
self.check_results_dict_not_empty(results.memory_train_result)
- @unittest.skipIf(torch_device == "cpu", "Cant do half precision")
+ @unittest.skipIf(torch_device == "cpu", "Can't do half precision")
def test_train_no_configs_fp16(self):
MODEL_ID = "sshleifer/tiny-gpt2"
benchmark_args = PyTorchBenchmarkArguments(
From c0b93a1c7a961e30b30d02d641c9d22120ef5d73 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Thu, 30 Jul 2020 15:00:02 -0700
Subject: [PATCH 066/127] correct the correction (#6163)
---
src/transformers/benchmark/benchmark_utils.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/transformers/benchmark/benchmark_utils.py b/src/transformers/benchmark/benchmark_utils.py
index 99a76bd7f9a4..9b64dd530873 100644
--- a/src/transformers/benchmark/benchmark_utils.py
+++ b/src/transformers/benchmark/benchmark_utils.py
@@ -310,7 +310,7 @@ def start_memory_tracing(
gpus_to_trace: Optional[List[int]] = None,
) -> MemoryTrace:
""" Setup line-by-line tracing to record rss mem (RAM) at each line of a module or sub-module.
- See `../../../examples/benchmarking/` for usage examples.
+ See `./benchmark.py` for usage examples.
Current memory consumption is returned using psutil and in particular is the RSS memory
"Resident Set Size†(the non-swapped physical memory the process is using).
See https://psutil.readthedocs.io/en/latest/#psutil.Process.memory_info
From 7231f7b503df98d2c29a193da969f92186e9852a Mon Sep 17 00:00:00 2001
From: Funtowicz Morgan
Date: Fri, 31 Jul 2020 09:45:13 +0200
Subject: [PATCH 067/127] Enable ONNX/ONNXRuntime optimizations through
converter script (#6131)
* Add onnxruntime transformers optimization support
Signed-off-by: Morgan Funtowicz
* Added Optimization section in ONNX/ONNXRuntime documentation.
Signed-off-by: Morgan Funtowicz
* Improve note reference
Signed-off-by: Morgan Funtowicz
* Fixing imports order.
Signed-off-by: Morgan Funtowicz
* Add warning about different level of optimization between torch and tf export.
Signed-off-by: Morgan Funtowicz
* Address @LysandreJik wording suggestion
Co-authored-by: Lysandre Debut
* Address @LysandreJik wording suggestion
Co-authored-by: Lysandre Debut
* Always optimize model before quantization for maximum performances.
Signed-off-by: Morgan Funtowicz
* Address comments on the documentation.
Signed-off-by: Morgan Funtowicz
* Improve TensorFlow optimization message as suggested by @yufenglee
Signed-off-by: Morgan Funtowicz
* Removed --optimize parameter
Signed-off-by: Morgan Funtowicz
* Warn the user about current quantization limitation when model is larger than 2GB.
Signed-off-by: Morgan Funtowicz
* Trigger CI for last check
* Small change in print for the optimization section.
Signed-off-by: Morgan Funtowicz
Co-authored-by: Lysandre Debut
---
docs/source/serialization.rst | 37 +++++++++--
src/transformers/convert_graph_to_onnx.py | 76 ++++++++++++++++++-----
2 files changed, 95 insertions(+), 18 deletions(-)
diff --git a/docs/source/serialization.rst b/docs/source/serialization.rst
index 15a1f3771ec5..5026d2b7a0b0 100644
--- a/docs/source/serialization.rst
+++ b/docs/source/serialization.rst
@@ -5,7 +5,7 @@ Exporting transformers models
ONNX / ONNXRuntime
==============================================
-Projects ONNX (Open Neural Network eXchange) and ONNXRuntime (ORT) are part of an effort from leading industries in the AI field
+Projects `ONNX (Open Neural Network eXchange) `_ and `ONNXRuntime (ORT) `_ are part of an effort from leading industries in the AI field
to provide a unified and community-driven format to store and, by extension, efficiently execute neural network leveraging a variety
of hardware and dedicated optimizations.
@@ -34,9 +34,36 @@ The conversion tool works for both PyTorch and Tensorflow models and ensures:
Also, the conversion tool supports different options which let you tune the behavior of the generated model:
-* Change the target opset version of the generated model: More recent opset generally supports more operator and enables faster inference.
-* Export pipeline specific prediction heads: Allow to export model along with its task-specific prediction head(s).
-* Use the external data format (PyTorch only): Lets you export model which size is above 2Gb (`More info `_).
+* **Change the target opset version of the generated model.** (More recent opsets generally support more operators and enable faster inference)
+
+* **Export pipeline-specific prediction heads.** (Allows exporting the model along with its task-specific prediction head(s))
+
+* **Use the external data format (PyTorch only).** (Lets you export models whose size is above 2GB (`More info `_))
+
+
+Optimizations
+------------------------------------------------
+
+ONNXRuntime includes some transformers-specific transformations to leverage optimized operations in the graph.
+Below are some of the optimizations that can be enabled to speed up inference through ONNXRuntime (*see note below*):
+
+* Constant folding
+* Attention Layer fusing
+* Skip connection LayerNormalization fusing
+* FastGeLU approximation
+
+
+Fortunately, you can let ONNXRuntime find all the possible optimized operators for you. Simply add ``--optimize``
+when exporting your model through ``convert_graph_to_onnx.py``.
+
+Example:
+
+.. code-block:: bash
+
+ python convert_graph_to_onnx.py --framework --model bert-base-cased --optimize bert-base-cased.onnx
+
+.. note::
+ For more information about the optimizations enabled by ONNXRuntime, please have a look at the (`ONNXRuntime Github `_)
Quantization
------------------------------------------------
@@ -85,6 +112,8 @@ Example of quantized BERT model export:
above command will contain the original ONNX model storing `float32` weights.
The second one, with ``-quantized`` suffix, will hold the quantized parameters.
+.. note::
+ The quantization export gives the best performances when used in combination with ``--optimize``.
TorchScript
=======================================
diff --git a/src/transformers/convert_graph_to_onnx.py b/src/transformers/convert_graph_to_onnx.py
index 72082ab0b5fb..c79fe644ba38 100644
--- a/src/transformers/convert_graph_to_onnx.py
+++ b/src/transformers/convert_graph_to_onnx.py
@@ -3,7 +3,7 @@
from pathlib import Path
from typing import Dict, List, Optional, Tuple
-from packaging.version import parse
+from packaging.version import Version, parse
from transformers import is_tf_available, is_torch_available
from transformers.file_utils import ModelOutput
@@ -72,7 +72,7 @@ def generate_identified_filename(filename: Path, identifier: str) -> Path:
return filename.parent.joinpath(filename.stem + identifier).with_suffix(filename.suffix)
-def ensure_onnxruntime_installed():
+def check_onnxruntime_requirements(minimum_version: Version):
"""
Check onnxruntime is installed and if the installed version match is recent enough.
Raises:
@@ -88,7 +88,7 @@ def ensure_onnxruntime_installed():
if ort_version < ORT_QUANTIZE_MINIMUM_VERSION:
raise ImportError(
f"We found an older version of onnxruntime ({onnxruntime.__version__}) "
- f"but we require onnxruntime to be >= 1.4.0 to enable all the conversions options.\n"
+ f"but we require onnxruntime to be >= {minimum_version} to enable all the conversions options.\n"
f"Please update onnxruntime by running `pip install --upgrade onnxruntime`"
)
@@ -330,6 +330,30 @@ def convert(
convert_tensorflow(nlp, opset, output)
+def optimize(onnx_model_path: Path) -> Path:
+ """
+ Load the model at the specified path and let onnxruntime run its graph transformations
+ to enable all the possible optimizations.
+ Args:
+ onnx_model_path: filepath where the model binary description is stored
+
+ Returns: Path where the optimized model binary description has been saved
+
+ """
+ from onnxruntime import SessionOptions, InferenceSession
+
+ # Generate model name with suffix "optimized"
+ opt_model_path = generate_identified_filename(onnx_model_path, "-optimized")
+ sess_option = SessionOptions()
+ sess_option.optimized_model_filepath = opt_model_path.as_posix()
+ _ = InferenceSession(onnx_model_path.as_posix(), sess_option)
+
+ print(f"Optimized model has been written at {opt_model_path}: \N{heavy check mark}")
+ print("/!\\ Optimized model contains hardware specific operators which might not be portable. /!\\")
+
+ return opt_model_path
+
+
def quantize(onnx_model_path: Path) -> Path:
"""
Quantize the weights of the model from float32 to int8 to allow very efficient inference on modern CPU.
@@ -338,17 +362,18 @@ def quantize(onnx_model_path: Path) -> Path:
Returns: The Path generated for the quantized
"""
-
try:
- ensure_onnxruntime_installed()
import onnx
- from onnxruntime import __version__ as ort_version
from onnxruntime.quantization import quantize, QuantizationMode
- print(f"Found ONNX: {onnx.__version__}")
- print(f"Found ONNXRuntime: {ort_version}")
-
onnx_model = onnx.load(onnx_model_path.as_posix())
+
+ # Discussed with @yufenglee from ONNX runtime, this will be addressed in the next release of onnxruntime
+ print(
+ "As of onnxruntime 1.4.0, models larger than 2GB will fail to quantize due to protobuf constraint.\n"
+ "This limitation will be removed in the next release of onnxruntime."
+ )
+
quantized_model = quantize(
model=onnx_model, quantization_mode=QuantizationMode.IntegerOps, force_fusions=True, symmetric_weight=True,
)
@@ -357,11 +382,11 @@ def quantize(onnx_model_path: Path) -> Path:
quantized_model_path = generate_identified_filename(onnx_model_path, "-quantized")
# Save model
- print(f"Storing quantized model at {quantized_model_path}")
- onnx.save(quantized_model, quantized_model_path.as_posix())
+ print(f"Quantized model has been written at {quantized_model_path}: \N{heavy check mark}")
+ onnx.save_model(quantized_model, quantized_model_path.as_posix())
return quantized_model_path
- except ImportError as ie:
+ except Exception as ie:
print(f"Error while quantizing the model:\n{str(ie)}")
@@ -369,7 +394,7 @@ def verify(path: Path):
from onnxruntime import InferenceSession, SessionOptions
from onnxruntime.capi.onnxruntime_pybind11_state import RuntimeException
- print(f"Checking ONNX model loading from: {path}")
+ print(f"Checking ONNX model loading from: {path} ...")
try:
onnx_options = SessionOptions()
_ = InferenceSession(path.as_posix(), onnx_options, providers=["CPUExecutionProvider"])
@@ -386,6 +411,7 @@ def verify(path: Path):
args.output = Path(args.output).absolute()
try:
+ print("\n====== Converting model to ONNX ======")
# Convert
convert(
args.framework,
@@ -398,12 +424,34 @@ def verify(path: Path):
)
if args.quantize:
- args.quantized_output = quantize(args.output)
+ # Ensure requirements for quantization on onnxruntime are met
+ check_onnxruntime_requirements(ORT_QUANTIZE_MINIMUM_VERSION)
+
+ # onnxruntime optimizations don't provide the same level of performance on TensorFlow as on PyTorch
+ if args.framework == "tf":
+ print(
+ "\t Using TensorFlow might not provide the same optimization level compared to PyTorch.\n"
+ "\t For TensorFlow users you can try optimizing the model directly through onnxruntime_tools.\n"
+ "\t For more information, please refer to the onnxruntime documentation:\n"
+ "\t\thttps://github.com/microsoft/onnxruntime/tree/master/onnxruntime/python/tools/transformers\n"
+ )
+
+ print("\n====== Optimizing ONNX model ======")
+
+ # Quantization works best when using the optimized version of the model
+ args.optimized_output = optimize(args.output)
+
+ # Do the quantization on the right graph
+ args.quantized_output = quantize(args.optimized_output)
# And verify
if args.check_loading:
+ print("\n====== Check exported ONNX model(s) ======")
verify(args.output)
+ if hasattr(args, "optimized_output"):
+ verify(args.optimized_output)
+
if hasattr(args, "quantized_output"):
verify(args.quantized_output)
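The same convert → optimize → quantize → verify flow wired up in the ``__main__`` block above can be sketched programmatically; the signatures are assumed to match this version of `convert_graph_to_onnx.py`, and the checkpoint and output path are only illustrative:

```python
from pathlib import Path

from transformers.convert_graph_to_onnx import convert, optimize, quantize, verify

output = Path("onnx/bert-base-cased.onnx")    # hypothetical output path (parent folder should be new/empty)
convert("pt", "bert-base-cased", output, 11)  # export with the PyTorch backend, opset 11

optimized = optimize(output)     # writes a "-optimized" copy next to the original
quantized = quantize(optimized)  # quantization works best on the optimized graph

for path in (output, optimized, quantized):
    verify(path)
```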
From 0034a1d248e1053dad743dc02c994bbe37a743af Mon Sep 17 00:00:00 2001
From: Prajjwal Bhargava
Date: Fri, 31 Jul 2020 13:53:29 +0530
Subject: [PATCH 068/127] Add Pytorch Native AMP support in Trainer (#6151)
* fixed type; add Pytorch Native CUDA AMP support
* reverted commit on modeling_utils
* confirming to HF black formatting rule
* changed bool value of _use_apex
* scaler support for gradient clipping
* fix inplace operation of clip_grad_norm
* removed not while version comparison
---
src/transformers/trainer.py | 46 +++++++++++++++++++++++++++++--------
1 file changed, 37 insertions(+), 9 deletions(-)
diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index e2be9f5a7fcc..a0bdf30b78db 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -19,7 +19,7 @@
from tqdm.auto import tqdm, trange
from .data.data_collator import DataCollator, default_data_collator
-from .file_utils import is_apex_available, is_torch_tpu_available
+from .file_utils import is_torch_tpu_available
from .modeling_utils import PreTrainedModel
from .optimization import AdamW, get_linear_schedule_with_warmup
from .trainer_utils import (
@@ -33,8 +33,19 @@
from .training_args import TrainingArguments
-if is_apex_available():
- from apex import amp
+_use_native_amp = False
+_use_apex = False
+
+# Check if Pytorch version >= 1.6 to switch between Native AMP and Apex
+if version.parse(torch.__version__) < version.parse("1.6"):
+ from transformers.file_utils import is_apex_available
+
+ if is_apex_available():
+ from apex import amp
+ _use_apex = True
+else:
+ _use_native_amp = True
+ from torch.cuda.amp import autocast
if is_torch_tpu_available():
@@ -225,6 +236,8 @@ def __init__(
),
FutureWarning,
)
+ if self.args.fp16 and _use_native_amp:
+ self.scaler = torch.cuda.amp.GradScaler()
def _get_train_sampler(self) -> Optional[torch.utils.data.sampler.Sampler]:
if isinstance(self.train_dataset, torch.utils.data.IterableDataset):
@@ -428,7 +441,7 @@ def train(self, model_path: Optional[str] = None):
scheduler.load_state_dict(torch.load(os.path.join(model_path, "scheduler.pt")))
model = self.model
- if self.args.fp16:
+ if self.args.fp16 and _use_apex:
if not is_apex_available():
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
model, optimizer = amp.initialize(model, optimizer, opt_level=self.args.fp16_opt_level)
@@ -525,13 +538,20 @@ def train(self, model_path: Optional[str] = None):
len(epoch_iterator) <= self.args.gradient_accumulation_steps
and (step + 1) == len(epoch_iterator)
):
- if self.args.fp16:
+ if self.args.fp16 and _use_native_amp:
+ self.scaler.unscale_(optimizer)
+ torch.nn.utils.clip_grad_norm_(model.parameters(), self.args.max_grad_norm)
+ elif self.args.fp16 and _use_apex:
torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), self.args.max_grad_norm)
else:
torch.nn.utils.clip_grad_norm_(model.parameters(), self.args.max_grad_norm)
if is_torch_tpu_available():
xm.optimizer_step(optimizer)
+
+ if self.args.fp16 and _use_native_amp:
+ self.scaler.step(optimizer)
+ self.scaler.update()
else:
optimizer.step()
@@ -697,19 +717,27 @@ def training_step(
model.train()
inputs = self._prepare_inputs(inputs, model)
- outputs = model(**inputs)
- # We don't use .loss here since the model may return tuples instead of ModelOutput.
- loss = outputs[0]
+ if self.args.fp16 and _use_native_amp:
+ with autocast():
+ outputs = model(**inputs)
+ loss = outputs[0]
+ else:
+ outputs = model(**inputs)
+ # We don't use .loss here since the model may return tuples instead of ModelOutput.
+ loss = outputs[0]
if self.args.past_index >= 0:
self._past = outputs[self.args.past_index]
if self.args.n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu parallel training
+
if self.args.gradient_accumulation_steps > 1:
loss = loss / self.args.gradient_accumulation_steps
- if self.args.fp16:
+ if self.args.fp16 and _use_native_amp:
+ self.scaler.scale(loss).backward()
+ elif self.args.fp16 and _use_apex:
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
else:
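Outside of the Trainer, the native-AMP pattern used above boils down to ``autocast`` for the forward pass plus a ``GradScaler`` around backward, clipping and the optimizer step; a standalone sketch assuming PyTorch >= 1.6 and an available CUDA device:

```python
import torch

model = torch.nn.Linear(10, 2).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
scaler = torch.cuda.amp.GradScaler()

for _ in range(10):
    inputs = torch.randn(8, 10, device="cuda")
    targets = torch.randint(0, 2, (8,), device="cuda")

    optimizer.zero_grad()
    with torch.cuda.amp.autocast():
        loss = torch.nn.functional.cross_entropy(model(inputs), targets)

    scaler.scale(loss).backward()
    scaler.unscale_(optimizer)  # unscale before clipping, as in the diff above
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    scaler.step(optimizer)
    scaler.update()
```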
From 7d50af4b020eaf1456468cc1f202996ac638d12a Mon Sep 17 00:00:00 2001
From: kolk
Date: Fri, 31 Jul 2020 13:58:35 +0530
Subject: [PATCH 069/127] Create README.md (#6169)
---
.../deepset/minilm-uncased-squad2/README.md | 118 ++++++++++++++++++
1 file changed, 118 insertions(+)
create mode 100644 model_cards/deepset/minilm-uncased-squad2/README.md
diff --git a/model_cards/deepset/minilm-uncased-squad2/README.md b/model_cards/deepset/minilm-uncased-squad2/README.md
new file mode 100644
index 000000000000..ad5b46a8bfef
--- /dev/null
+++ b/model_cards/deepset/minilm-uncased-squad2/README.md
@@ -0,0 +1,118 @@
+---
+datasets:
+- squad_v2
+---
+
+# MiniLM-L12-H384-uncased for QA
+
+## Overview
+**Language model:** microsoft/MiniLM-L12-H384-uncased
+**Language:** English
+**Downstream-task:** Extractive QA
+**Training data:** SQuAD 2.0
+**Eval data:** SQuAD 2.0
+**Code:** See [example](https://github.com/deepset-ai/FARM/blob/master/examples/question_answering.py) in [FARM](https://github.com/deepset-ai/FARM/blob/master/examples/question_answering.py)
+**Infrastructure**: 1x Tesla v100
+
+## Hyperparameters
+
+```
+seed=42
+batch_size = 12
+n_epochs = 4
+base_LM_model = "microsoft/MiniLM-L12-H384-uncased"
+max_seq_len = 384
+learning_rate = 4e-5
+lr_schedule = LinearWarmup
+warmup_proportion = 0.2
+doc_stride=128
+max_query_length=64
+grad_acc_steps=4
+```
+
+## Performance
+Evaluated on the SQuAD 2.0 dev set with the [official eval script](https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/).
+```
+"exact": 76.13071675229513,
+"f1": 79.49786500219953,
+"total": 11873,
+"HasAns_exact": 78.35695006747639,
+"HasAns_f1": 85.10090269418276,
+"HasAns_total": 5928,
+"NoAns_exact": 73.91084945332211,
+"NoAns_f1": 73.91084945332211,
+"NoAns_total": 5945
+```
+
+## Usage
+
+### In Transformers
+```python
+from transformers.pipelines import pipeline
+from transformers.modeling_auto import AutoModelForQuestionAnswering
+from transformers.tokenization_auto import AutoTokenizer
+
+model_name = "deepset/minilm-uncased-squad2"
+
+# a) Get predictions
+nlp = pipeline('question-answering', model=model_name, tokenizer=model_name)
+QA_input = {
+ 'question': 'Why is model conversion important?',
+ 'context': 'The option to convert models between FARM and transformers gives freedom to the user and let people easily switch between frameworks.'
+}
+res = nlp(QA_input)
+
+# b) Load model & tokenizer
+model = AutoModelForQuestionAnswering.from_pretrained(model_name)
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+```
+
+### In FARM
+
+```python
+from farm.modeling.adaptive_model import AdaptiveModel
+from farm.modeling.tokenization import Tokenizer
+from farm.infer import Inferencer
+
+model_name = "deepset/minilm-uncased-squad2"
+
+# a) Get predictions
+nlp = Inferencer.load(model_name, task_type="question_answering")
+QA_input = [{"questions": ["Why is model conversion important?"],
+ "text": "The option to convert models between FARM and transformers gives freedom to the user and let people easily switch between frameworks."}]
+res = nlp.inference_from_dicts(dicts=QA_input)
+
+# b) Load model & tokenizer
+model = AdaptiveModel.convert_from_transformers(model_name, device="cpu", task_type="question_answering")
+tokenizer = Tokenizer.load(model_name)
+```
+
+### In haystack
+For doing QA at scale (i.e. over many documents instead of a single paragraph), you can also load the model in [haystack](https://github.com/deepset-ai/haystack/):
+```python
+reader = FARMReader(model_name_or_path="deepset/minilm-uncased-squad2")
+# or
+reader = TransformersReader(model="deepset/minilm-uncased-squad2",tokenizer="deepset/minilm-uncased-squad2")
+```
+
+
+## Authors
+Vaishali Pal `vaishali.pal [at] deepset.ai`
+Branden Chan: `branden.chan [at] deepset.ai`
+Timo Möller: `timo.moeller [at] deepset.ai`
+Malte Pietsch: `malte.pietsch [at] deepset.ai`
+Tanay Soni: `tanay.soni [at] deepset.ai`
+
+## About us
+
+
+We bring NLP to the industry via open source!
+Our focus: Industry specific language models & large scale QA systems.
+
+Some of our work:
+- [German BERT (aka "bert-base-german-cased")](https://deepset.ai/german-bert)
+- [FARM](https://github.com/deepset-ai/FARM)
+- [Haystack](https://github.com/deepset-ai/haystack/)
+
+Get in touch:
+[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Website](https://deepset.ai)
From f250beb8aac83009c70ff01ae8568384683d0f3c Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Fri, 31 Jul 2020 01:34:46 -0700
Subject: [PATCH 070/127] enable easy checkout switch (#5645)
* enable easy checkout switch
allow having multiple repository checkouts and not needing to remember to rerun 'pip install -e .[dev]' when switching between checkouts and running tests.
* make isort happy
* examples needs one too
---
examples/conftest.py | 11 +++++++++++
tests/conftest.py | 11 +++++++++++
2 files changed, 22 insertions(+)
create mode 100644 examples/conftest.py
create mode 100644 tests/conftest.py
diff --git a/examples/conftest.py b/examples/conftest.py
new file mode 100644
index 000000000000..0a83207cb5bb
--- /dev/null
+++ b/examples/conftest.py
@@ -0,0 +1,11 @@
+# tests directory-specific settings - this file is run automatically
+# by pytest before any tests are run
+
+import sys
+from os.path import abspath, dirname, join
+
+
+# allow having multiple repository checkouts and not needing to remember to rerun
+# 'pip install -e .[dev]' when switching between checkouts and running tests.
+git_repo_path = abspath(join(dirname(dirname(__file__)), "src"))
+sys.path.insert(1, git_repo_path)
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 000000000000..0a83207cb5bb
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,11 @@
+# tests directory-specific settings - this file is run automatically
+# by pytest before any tests are run
+
+import sys
+from os.path import abspath, dirname, join
+
+
+# allow having multiple repository checkouts and not needing to remember to rerun
+# 'pip install -e .[dev]' when switching between checkouts and running tests.
+git_repo_path = abspath(join(dirname(dirname(__file__)), "src"))
+sys.path.insert(1, git_repo_path)
From cf3cf304ca234ce70ffa223d7868e63133acd7ec Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann
Date: Fri, 31 Jul 2020 17:41:14 +0900
Subject: [PATCH 071/127] Replace mecab-python3 with fugashi for Japanese
tokenization (#6086)
* Replace mecab-python3 with fugashi
This replaces mecab-python3 with fugashi for Japanese tokenization. I am
the maintainer of both projects.
Both projects are MeCab wrappers, so the underlying C++ code is the
same. fugashi is the newer wrapper and doesn't use SWIG, so for basic
use of the MeCab API it's easier to use.
This code ensures the use of a version of ipadic installed via pip,
which should make versioning and tracking down issues easier.
fugashi has wheels for Windows, OSX, and Linux, which will help with
issues with installing old versions of mecab-python3 on Windows.
Compared to mecab-python3, because fugashi doesn't use SWIG, it doesn't
require a C++ runtime to be installed on Windows.
In adding this change I removed some code dealing with `cursor`,
`token_start`, and `token_end` variables. These variables didn't seem to
be used for anything, it is unclear to me why they were there.
I ran the tests and they passed, though I couldn't figure out how to run
the slow tests (`--runslow` gave an error) and didn't try testing with
Tensorflow.
* Style fix
* Remove unused variable
Forgot to delete this...
* Adapt doc with install instructions
* Fix typo
Co-authored-by: sgugger
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
---
.circleci/config.yml | 2 +-
docs/source/pretrained_models.rst | 10 +++++----
setup.cfg | 2 +-
setup.py | 4 ++--
.../tokenization_bert_japanese.py | 21 ++++++++-----------
5 files changed, 19 insertions(+), 20 deletions(-)
diff --git a/.circleci/config.yml b/.circleci/config.yml
index 4d76bb6ae27e..100109539b69 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -56,7 +56,7 @@ jobs:
RUN_CUSTOM_TOKENIZERS: yes
steps:
- checkout
- - run: sudo pip install .[mecab,testing]
+ - run: sudo pip install .[ja,testing]
- run: python -m pytest -s ./tests/test_tokenization_bert_japanese.py | tee output.txt
- store_artifacts:
path: ~/transformers/output.txt
diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst
index 27f048dc2504..0ef6c976f1b7 100644
--- a/docs/source/pretrained_models.rst
+++ b/docs/source/pretrained_models.rst
@@ -74,14 +74,16 @@ For a list that includes community-uploaded models, refer to `https://huggingfac
| | | (see `details on dbmdz repository `__). |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``cl-tohoku/bert-base-japanese`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
-| | | | Trained on Japanese text. Text is tokenized with MeCab and WordPiece. |
-| | | | `MeCab `__ is required for tokenization. |
+| | | | Trained on Japanese text. Text is tokenized with MeCab and WordPiece and this requires some extra dependencies, |
+| | | | `fugashi `__ which is a wrapper around `MeCab `__. |
+| | | | Use ``pip install transformers["ja"]`` (or ``pip install -e .["ja"]`` if you install from source) to install them. |
| | | |
| | | (see `details on cl-tohoku repository `__). |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``cl-tohoku/bert-base-japanese-whole-word-masking`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
-| | | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized with MeCab and WordPiece. |
-| | | | `MeCab `__ is required for tokenization. |
+| | | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized with MeCab and WordPiece and this requires some extra dependencies, |
+| | | | `fugashi `__ which is a wrapper around `MeCab `__. |
+| | | | Use ``pip install transformers["ja"]`` (or ``pip install -e .["ja"]`` if you install from source) to install them. |
| | | |
| | | (see `details on cl-tohoku repository `__). |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
diff --git a/setup.cfg b/setup.cfg
index d630f0b3a9f9..e5467ab62302 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -10,10 +10,10 @@ known_third_party =
faiss
fastprogress
fire
+ fugashi
git
h5py
matplotlib
- MeCab
nlp
nltk
numpy
diff --git a/setup.py b/setup.py
index c4fc91ab3170..206c3e354096 100644
--- a/setup.py
+++ b/setup.py
@@ -65,7 +65,7 @@
extras = {}
-extras["mecab"] = ["mecab-python3<1"]
+extras["ja"] = ["fugashi>=1.0", "ipadic>=1.0,<2.0"]
extras["sklearn"] = ["scikit-learn"]
# keras2onnx and onnxconverter-common version is specific through a commit until 1.7.0 lands on pypi
@@ -97,7 +97,7 @@
"isort @ git+git://github.com/timothycrosley/isort.git@e63ae06ec7d70b06df9e528357650281a3d3ec22#egg=isort",
"flake8",
]
-extras["dev"] = extras["testing"] + extras["quality"] + ["mecab-python3<1", "scikit-learn", "tensorflow", "torch"]
+extras["dev"] = extras["testing"] + extras["quality"] + extras["ja"] + ["scikit-learn", "tensorflow", "torch"]
setup(
name="transformers",
diff --git a/src/transformers/tokenization_bert_japanese.py b/src/transformers/tokenization_bert_japanese.py
index 43ae8e18945b..c3ede2c47e5b 100644
--- a/src/transformers/tokenization_bert_japanese.py
+++ b/src/transformers/tokenization_bert_japanese.py
@@ -185,9 +185,14 @@ def __init__(self, do_lower_case=False, never_split=None, normalize_text=True, m
self.never_split = never_split if never_split is not None else []
self.normalize_text = normalize_text
- import MeCab
+ import fugashi
+ import ipadic
- self.mecab = MeCab.Tagger(mecab_option) if mecab_option is not None else MeCab.Tagger()
+ # Use ipadic by default (later options can override it)
+ mecab_option = mecab_option or ""
+ mecab_option = ipadic.MECAB_ARGS + " " + mecab_option
+
+ self.mecab = fugashi.GenericTagger(mecab_option)
def tokenize(self, text, never_split=None, **kwargs):
"""Tokenizes a piece of text."""
@@ -197,21 +202,13 @@ def tokenize(self, text, never_split=None, **kwargs):
never_split = self.never_split + (never_split if never_split is not None else [])
tokens = []
- mecab_output = self.mecab.parse(text)
-
- cursor = 0
- for line in mecab_output.split("\n"):
- if line == "EOS":
- break
+ for word in self.mecab(text):
+ token = word.surface
- token, _ = line.split("\t")
- token_start = text.index(token, cursor)
- token_end = token_start + len(token)
if self.do_lower_case and token not in never_split:
token = token.lower()
tokens.append(token)
- cursor = token_end
return tokens
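For readers unfamiliar with fugashi, the new tokenization path reduces to the following sketch; the sample sentence is only illustrative:

```python
import fugashi
import ipadic

# ipadic supplies the dictionary arguments; fugashi wraps the MeCab tagger.
tagger = fugashi.GenericTagger(ipadic.MECAB_ARGS)

text = "こんにちは、世界。"  # "Hello, world."
tokens = [word.surface for word in tagger(text)]
print(tokens)
```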
From 838dc06ff5a438159ac25f531d622e8f344476f5 Mon Sep 17 00:00:00 2001
From: Suraj Patil
Date: Fri, 31 Jul 2020 14:14:23 +0530
Subject: [PATCH 072/127] parse arguments from dict (#4869)
* add parse_dict to parse arguments from dict
* add unit test for parse_dict
---
src/transformers/hf_argparser.py | 13 +++++++++++++
tests/test_hf_argparser.py | 14 ++++++++++++++
2 files changed, 27 insertions(+)
diff --git a/src/transformers/hf_argparser.py b/src/transformers/hf_argparser.py
index 7d3e2d02e51c..6c4e3f204b72 100644
--- a/src/transformers/hf_argparser.py
+++ b/src/transformers/hf_argparser.py
@@ -158,3 +158,16 @@ def parse_json_file(self, json_file: str) -> Tuple[DataClass, ...]:
obj = dtype(**inputs)
outputs.append(obj)
return (*outputs,)
+
+ def parse_dict(self, args: dict) -> Tuple[DataClass, ...]:
+ """
+ Alternative helper method that does not use `argparse` at all,
+ instead using a dict to populate the dataclass types.
+ """
+ outputs = []
+ for dtype in self.dataclass_types:
+ keys = {f.name for f in dataclasses.fields(dtype)}
+ inputs = {k: v for k, v in args.items() if k in keys}
+ obj = dtype(**inputs)
+ outputs.append(obj)
+ return (*outputs,)
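A small usage sketch of the new `parse_dict` helper, with a hypothetical dataclass defined only for illustration:

```python
from dataclasses import dataclass

from transformers import HfArgumentParser


@dataclass
class ExampleArguments:
    foo: int = 1
    bar: float = 3.14


parser = HfArgumentParser(ExampleArguments)
(args,) = parser.parse_dict({"foo": 12, "bar": 2.72})
print(args.foo, args.bar)
```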
diff --git a/tests/test_hf_argparser.py b/tests/test_hf_argparser.py
index a3bda37a554a..3c219d0b6f3a 100644
--- a/tests/test_hf_argparser.py
+++ b/tests/test_hf_argparser.py
@@ -152,6 +152,20 @@ def test_with_optional(self):
args = parser.parse_args("--foo 12 --bar 3.14 --baz 42 --ces a b c --des 1 2 3".split())
self.assertEqual(args, Namespace(foo=12, bar=3.14, baz="42", ces=["a", "b", "c"], des=[1, 2, 3]))
+ def test_parse_dict(self):
+ parser = HfArgumentParser(BasicExample)
+
+ args_dict = {
+ "foo": 12,
+ "bar": 3.14,
+ "baz": "42",
+ "flag": True,
+ }
+
+ parsed_args = parser.parse_dict(args_dict)[0]
+ args = BasicExample(**args_dict)
+ self.assertEqual(parsed_args, args)
+
def test_integration_training_args(self):
parser = HfArgumentParser(TrainingArguments)
self.assertIsNotNone(parser)
From 603cd81a01003e1d2be9357f4e517d1686df017f Mon Sep 17 00:00:00 2001
From: Mehrdad Farahani
Date: Fri, 31 Jul 2020 12:19:06 +0200
Subject: [PATCH 073/127] readme m3hrdadfi/albert-fa-base-v2 (#6153)
* readme m3hrdadfi/albert-fa-base-v2
model_card readme for m3hrdadfi/albert-fa-base-v2
* Update model_cards/m3hrdadfi/albert-fa-base-v2/README.md
Co-authored-by: Julien Chaumond
---
.../m3hrdadfi/albert-fa-base-v2/README.md | 161 ++++++++++++++++++
1 file changed, 161 insertions(+)
create mode 100644 model_cards/m3hrdadfi/albert-fa-base-v2/README.md
diff --git a/model_cards/m3hrdadfi/albert-fa-base-v2/README.md b/model_cards/m3hrdadfi/albert-fa-base-v2/README.md
new file mode 100644
index 000000000000..1d4e8ab7be73
--- /dev/null
+++ b/model_cards/m3hrdadfi/albert-fa-base-v2/README.md
@@ -0,0 +1,161 @@
+---
+language: fa
+tags:
+- albert-persian
+- persian-lm
+license: apache-2.0
+datasets:
+- Persian Wikidumps
+- MirasText
+- BigBang Page
+- Chetor
+- Eligasht
+- DigiMag
+- Ted Talks
+- Books (Novels, ...)
+---
+
+# ALBERT-Persian
+
+## ALBERT-Persian: A Lite BERT for Self-supervised Learning of Language Representations for the Persian Language
+
+## Introduction
+
+ALBERT-Persian was trained on a massive amount of public corpora ([Persian Wikidumps](https://dumps.wikimedia.org/fawiki/), [MirasText](https://github.com/miras-tech/MirasText)) and six other manually crawled text sources from various types of websites ([BigBang Page](https://bigbangpage.com/) `scientific`, [Chetor](https://www.chetor.com/) `lifestyle`, [Eligasht](https://www.eligasht.com/Blog/) `itinerary`, [Digikala](https://www.digikala.com/mag/) `digital magazine`, [Ted Talks](https://www.ted.com/talks) `general conversational`, Books `novels, storybooks, short stories from old to the contemporary era`).
+
+
+
+## Intended uses & limitations
+
+You can use the raw model for either masked language modeling or next sentence prediction, but it's mostly intended to
+be fine-tuned on a downstream task. See the [model hub](https://huggingface.co/models?search=albert-fa) to look for
+fine-tuned versions on a task that interests you.
+
+
+### How to use
+
+#### TensorFlow 2.0
+
+```python
+from transformers import AutoConfig, AutoTokenizer, TFAutoModel
+
+config = AutoConfig.from_pretrained("m3hrdadfi/albert-fa-base-v2")
+tokenizer = AutoTokenizer.from_pretrained("m3hrdadfi/albert-fa-base-v2")
+model = TFAutoModel.from_pretrained("m3hrdadfi/albert-fa-base-v2")
+
+text = "ما در هوشواره معتقدیم با انتقال صحیح دانش و آگاهی، همه افراد می‌توانند از ابزارهای هوشمند استفاده کنند. شعار ما هوش مصنوعی برای همه است."
+tokenizer.tokenize(text)
+
+>>> ['▁ما', '▁در', '▁هوش', 'واره', '▁معتقد', 'یم', '▁با', '▁انتقال', '▁صحیح', '▁دانش', '▁و', '▁اگاه', 'ی', '،', '▁همه', '▁افراد', '▁می', '▁توانند', '▁از', '▁ابزارهای', '▁هوشمند', '▁استفاده', '▁کنند', '.', '▁شعار', '▁ما', '▁هوش', '▁مصنوعی', '▁برای', '▁همه', '▁است', '.']
+
+```
+
+#### Pytorch
+
+```python
+from transformers import AutoConfig, AutoTokenizer, AutoModel
+
+config = AutoConfig.from_pretrained("m3hrdadfi/albert-fa-base-v2")
+tokenizer = AutoTokenizer.from_pretrained("m3hrdadfi/albert-fa-base-v2")
+model = AutoModel.from_pretrained("m3hrdadfi/albert-fa-base-v2")
+```
+
+## Training
+
+ALBERT-Persian is the first attempt at ALBERT for the Persian language. The model was trained following Google's ALBERT BASE Version 2.0 over various writing styles from numerous subjects (e.g., scientific, novels, news) with more than `3.9M` documents, `73M` sentences, and `1.3B` words, in the same way we did for [ParsBERT](https://github.com/hooshvare/parsbert).
+
+## Goals
+The training objective values (after 140K steps) are as below.
+
+``` bash
+***** Eval results *****
+global_step = 140000
+loss = 2.0080082
+masked_lm_accuracy = 0.6141017
+masked_lm_loss = 1.9963315
+sentence_order_accuracy = 0.985
+sentence_order_loss = 0.06908702
+```
+
+
+## Derivative models
+
+### Base Config
+
+#### Albert Model
+- [m3hrdadfi/albert-fa-base-v2](https://huggingface.co/m3hrdadfi/albert-fa-base-v2)
+
+#### Albert Sentiment Analysis
+- [m3hrdadfi/albert-fa-base-v2-sentiment-digikala](https://huggingface.co/m3hrdadfi/albert-fa-base-v2-sentiment-digikala)
+- [m3hrdadfi/albert-fa-base-v2-sentiment-snappfood](https://huggingface.co/m3hrdadfi/albert-fa-base-v2-sentiment-snappfood)
+- [m3hrdadfi/albert-fa-base-v2-sentiment-deepsentipers-binary](https://huggingface.co/m3hrdadfi/albert-fa-base-v2-sentiment-deepsentipers-binary)
+- [m3hrdadfi/albert-fa-base-v2-sentiment-deepsentipers-multi](https://huggingface.co/m3hrdadfi/albert-fa-base-v2-sentiment-deepsentipers-multi)
+- [m3hrdadfi/albert-fa-base-v2-sentiment-binary](https://huggingface.co/m3hrdadfi/albert-fa-base-v2-sentiment-binary)
+- [m3hrdadfi/albert-fa-base-v2-sentiment-multi](https://huggingface.co/m3hrdadfi/albert-fa-base-v2-sentiment-multi)
+
+#### Albert Text Classification
+- [m3hrdadfi/albert-fa-base-v2-clf-digimag](https://huggingface.co/m3hrdadfi/albert-fa-base-v2-clf-digimag)
+- [m3hrdadfi/albert-fa-base-v2-clf-persiannews](https://huggingface.co/m3hrdadfi/albert-fa-base-v2-clf-persiannews)
+
+#### Albert NER
+- [m3hrdadfi/albert-fa-base-v2-ner](https://huggingface.co/m3hrdadfi/albert-fa-base-v2-ner)
+- [m3hrdadfi/albert-fa-base-v2-ner-arman](https://huggingface.co/m3hrdadfi/albert-fa-base-v2-ner-arman)
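+
+Similarly, the NER checkpoints can be used with the token-classification (`ner`) pipeline; a minimal sketch (the sentence is illustrative, and the entity labels depend on the dataset the model was fine-tuned on):
+
+```python
+from transformers import pipeline
+
+# a minimal sketch: tag entities in an illustrative Persian sentence
+ner = pipeline(
+    "ner",
+    model="m3hrdadfi/albert-fa-base-v2-ner-arman",
+    tokenizer="m3hrdadfi/albert-fa-base-v2-ner-arman",
+)
+print(ner("تیم هوشواره در تهران مستقر است."))
+```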
+
+## Eval results
+
+The following tables summarize the F1 scores obtained by ALBERT-Persian as compared to other models and architectures.
+
+
+### Sentiment Analysis (SA) Task
+
+| Dataset | ALBERT-fa-base-v2 | ParsBERT-v1 | mBERT | DeepSentiPers |
+|:------------------------:|:-----------------:|:-----------:|:-----:|:-------------:|
+| Digikala User Comments | 81.12 | 81.74 | 80.74 | - |
+| SnappFood User Comments | 85.79 | 88.12 | 87.87 | - |
+| SentiPers (Multi Class) | 66.12 | 71.11 | - | 69.33 |
+| SentiPers (Binary Class) | 91.09 | 92.13 | - | 91.98 |
+
+
+### Text Classification (TC) Task
+
+| Dataset | ALBERT-fa-base-v2 | ParsBERT-v1 | mBERT |
+|:-----------------:|:-----------------:|:-----------:|:-----:|
+| Digikala Magazine | 92.33 | 93.59 | 90.72 |
+| Persian News | 97.01 | 97.19 | 95.79 |
+
+
+### Named Entity Recognition (NER) Task
+
+| Dataset | ALBERT-fa-base-v2 | ParsBERT-v1 | mBERT | MorphoBERT | Beheshti-NER | LSTM-CRF | Rule-Based CRF | BiLSTM-CRF |
+|:-------:|:-----------------:|:-----------:|:-----:|:----------:|:------------:|:--------:|:--------------:|:----------:|
+| PEYMA | 88.99 | 93.10 | 86.64 | - | 90.59 | - | 84.00 | - |
+| ARMAN | 97.43 | 98.79 | 95.89 | 89.9 | 84.03 | 86.55 | - | 77.45 |
+
+
+### BibTeX entry and citation info
+
+Please cite as follows in publications:
+
+```bibtex
+@misc{ALBERT-Persian,
+ author = {Mehrdad Farahani},
+ title = {ALBERT-Persian: A Lite BERT for Self-supervised Learning of Language Representations for the Persian Language},
+ year = {2020},
+ publisher = {GitHub},
+ journal = {GitHub repository},
+ howpublished = {\url{https://github.com/m3hrdadfi/albert-persian}},
+}
+
+@article{ParsBERT,
+ title={ParsBERT: Transformer-based Model for Persian Language Understanding},
+ author={Mehrdad Farahani and Mohammad Gharachorloo and Marzieh Farahani and Mohammad Manthouri},
+ journal={ArXiv},
+ year={2020},
+ volume={abs/2005.12515}
+}
+```
+
+## Questions?
+Post a GitHub issue on the [ALBERT-Persian](https://github.com/m3hrdadfi/albert-persian) repo.
From 86caab1e0bc2078af442c9e2655cf68cd429d75f Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Date: Fri, 31 Jul 2020 09:43:23 -0400
Subject: [PATCH 074/127] Harmonize both Trainers API (#6157)
* Harmonize both Trainers API
* Fix test
* main_prcess -> process_zero
---
docs/source/main_classes/trainer.rst | 17 +++
src/transformers/trainer.py | 175 ++++++++++++++-------------
src/transformers/trainer_tf.py | 30 ++---
3 files changed, 125 insertions(+), 97 deletions(-)
diff --git a/docs/source/main_classes/trainer.rst b/docs/source/main_classes/trainer.rst
index e5687de4692d..55b308a74e07 100644
--- a/docs/source/main_classes/trainer.rst
+++ b/docs/source/main_classes/trainer.rst
@@ -11,6 +11,23 @@ customization during training.
The API supports distributed training on multiple GPUs/TPUs, mixed precision through `NVIDIA Apex
`__ for PyTorch and :obj:`tf.keras.mixed_precision` for TensorFlow.
+Both :class:`~transformers.Trainer` and :class:`~transformers.TFTrainer` contain the basic training loop supporting the
+previous features. To inject custom behavior you can subclass them and override the following methods (a short sketch follows the list):
+
+- **get_train_dataloader**/**get_train_tfdataset** -- Creates the training DataLoader (PyTorch) or TF Dataset.
+- **get_eval_dataloader**/**get_eval_tfdataset** -- Creates the evaluation DataLoader (PyTorch) or TF Dataset.
+- **get_test_dataloader**/**get_test_tfdataset** -- Creates the test DataLoader (PyTorch) or TF Dataset.
+- **log** -- Logs information on the various objects watching training.
+- **setup_wandb** -- Sets up wandb (see `here `__ for more information).
+- **create_optimizer_and_scheduler** -- Sets up the optimizer and learning rate scheduler if they were not passed at
+ init.
+- **training_step** -- Performs a training step.
+- **prediction_step** -- Performs an evaluation/test step.
+- **run_model** (TensorFlow only) -- Basic pass through the model.
+- **evaluate** -- Runs an evaluation loop and returns metrics.
+- **predict** -- Returns predictions (with metrics if labels are available) on a test set.
+
+
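+For illustration only, a minimal subclass that customizes logging could look like the following sketch (the
+``custom/`` prefix is just an example, not part of the library):
+
+.. code-block:: python
+
+    from transformers import Trainer
+
+    class MyTrainer(Trainer):
+        def log(self, logs, iterator=None):
+            # prefix every metric name before delegating to the default logging
+            super().log({f"custom/{k}": v for k, v in logs.items()}, iterator=iterator)
+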
``Trainer``
~~~~~~~~~~~
diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index a0bdf30b78db..f449dd138b17 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -172,18 +172,6 @@ class Trainer:
:func:`~transformers.get_linear_schedule_with_warmup` controlled by :obj:`args`.
"""
- model: PreTrainedModel
- args: TrainingArguments
- data_collator: DataCollator
- train_dataset: Optional[Dataset]
- eval_dataset: Optional[Dataset]
- compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None
- prediction_loss_only: bool
- tb_writer: Optional["SummaryWriter"] = None
- optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = None
- global_step: Optional[int] = None
- epoch: Optional[float] = None
-
def __init__(
self,
model: PreTrainedModel,
@@ -194,7 +182,7 @@ def __init__(
compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
prediction_loss_only=False,
tb_writer: Optional["SummaryWriter"] = None,
- optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = None,
+ optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
):
self.model = model.to(args.device)
self.args = args
@@ -203,10 +191,9 @@ def __init__(
self.eval_dataset = eval_dataset
self.compute_metrics = compute_metrics
self.prediction_loss_only = prediction_loss_only
- self.optimizers = optimizers
- if tb_writer is not None:
- self.tb_writer = tb_writer
- elif is_tensorboard_available() and self.is_world_master():
+ self.optimizer, self.lr_scheduler = optimizers
+ self.tb_writer = tb_writer
+ if tb_writer is None and is_tensorboard_available() and self.is_world_process_zero():
self.tb_writer = SummaryWriter(log_dir=self.args.logging_dir)
if not is_tensorboard_available():
logger.warning(
@@ -221,7 +208,7 @@ def __init__(
)
set_seed(self.args.seed)
# Create output directory if needed
- if self.is_world_master():
+ if self.is_world_process_zero():
os.makedirs(self.args.output_dir, exist_ok=True)
if is_torch_tpu_available():
# Set an xla_device flag on the model's config.
@@ -236,6 +223,8 @@ def __init__(
),
FutureWarning,
)
+ self.global_step = None
+ self.epoch = None
if self.args.fp16 and _use_native_amp:
self.scaler = torch.cuda.amp.GradScaler()
@@ -333,39 +322,35 @@ def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader:
drop_last=self.args.dataloader_drop_last,
)
- def get_optimizers(
- self, num_training_steps: int
- ) -> Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]:
+ def create_optimizer_and_scheduler(self, num_training_steps: int):
"""
Setup the optimizer and the learning rate scheduler.
We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
Trainer's init through :obj:`optimizers`, or subclass and override this method in a subclass.
"""
- if self.optimizers is not None:
- return self.optimizers
- # Prepare optimizer and schedule (linear warmup and decay)
- no_decay = ["bias", "LayerNorm.weight"]
- optimizer_grouped_parameters = [
- {
- "params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
- "weight_decay": self.args.weight_decay,
- },
- {
- "params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
- "weight_decay": 0.0,
- },
- ]
- optimizer = AdamW(
- optimizer_grouped_parameters,
- lr=self.args.learning_rate,
- betas=(self.args.adam_beta1, self.args.adam_beta2),
- eps=self.args.adam_epsilon,
- )
- scheduler = get_linear_schedule_with_warmup(
- optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=num_training_steps
- )
- return optimizer, scheduler
+ if self.optimizer is None:
+ no_decay = ["bias", "LayerNorm.weight"]
+ optimizer_grouped_parameters = [
+ {
+ "params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
+ "weight_decay": self.args.weight_decay,
+ },
+ {
+ "params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
+ "weight_decay": 0.0,
+ },
+ ]
+ self.optimizer = AdamW(
+ optimizer_grouped_parameters,
+ lr=self.args.learning_rate,
+ betas=(self.args.adam_beta1, self.args.adam_beta2),
+ eps=self.args.adam_epsilon,
+ )
+ if self.lr_scheduler is None:
+ self.lr_scheduler = get_linear_schedule_with_warmup(
+ self.optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=num_training_steps
+ )
def setup_wandb(self):
"""
@@ -390,7 +375,7 @@ def setup_wandb(self):
)
return self._setup_wandb()
- if self.is_world_master():
+ if self.is_world_process_zero():
logger.info(
'Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"'
)
@@ -426,7 +411,7 @@ def train(self, model_path: Optional[str] = None):
t_total = int(len(train_dataloader) // self.args.gradient_accumulation_steps * self.args.num_train_epochs)
num_train_epochs = self.args.num_train_epochs
- optimizer, scheduler = self.get_optimizers(num_training_steps=t_total)
+ self.create_optimizer_and_scheduler(num_training_steps=t_total)
# Check if saved optimizer or scheduler states exist
if (
@@ -435,16 +420,16 @@ def train(self, model_path: Optional[str] = None):
and os.path.isfile(os.path.join(model_path, "scheduler.pt"))
):
# Load in optimizer and scheduler states
- optimizer.load_state_dict(
+ self.optimizer.load_state_dict(
torch.load(os.path.join(model_path, "optimizer.pt"), map_location=self.args.device)
)
- scheduler.load_state_dict(torch.load(os.path.join(model_path, "scheduler.pt")))
+ self.lr_scheduler.load_state_dict(torch.load(os.path.join(model_path, "scheduler.pt")))
model = self.model
if self.args.fp16 and _use_apex:
if not is_apex_available():
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
- model, optimizer = amp.initialize(model, optimizer, opt_level=self.args.fp16_opt_level)
+ model, self.optimizer = amp.initialize(model, self.optimizer, opt_level=self.args.fp16_opt_level)
# multi-gpu training (should be after apex fp16 initialization)
if self.args.n_gpu > 1:
@@ -506,7 +491,7 @@ def train(self, model_path: Optional[str] = None):
logging_loss = 0.0
model.zero_grad()
train_iterator = trange(
- epochs_trained, int(num_train_epochs), desc="Epoch", disable=not self.is_local_master()
+ epochs_trained, int(num_train_epochs), desc="Epoch", disable=not self.is_local_process_zero()
)
for epoch in train_iterator:
if isinstance(train_dataloader, DataLoader) and isinstance(train_dataloader.sampler, DistributedSampler):
@@ -516,9 +501,9 @@ def train(self, model_path: Optional[str] = None):
parallel_loader = pl.ParallelLoader(train_dataloader, [self.args.device]).per_device_loader(
self.args.device
)
- epoch_iterator = tqdm(parallel_loader, desc="Iteration", disable=not self.is_local_master())
+ epoch_iterator = tqdm(parallel_loader, desc="Iteration", disable=not self.is_local_process_zero())
else:
- epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=not self.is_local_master())
+ epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=not self.is_local_process_zero())
# Reset the past mems state at the beginning of each epoch if necessary.
if self.args.past_index >= 0:
@@ -531,7 +516,7 @@ def train(self, model_path: Optional[str] = None):
steps_trained_in_current_epoch -= 1
continue
- tr_loss += self.training_step(model, inputs, optimizer)
+ tr_loss += self.training_step(model, inputs)
if (step + 1) % self.args.gradient_accumulation_steps == 0 or (
# last step in epoch but step is always smaller than gradient_accumulation_steps
@@ -539,23 +524,22 @@ def train(self, model_path: Optional[str] = None):
and (step + 1) == len(epoch_iterator)
):
if self.args.fp16 and _use_native_amp:
- self.scaler.unscale_(optimizer)
+ self.scaler.unscale_(self.optimizer)
torch.nn.utils.clip_grad_norm_(model.parameters(), self.args.max_grad_norm)
elif self.args.fp16 and _use_apex:
- torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), self.args.max_grad_norm)
+ torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), self.args.max_grad_norm)
else:
torch.nn.utils.clip_grad_norm_(model.parameters(), self.args.max_grad_norm)
if is_torch_tpu_available():
- xm.optimizer_step(optimizer)
-
+ xm.optimizer_step(self.optimizer)
if self.args.fp16 and _use_native_amp:
- self.scaler.step(optimizer)
+ self.scaler.step(self.optimizer)
self.scaler.update()
else:
- optimizer.step()
+ self.optimizer.step()
- scheduler.step()
+ self.lr_scheduler.step()
model.zero_grad()
self.global_step += 1
self.epoch = epoch + (step + 1) / len(epoch_iterator)
@@ -567,9 +551,9 @@ def train(self, model_path: Optional[str] = None):
logs["loss"] = (tr_loss - logging_loss) / self.args.logging_steps
# backward compatibility for pytorch schedulers
logs["learning_rate"] = (
- scheduler.get_last_lr()[0]
+ self.lr_scheduler.get_last_lr()[0]
if version.parse(torch.__version__) >= version.parse("1.4")
- else scheduler.get_lr()[0]
+ else self.lr_scheduler.get_lr()[0]
)
logging_loss = tr_loss
@@ -590,16 +574,16 @@ def train(self, model_path: Optional[str] = None):
self.save_model(output_dir)
- if self.is_world_master():
+ if self.is_world_process_zero():
self._rotate_checkpoints()
if is_torch_tpu_available():
xm.rendezvous("saving_optimizer_states")
- xm.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
- xm.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
- elif self.is_world_master():
- torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
- torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
+ xm.save(self.optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
+ xm.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
+ elif self.is_world_process_zero():
+ torch.save(self.optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
+ torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
if self.args.max_steps > 0 and self.global_step > self.args.max_steps:
epoch_iterator.close()
@@ -660,7 +644,7 @@ def log(self, logs: Dict[str, float], iterator: Optional[tqdm] = None) -> None:
)
self.tb_writer.flush()
if is_wandb_available():
- if self.is_world_master():
+ if self.is_world_process_zero():
wandb.log(logs, step=self.global_step)
output = {**logs, **{"step": self.global_step}}
if iterator is not None:
@@ -684,11 +668,9 @@ def _prepare_inputs(
return inputs
- def training_step(
- self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]], optimizer: torch.optim.Optimizer
- ) -> float:
+ def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> float:
"""
- Perform a training step on :obj:`model` using obj:`inputs` and :obj:`optimizer`.
+ Perform a training step on a batch of inputs.
Subclass and override to inject custom behavior.
@@ -700,19 +682,16 @@ def training_step(
The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
argument :obj:`labels`. Check your model's documentation for all accepted arguments.
- optimizer (:obj:`torch.optim.Optimizer`):
- The optimizer to use to make a step.
Return:
- `float`:
- The training loss on this batch.
+ :obj:`float`: The training loss on this batch.
"""
if hasattr(self, "_training_step"):
warnings.warn(
"The `_training_step` method is deprecated and won't be called in a future version, define `training_step` in your subclass.",
FutureWarning,
)
- return self._training_step(model, inputs, optimizer)
+ return self._training_step(model, inputs, self.optimizer)
model.train()
inputs = self._prepare_inputs(inputs, model)
@@ -738,7 +717,7 @@ def training_step(
if self.args.fp16 and _use_native_amp:
self.scaler.scale(loss).backward()
elif self.args.fp16 and _use_apex:
- with amp.scale_loss(loss, optimizer) as scaled_loss:
+ with amp.scale_loss(loss, self.optimizer) as scaled_loss:
scaled_loss.backward()
else:
loss.backward()
@@ -746,6 +725,22 @@ def training_step(
return loss.item()
def is_local_master(self) -> bool:
+ """
+ Whether or not this process is the local (e.g., on one machine if training in a distributed fashion on
+ several machines) main process.
+
+ .. warning::
+
+ This method is deprecated, use :meth:`~transformers.Trainer.is_local_process_zero` instead.
+ """
+ warnings.warn("This method is deprecated, use `Trainer.is_local_process_zero()` instead.", FutureWarning)
+ return self.is_local_process_zero()
+
+ def is_local_process_zero(self) -> bool:
+ """
+ Whether or not this process is the local (e.g., on one machine if training in a distributed fashion on
+ several machines) main process.
+ """
if is_torch_tpu_available():
return xm.is_master_ordinal(local=True)
else:
@@ -753,8 +748,20 @@ def is_local_master(self) -> bool:
def is_world_master(self) -> bool:
"""
- This will be True only in one process, even in distributed mode,
- even when training on multiple machines.
+ Whether or not this process is the global main process (when training in a distributed fashion on
+ several machines, this is only going to be :obj:`True` for one process).
+
+ .. warning::
+
+ This method is deprecated, use :meth:`~transformers.Trainer.is_world_process_zero` instead.
+ """
+ warnings.warn("This method is deprecated, use `Trainer.is_world_process_zero()` instead.", FutureWarning)
+ return self.is_world_process_zero()
+
+ def is_world_process_zero(self) -> bool:
+ """
+ Whether or not this process is the global main process (when training in a distributed fashion on
+ several machines, this is only going to be :obj:`True` for one process).
"""
if is_torch_tpu_available():
return xm.is_master_ordinal(local=False)
@@ -770,7 +777,7 @@ def save_model(self, output_dir: Optional[str] = None):
if is_torch_tpu_available():
self._save_tpu(output_dir)
- elif self.is_world_master():
+ elif self.is_world_process_zero():
self._save(output_dir)
def _save_tpu(self, output_dir: Optional[str] = None):
@@ -846,6 +853,7 @@ def evaluate(self, eval_dataset: Optional[Dataset] = None) -> Dict[str, float]:
Args:
eval_dataset (:obj:`Dataset`, `optional`):
Pass a dataset if you wish to override :obj:`self.eval_dataset`.
+
Returns:
A dictionary containing the evaluation loss and the potential metrics computed from the predictions.
"""
@@ -871,6 +879,7 @@ def predict(self, test_dataset: Dataset) -> PredictionOutput:
Args:
test_dataset (:obj:`Dataset`):
Dataset to run the predictions on.
+
Returns:
`NamedTuple`:
predictions (:obj:`np.ndarray`):
diff --git a/src/transformers/trainer_tf.py b/src/transformers/trainer_tf.py
index 808582e819e7..03d028994a03 100644
--- a/src/transformers/trainer_tf.py
+++ b/src/transformers/trainer_tf.py
@@ -63,17 +63,6 @@ class TFTrainer:
an instance of :class:`~transformers.WarmUp`.
"""
- model: TFPreTrainedModel
- args: TFTrainingArguments
- train_dataset: Optional[tf.data.Dataset]
- eval_dataset: Optional[tf.data.Dataset]
- compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None
- prediction_loss_only: bool
- tb_writer: Optional[tf.summary.SummaryWriter] = None
- optimizers: Tuple[tf.keras.optimizers.Optimizer, tf.keras.optimizers.schedules.LearningRateSchedule] = (None, None)
- global_step: Optional[int] = None
- epoch_logging: Optional[float] = None
-
def __init__(
self,
model: TFPreTrainedModel,
@@ -325,6 +314,15 @@ def prediction_loop(
return PredictionOutput(predictions=preds, label_ids=label_ids, metrics=metrics)
def log(self, logs: Dict[str, float]) -> None:
+ """
+ Log :obj:`logs` on the various objects watching training.
+
+ Subclass and override this method to inject custom behavior.
+
+ Args:
+ logs (:obj:`Dict[str, float]`):
+ The values to log.
+ """
if hasattr(self, "_log"):
warnings.warn(
"The `_log` method is deprecated and won't be called in a future version, define `log` in your subclass.",
@@ -356,6 +354,7 @@ def evaluate(self, eval_dataset: Optional[tf.data.Dataset] = None) -> Dict[str,
Args:
eval_dataset (:class:`~tf.data.Dataset`, `optional`):
Pass a dataset if you wish to override :obj:`self.eval_dataset`.
+
Returns:
A dictionary containing the evaluation loss and the potential metrics computed from the predictions.
"""
@@ -577,9 +576,12 @@ def run_model(self, features, labels, training):
Subclass and override this method if you want to inject some custom behavior.
Args:
- features: the batched features.
- labels: the batched labels.
- training: run the model in training mode or not
+ features (:obj:`tf.Tensor`): A batch of input features.
+ labels (:obj:`tf.Tensor`): A batch of labels.
+ training (:obj:`bool`): Whether or not to run the model in training mode.
+
+ Returns:
+ A tuple of two :obj:`tf.Tensor`: The loss and logits.
"""
if hasattr(self, "_run_model"):
warnings.warn(
From d951c14ae46ee36b76981588ed6d03ab353ad766 Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Date: Fri, 31 Jul 2020 09:44:37 -0400
Subject: [PATCH 075/127] Model output test (#6155)
* Use return_dict=True in all tests
* Formatting
---
src/transformers/modeling_encoder_decoder.py | 2 +
src/transformers/modeling_openai.py | 7 +-
src/transformers/modeling_reformer.py | 1 +
.../tests/test_modeling_xxx.py | 47 ++------
tests/test_modeling_albert.py | 66 +++-------
tests/test_modeling_bart.py | 28 ++---
tests/test_modeling_bert.py | 113 +++++-------------
tests/test_modeling_camembert.py | 4 +-
tests/test_modeling_common.py | 1 -
tests/test_modeling_ctrl.py | 24 ++--
tests/test_modeling_distilbert.py | 44 ++-----
tests/test_modeling_dpr.py | 32 ++---
tests/test_modeling_electra.py | 60 ++--------
tests/test_modeling_flaubert.py | 77 +++---------
tests/test_modeling_gpt2.py | 43 +++----
tests/test_modeling_longformer.py | 75 +++---------
tests/test_modeling_mbart.py | 5 +-
tests/test_modeling_mobilebert.py | 89 ++++----------
tests/test_modeling_openai.py | 26 ++--
tests/test_modeling_reformer.py | 53 +++-----
tests/test_modeling_roberta.py | 48 ++------
tests/test_modeling_t5.py | 30 +++--
tests/test_modeling_transfo_xl.py | 29 ++---
tests/test_modeling_xlm.py | 75 +++---------
tests/test_modeling_xlm_roberta.py | 8 +-
tests/test_modeling_xlnet.py | 102 +++++-----------
26 files changed, 322 insertions(+), 767 deletions(-)
diff --git a/src/transformers/modeling_encoder_decoder.py b/src/transformers/modeling_encoder_decoder.py
index 3eb92ad8f905..ec98a250d9ed 100644
--- a/src/transformers/modeling_encoder_decoder.py
+++ b/src/transformers/modeling_encoder_decoder.py
@@ -273,6 +273,7 @@ def forward(
attention_mask=attention_mask,
inputs_embeds=inputs_embeds,
head_mask=head_mask,
+ return_dict=False,
**kwargs_encoder,
)
@@ -287,6 +288,7 @@ def forward(
encoder_attention_mask=attention_mask,
head_mask=decoder_head_mask,
labels=labels,
+ return_dict=False,
**kwargs_decoder,
)
diff --git a/src/transformers/modeling_openai.py b/src/transformers/modeling_openai.py
index 04cf8fb8a4f4..e3406bc291e1 100644
--- a/src/transformers/modeling_openai.py
+++ b/src/transformers/modeling_openai.py
@@ -688,16 +688,15 @@ def forward(
lm_logits = self.lm_head(hidden_states)
mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids).squeeze(-1)
- lm_loss = None
+ lm_loss, mc_loss = None, None
if mc_labels is not None:
loss_fct = CrossEntropyLoss()
- lm_loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))
- mc_loss = None
+ mc_loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))
if labels is not None:
shift_logits = lm_logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
loss_fct = CrossEntropyLoss()
- mc_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+ lm_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
if not return_dict:
output = (lm_logits, mc_logits) + transformer_outputs[1:]
diff --git a/src/transformers/modeling_reformer.py b/src/transformers/modeling_reformer.py
index 8109d6b98f91..f9e8ac76d6d1 100644
--- a/src/transformers/modeling_reformer.py
+++ b/src/transformers/modeling_reformer.py
@@ -2386,6 +2386,7 @@ def forward(
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.reformer(
input_ids,
diff --git a/templates/adding_a_new_model/tests/test_modeling_xxx.py b/templates/adding_a_new_model/tests/test_modeling_xxx.py
index 8f522eb22ee2..d81c9a5009a8 100644
--- a/templates/adding_a_new_model/tests/test_modeling_xxx.py
+++ b/templates/adding_a_new_model/tests/test_modeling_xxx.py
@@ -121,6 +121,7 @@ def prepare_config_and_inputs(self):
max_position_embeddings=self.max_position_embeddings,
type_vocab_size=self.type_vocab_size,
initializer_range=self.initializer_range,
+ return_dict=True,
)
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -134,18 +135,13 @@ def create_and_check_xxx_model(
model = XxxModel(config=config)
model.to(torch_device)
model.eval()
- sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
- sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
- sequence_output, pooled_output = model(input_ids)
-
- result = {
- "sequence_output": sequence_output,
- "pooled_output": pooled_output,
- }
+ result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
+ result = model(input_ids, token_type_ids=token_type_ids)
+ result = model(input_ids)
self.parent.assertListEqual(
- list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
+ list(result["last_hidden_state"].size()), [self.batch_size, self.seq_length, self.hidden_size]
)
- self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
+ self.parent.assertListEqual(list(result["pooler_output"].size()), [self.batch_size, self.hidden_size])
def create_and_check_xxx_for_masked_lm(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -153,16 +149,10 @@ def create_and_check_xxx_for_masked_lm(
model = XxxForMaskedLM(config=config)
model.to(torch_device)
model.eval()
- loss, prediction_scores = model(
+ result = model(
input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels
)
- result = {
- "loss": loss,
- "prediction_scores": prediction_scores,
- }
- self.parent.assertListEqual(
- list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
- )
+ self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size])
self.check_loss_output(result)
def create_and_check_xxx_for_question_answering(
@@ -171,18 +161,13 @@ def create_and_check_xxx_for_question_answering(
model = XxxForQuestionAnswering(config=config)
model.to(torch_device)
model.eval()
- loss, start_logits, end_logits = model(
+ result = model(
input_ids,
attention_mask=input_mask,
token_type_ids=token_type_ids,
start_positions=sequence_labels,
end_positions=sequence_labels,
)
- result = {
- "loss": loss,
- "start_logits": start_logits,
- "end_logits": end_logits,
- }
self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
self.check_loss_output(result)
@@ -194,13 +179,7 @@ def create_and_check_xxx_for_sequence_classification(
model = XxxForSequenceClassification(config)
model.to(torch_device)
model.eval()
- loss, logits = model(
- input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels
- )
- result = {
- "loss": loss,
- "logits": logits,
- }
+ result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels])
self.check_loss_output(result)
@@ -211,11 +190,7 @@ def create_and_check_xxx_for_token_classification(
model = XxxForTokenClassification(config=config)
model.to(torch_device)
model.eval()
- loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
- result = {
- "loss": loss,
- "logits": logits,
- }
+ result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels])
self.check_loss_output(result)
diff --git a/tests/test_modeling_albert.py b/tests/test_modeling_albert.py
index 844721f2eadd..c7ad2d21922d 100644
--- a/tests/test_modeling_albert.py
+++ b/tests/test_modeling_albert.py
@@ -98,6 +98,7 @@ def prepare_config_and_inputs(self):
type_vocab_size=self.type_vocab_size,
initializer_range=self.initializer_range,
num_hidden_groups=self.num_hidden_groups,
+ return_dict=True,
)
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -111,18 +112,13 @@ def create_and_check_albert_model(
model = AlbertModel(config=config)
model.to(torch_device)
model.eval()
- sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
- sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
- sequence_output, pooled_output = model(input_ids)
-
- result = {
- "sequence_output": sequence_output,
- "pooled_output": pooled_output,
- }
+ result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
+ result = model(input_ids, token_type_ids=token_type_ids)
+ result = model(input_ids)
self.parent.assertListEqual(
- list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
+ list(result["last_hidden_state"].size()), [self.batch_size, self.seq_length, self.hidden_size]
)
- self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
+ self.parent.assertListEqual(list(result["pooler_output"].size()), [self.batch_size, self.hidden_size])
def create_and_check_albert_for_pretraining(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -130,22 +126,17 @@ def create_and_check_albert_for_pretraining(
model = AlbertForPreTraining(config=config)
model.to(torch_device)
model.eval()
- loss, prediction_scores, sop_scores = model(
+ result = model(
input_ids,
attention_mask=input_mask,
token_type_ids=token_type_ids,
labels=token_labels,
sentence_order_label=sequence_labels,
)
- result = {
- "loss": loss,
- "prediction_scores": prediction_scores,
- "sop_scores": sop_scores,
- }
self.parent.assertListEqual(
- list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
+ list(result["prediction_logits"].size()), [self.batch_size, self.seq_length, self.vocab_size]
)
- self.parent.assertListEqual(list(result["sop_scores"].size()), [self.batch_size, config.num_labels])
+ self.parent.assertListEqual(list(result["sop_logits"].size()), [self.batch_size, config.num_labels])
self.check_loss_output(result)
def create_and_check_albert_for_masked_lm(
@@ -154,16 +145,8 @@ def create_and_check_albert_for_masked_lm(
model = AlbertForMaskedLM(config=config)
model.to(torch_device)
model.eval()
- loss, prediction_scores = model(
- input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels
- )
- result = {
- "loss": loss,
- "prediction_scores": prediction_scores,
- }
- self.parent.assertListEqual(
- list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
- )
+ result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
+ self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size])
self.check_loss_output(result)
def create_and_check_albert_for_question_answering(
@@ -172,18 +155,13 @@ def create_and_check_albert_for_question_answering(
model = AlbertForQuestionAnswering(config=config)
model.to(torch_device)
model.eval()
- loss, start_logits, end_logits = model(
+ result = model(
input_ids,
attention_mask=input_mask,
token_type_ids=token_type_ids,
start_positions=sequence_labels,
end_positions=sequence_labels,
)
- result = {
- "loss": loss,
- "start_logits": start_logits,
- "end_logits": end_logits,
- }
self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
self.check_loss_output(result)
@@ -195,13 +173,7 @@ def create_and_check_albert_for_sequence_classification(
model = AlbertForSequenceClassification(config)
model.to(torch_device)
model.eval()
- loss, logits = model(
- input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels
- )
- result = {
- "loss": loss,
- "logits": logits,
- }
+ result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels])
self.check_loss_output(result)
@@ -212,11 +184,7 @@ def create_and_check_albert_for_token_classification(
model = AlbertForTokenClassification(config=config)
model.to(torch_device)
model.eval()
- loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
- result = {
- "loss": loss,
- "logits": logits,
- }
+ result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels])
self.check_loss_output(result)
@@ -230,16 +198,12 @@ def create_and_check_albert_for_multiple_choice(
multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
- loss, logits = model(
+ result = model(
multiple_choice_inputs_ids,
attention_mask=multiple_choice_input_mask,
token_type_ids=multiple_choice_token_type_ids,
labels=choice_labels,
)
- result = {
- "loss": loss,
- "logits": logits,
- }
self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_choices])
def prepare_config_and_inputs_for_common(self):
diff --git a/tests/test_modeling_bart.py b/tests/test_modeling_bart.py
index 5ea77c70eb59..e86e46812e2e 100644
--- a/tests/test_modeling_bart.py
+++ b/tests/test_modeling_bart.py
@@ -238,6 +238,7 @@ def _get_config_and_data(self):
eos_token_id=2,
pad_token_id=1,
bos_token_id=0,
+ return_dict=True,
)
return config, input_ids, batch_size
@@ -247,24 +248,20 @@ def test_sequence_classification_forward(self):
model = BartForSequenceClassification(config)
model.to(torch_device)
outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=labels)
- logits = outputs[1]
expected_shape = torch.Size((batch_size, config.num_labels))
- self.assertEqual(logits.shape, expected_shape)
- loss = outputs[0]
- self.assertIsInstance(loss.item(), float)
+ self.assertEqual(outputs["logits"].shape, expected_shape)
+ self.assertIsInstance(outputs["loss"].item(), float)
def test_question_answering_forward(self):
config, input_ids, batch_size = self._get_config_and_data()
sequence_labels = ids_tensor([batch_size], 2).to(torch_device)
model = BartForQuestionAnswering(config)
model.to(torch_device)
- loss, start_logits, end_logits, _ = model(
- input_ids=input_ids, start_positions=sequence_labels, end_positions=sequence_labels,
- )
+ outputs = model(input_ids=input_ids, start_positions=sequence_labels, end_positions=sequence_labels,)
- self.assertEqual(start_logits.shape, input_ids.shape)
- self.assertEqual(end_logits.shape, input_ids.shape)
- self.assertIsInstance(loss.item(), float)
+ self.assertEqual(outputs["start_logits"].shape, input_ids.shape)
+ self.assertEqual(outputs["end_logits"].shape, input_ids.shape)
+ self.assertIsInstance(outputs["loss"].item(), float)
@timeout_decorator.timeout(1)
def test_lm_forward(self):
@@ -272,10 +269,10 @@ def test_lm_forward(self):
lm_labels = ids_tensor([batch_size, input_ids.shape[1]], self.vocab_size).to(torch_device)
lm_model = BartForConditionalGeneration(config)
lm_model.to(torch_device)
- loss, logits, enc_features = lm_model(input_ids=input_ids, labels=lm_labels)
+ outputs = lm_model(input_ids=input_ids, labels=lm_labels)
expected_shape = (batch_size, input_ids.shape[1], config.vocab_size)
- self.assertEqual(logits.shape, expected_shape)
- self.assertIsInstance(loss.item(), float)
+ self.assertEqual(outputs["logits"].shape, expected_shape)
+ self.assertIsInstance(outputs["loss"].item(), float)
def test_lm_uneven_forward(self):
config = BartConfig(
@@ -288,13 +285,14 @@ def test_lm_uneven_forward(self):
encoder_ffn_dim=8,
decoder_ffn_dim=8,
max_position_embeddings=48,
+ return_dict=True,
)
lm_model = BartForConditionalGeneration(config).to(torch_device)
context = torch.Tensor([[71, 82, 18, 33, 46, 91, 2], [68, 34, 26, 58, 30, 2, 1]]).long().to(torch_device)
summary = torch.Tensor([[82, 71, 82, 18, 2], [58, 68, 2, 1, 1]]).long().to(torch_device)
- loss, logits, enc_features = lm_model(input_ids=context, decoder_input_ids=summary, labels=summary)
+ outputs = lm_model(input_ids=context, decoder_input_ids=summary, labels=summary)
expected_shape = (*summary.shape, config.vocab_size)
- self.assertEqual(logits.shape, expected_shape)
+ self.assertEqual(outputs["logits"].shape, expected_shape)
def test_generate_beam_search(self):
input_ids = torch.Tensor([[71, 82, 2], [68, 34, 2]]).long().to(torch_device)
diff --git a/tests/test_modeling_bert.py b/tests/test_modeling_bert.py
index 8a689fb8434c..a85d48983e55 100644
--- a/tests/test_modeling_bert.py
+++ b/tests/test_modeling_bert.py
@@ -120,6 +120,7 @@ def prepare_config_and_inputs(self):
type_vocab_size=self.type_vocab_size,
is_decoder=False,
initializer_range=self.initializer_range,
+ return_dict=True,
)
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -160,18 +161,13 @@ def create_and_check_bert_model(
model = BertModel(config=config)
model.to(torch_device)
model.eval()
- sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
- sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
- sequence_output, pooled_output = model(input_ids)
-
- result = {
- "sequence_output": sequence_output,
- "pooled_output": pooled_output,
- }
+ result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
+ result = model(input_ids, token_type_ids=token_type_ids)
+ result = model(input_ids)
self.parent.assertListEqual(
- list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
+ list(result["last_hidden_state"].size()), [self.batch_size, self.seq_length, self.hidden_size]
)
- self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
+ self.parent.assertListEqual(list(result["pooler_output"].size()), [self.batch_size, self.hidden_size])
def create_and_check_bert_model_as_decoder(
self,
@@ -188,29 +184,24 @@ def create_and_check_bert_model_as_decoder(
model = BertModel(config)
model.to(torch_device)
model.eval()
- sequence_output, pooled_output = model(
+ result = model(
input_ids,
attention_mask=input_mask,
token_type_ids=token_type_ids,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
)
- sequence_output, pooled_output = model(
+ result = model(
input_ids,
attention_mask=input_mask,
token_type_ids=token_type_ids,
encoder_hidden_states=encoder_hidden_states,
)
- sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-
- result = {
- "sequence_output": sequence_output,
- "pooled_output": pooled_output,
- }
+ result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
self.parent.assertListEqual(
- list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
+ list(result["last_hidden_state"].size()), [self.batch_size, self.seq_length, self.hidden_size]
)
- self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
+ self.parent.assertListEqual(list(result["pooler_output"].size()), [self.batch_size, self.hidden_size])
def create_and_check_bert_for_causal_lm(
self,
@@ -227,16 +218,8 @@ def create_and_check_bert_for_causal_lm(
model = BertLMHeadModel(config=config)
model.to(torch_device)
model.eval()
- loss, prediction_scores = model(
- input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels
- )
- result = {
- "loss": loss,
- "prediction_scores": prediction_scores,
- }
- self.parent.assertListEqual(
- list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
- )
+ result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
+ self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size])
self.check_loss_output(result)
def create_and_check_bert_for_masked_lm(
@@ -245,16 +228,8 @@ def create_and_check_bert_for_masked_lm(
model = BertForMaskedLM(config=config)
model.to(torch_device)
model.eval()
- loss, prediction_scores = model(
- input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels
- )
- result = {
- "loss": loss,
- "prediction_scores": prediction_scores,
- }
- self.parent.assertListEqual(
- list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
- )
+ result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
+ self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size])
self.check_loss_output(result)
def create_and_check_bert_model_for_causal_lm_as_decoder(
@@ -272,7 +247,7 @@ def create_and_check_bert_model_for_causal_lm_as_decoder(
model = BertLMHeadModel(config=config)
model.to(torch_device)
model.eval()
- loss, prediction_scores = model(
+ result = model(
input_ids,
attention_mask=input_mask,
token_type_ids=token_type_ids,
@@ -280,20 +255,14 @@ def create_and_check_bert_model_for_causal_lm_as_decoder(
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
)
- loss, prediction_scores = model(
+ result = model(
input_ids,
attention_mask=input_mask,
token_type_ids=token_type_ids,
labels=token_labels,
encoder_hidden_states=encoder_hidden_states,
)
- result = {
- "loss": loss,
- "prediction_scores": prediction_scores,
- }
- self.parent.assertListEqual(
- list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
- )
+ self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size])
self.check_loss_output(result)
def create_and_check_bert_for_next_sequence_prediction(
@@ -302,14 +271,10 @@ def create_and_check_bert_for_next_sequence_prediction(
model = BertForNextSentencePrediction(config=config)
model.to(torch_device)
model.eval()
- loss, seq_relationship_score = model(
+ result = model(
input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, next_sentence_label=sequence_labels,
)
- result = {
- "loss": loss,
- "seq_relationship_score": seq_relationship_score,
- }
- self.parent.assertListEqual(list(result["seq_relationship_score"].size()), [self.batch_size, 2])
+ self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, 2])
self.check_loss_output(result)
def create_and_check_bert_for_pretraining(
@@ -318,22 +283,17 @@ def create_and_check_bert_for_pretraining(
model = BertForPreTraining(config=config)
model.to(torch_device)
model.eval()
- loss, prediction_scores, seq_relationship_score = model(
+ result = model(
input_ids,
attention_mask=input_mask,
token_type_ids=token_type_ids,
labels=token_labels,
next_sentence_label=sequence_labels,
)
- result = {
- "loss": loss,
- "prediction_scores": prediction_scores,
- "seq_relationship_score": seq_relationship_score,
- }
self.parent.assertListEqual(
- list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
+ list(result["prediction_logits"].size()), [self.batch_size, self.seq_length, self.vocab_size]
)
- self.parent.assertListEqual(list(result["seq_relationship_score"].size()), [self.batch_size, 2])
+ self.parent.assertListEqual(list(result["seq_relationship_logits"].size()), [self.batch_size, 2])
self.check_loss_output(result)
def create_and_check_bert_for_question_answering(
@@ -342,18 +302,13 @@ def create_and_check_bert_for_question_answering(
model = BertForQuestionAnswering(config=config)
model.to(torch_device)
model.eval()
- loss, start_logits, end_logits = model(
+ result = model(
input_ids,
attention_mask=input_mask,
token_type_ids=token_type_ids,
start_positions=sequence_labels,
end_positions=sequence_labels,
)
- result = {
- "loss": loss,
- "start_logits": start_logits,
- "end_logits": end_logits,
- }
self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
self.check_loss_output(result)
@@ -365,13 +320,7 @@ def create_and_check_bert_for_sequence_classification(
model = BertForSequenceClassification(config)
model.to(torch_device)
model.eval()
- loss, logits = model(
- input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels
- )
- result = {
- "loss": loss,
- "logits": logits,
- }
+ result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels])
self.check_loss_output(result)
@@ -382,11 +331,7 @@ def create_and_check_bert_for_token_classification(
model = BertForTokenClassification(config=config)
model.to(torch_device)
model.eval()
- loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
- result = {
- "loss": loss,
- "logits": logits,
- }
+ result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels])
self.check_loss_output(result)
@@ -400,16 +345,12 @@ def create_and_check_bert_for_multiple_choice(
multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
- loss, logits = model(
+ result = model(
multiple_choice_inputs_ids,
attention_mask=multiple_choice_input_mask,
token_type_ids=multiple_choice_token_type_ids,
labels=choice_labels,
)
- result = {
- "loss": loss,
- "logits": logits,
- }
self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_choices])
self.check_loss_output(result)
diff --git a/tests/test_modeling_camembert.py b/tests/test_modeling_camembert.py
index 606915de2a1d..fa975b7873a1 100644
--- a/tests/test_modeling_camembert.py
+++ b/tests/test_modeling_camembert.py
@@ -28,13 +28,13 @@
class CamembertModelIntegrationTest(unittest.TestCase):
@slow
def test_output_embeds_base_model(self):
- model = CamembertModel.from_pretrained("camembert-base")
+ model = CamembertModel.from_pretrained("camembert-base", return_dict=True)
model.to(torch_device)
input_ids = torch.tensor(
[[5, 121, 11, 660, 16, 730, 25543, 110, 83, 6]], device=torch_device, dtype=torch.long,
) # J'aime le camembert !
- output = model(input_ids)[0]
+ output = model(input_ids)["last_hidden_state"]
expected_shape = torch.Size((1, 10, 768))
self.assertEqual(output.shape, expected_shape)
# compare the actual values for a slice.
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 8207f70f3190..9dde829d7441 100644
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -74,7 +74,6 @@ def _prepare_for_class(self, inputs_dict, model_class):
def test_save_load(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
- config.return_dict = True
for model_class in self.all_model_classes:
model = model_class(config)
diff --git a/tests/test_modeling_ctrl.py b/tests/test_modeling_ctrl.py
index 0d6e8f5aee49..29e5554f4044 100644
--- a/tests/test_modeling_ctrl.py
+++ b/tests/test_modeling_ctrl.py
@@ -88,9 +88,10 @@ def prepare_config_and_inputs(self):
# hidden_dropout_prob=self.hidden_dropout_prob,
# attention_probs_dropout_prob=self.attention_probs_dropout_prob,
n_positions=self.max_position_embeddings,
- n_ctx=self.max_position_embeddings
+ n_ctx=self.max_position_embeddings,
# type_vocab_size=self.type_vocab_size,
- # initializer_range=self.initializer_range
+ # initializer_range=self.initializer_range,
+ return_dict=True,
)
head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
@@ -117,29 +118,20 @@ def create_and_check_ctrl_model(self, config, input_ids, input_mask, head_mask,
model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
model(input_ids, token_type_ids=token_type_ids)
- sequence_output, presents = model(input_ids)
-
- result = {
- "sequence_output": sequence_output,
- "presents": presents,
- }
+ result = model(input_ids)
self.parent.assertListEqual(
- list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
+ list(result["last_hidden_state"].size()), [self.batch_size, self.seq_length, self.hidden_size]
)
- self.parent.assertEqual(len(result["presents"]), config.n_layer)
+ self.parent.assertEqual(len(result["past_key_values"]), config.n_layer)
def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
model = CTRLLMHeadModel(config)
model.to(torch_device)
model.eval()
- loss, lm_logits, _ = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
-
- result = {"loss": loss, "lm_logits": lm_logits}
+ result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
self.parent.assertListEqual(list(result["loss"].size()), [])
- self.parent.assertListEqual(
- list(result["lm_logits"].size()), [self.batch_size, self.seq_length, self.vocab_size]
- )
+ self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size])
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
diff --git a/tests/test_modeling_distilbert.py b/tests/test_modeling_distilbert.py
index 871d4d9da3c5..37e380c1c770 100644
--- a/tests/test_modeling_distilbert.py
+++ b/tests/test_modeling_distilbert.py
@@ -110,6 +110,7 @@ def prepare_config_and_inputs(self):
attention_dropout=self.attention_probs_dropout_prob,
max_position_embeddings=self.max_position_embeddings,
initializer_range=self.initializer_range,
+ return_dict=True,
)
return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -123,14 +124,10 @@ def create_and_check_distilbert_model(
model = DistilBertModel(config=config)
model.to(torch_device)
model.eval()
- (sequence_output,) = model(input_ids, input_mask)
- (sequence_output,) = model(input_ids)
-
- result = {
- "sequence_output": sequence_output,
- }
+ result = model(input_ids, input_mask)
+ result = model(input_ids)
self.parent.assertListEqual(
- list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
+ list(result["last_hidden_state"].size()), [self.batch_size, self.seq_length, self.hidden_size]
)
def create_and_check_distilbert_for_masked_lm(
@@ -139,13 +136,9 @@ def create_and_check_distilbert_for_masked_lm(
model = DistilBertForMaskedLM(config=config)
model.to(torch_device)
model.eval()
- loss, prediction_scores = model(input_ids, attention_mask=input_mask, labels=token_labels)
- result = {
- "loss": loss,
- "prediction_scores": prediction_scores,
- }
+ result = model(input_ids, attention_mask=input_mask, labels=token_labels)
self.parent.assertListEqual(
- list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
+ list(result["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size]
)
self.check_loss_output(result)
@@ -155,14 +148,9 @@ def create_and_check_distilbert_for_question_answering(
model = DistilBertForQuestionAnswering(config=config)
model.to(torch_device)
model.eval()
- loss, start_logits, end_logits = model(
+ result = model(
input_ids, attention_mask=input_mask, start_positions=sequence_labels, end_positions=sequence_labels
)
- result = {
- "loss": loss,
- "start_logits": start_logits,
- "end_logits": end_logits,
- }
self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
self.check_loss_output(result)
@@ -174,11 +162,7 @@ def create_and_check_distilbert_for_sequence_classification(
model = DistilBertForSequenceClassification(config)
model.to(torch_device)
model.eval()
- loss, logits = model(input_ids, attention_mask=input_mask, labels=sequence_labels)
- result = {
- "loss": loss,
- "logits": logits,
- }
+ result = model(input_ids, attention_mask=input_mask, labels=sequence_labels)
self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels])
self.check_loss_output(result)
@@ -190,11 +174,7 @@ def create_and_check_distilbert_for_token_classification(
model.to(torch_device)
model.eval()
- loss, logits = model(input_ids, attention_mask=input_mask, labels=token_labels)
- result = {
- "loss": loss,
- "logits": logits,
- }
+ result = model(input_ids, attention_mask=input_mask, labels=token_labels)
self.parent.assertListEqual(
list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels]
)
@@ -209,13 +189,9 @@ def create_and_check_distilbert_for_multiple_choice(
model.eval()
multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
- loss, logits = model(
+ result = model(
multiple_choice_inputs_ids, attention_mask=multiple_choice_input_mask, labels=choice_labels,
)
- result = {
- "loss": loss,
- "logits": logits,
- }
self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_choices])
self.check_loss_output(result)
diff --git a/tests/test_modeling_dpr.py b/tests/test_modeling_dpr.py
index 42883a4042ff..c3016dab3fb3 100644
--- a/tests/test_modeling_dpr.py
+++ b/tests/test_modeling_dpr.py
@@ -115,6 +115,7 @@ def prepare_config_and_inputs(self):
type_vocab_size=self.type_vocab_size,
is_decoder=False,
initializer_range=self.initializer_range,
+ return_dict=True,
)
config = DPRConfig(projection_dim=self.projection_dim, **config.to_dict())
@@ -126,15 +127,11 @@ def create_and_check_dpr_context_encoder(
model = DPRContextEncoder(config=config)
model.to(torch_device)
model.eval()
- embeddings = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)[0]
- embeddings = model(input_ids, token_type_ids=token_type_ids)[0]
- embeddings = model(input_ids)[0]
-
- result = {
- "embeddings": embeddings,
- }
+ result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
+ result = model(input_ids, token_type_ids=token_type_ids)
+ result = model(input_ids)
self.parent.assertListEqual(
- list(result["embeddings"].size()), [self.batch_size, self.projection_dim or self.hidden_size]
+ list(result["pooler_output"].size()), [self.batch_size, self.projection_dim or self.hidden_size]
)
def create_and_check_dpr_question_encoder(
@@ -143,15 +140,11 @@ def create_and_check_dpr_question_encoder(
model = DPRQuestionEncoder(config=config)
model.to(torch_device)
model.eval()
- embeddings = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)[0]
- embeddings = model(input_ids, token_type_ids=token_type_ids)[0]
- embeddings = model(input_ids)[0]
-
- result = {
- "embeddings": embeddings,
- }
+ result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
+ result = model(input_ids, token_type_ids=token_type_ids)
+ result = model(input_ids)
self.parent.assertListEqual(
- list(result["embeddings"].size()), [self.batch_size, self.projection_dim or self.hidden_size]
+ list(result["pooler_output"].size()), [self.batch_size, self.projection_dim or self.hidden_size]
)
def create_and_check_dpr_reader(
@@ -160,12 +153,7 @@ def create_and_check_dpr_reader(
model = DPRReader(config=config)
model.to(torch_device)
model.eval()
- start_logits, end_logits, relevance_logits, *_ = model(input_ids, attention_mask=input_mask,)
- result = {
- "relevance_logits": relevance_logits,
- "start_logits": start_logits,
- "end_logits": end_logits,
- }
+ result = model(input_ids, attention_mask=input_mask,)
self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
self.parent.assertListEqual(list(result["relevance_logits"].size()), [self.batch_size])
diff --git a/tests/test_modeling_electra.py b/tests/test_modeling_electra.py
index 4f113bd8477a..9fb1a0f46aa9 100644
--- a/tests/test_modeling_electra.py
+++ b/tests/test_modeling_electra.py
@@ -97,6 +97,7 @@ def prepare_config_and_inputs(self):
type_vocab_size=self.type_vocab_size,
is_decoder=False,
initializer_range=self.initializer_range,
+ return_dict=True,
)
return (
@@ -127,15 +128,11 @@ def create_and_check_electra_model(
model = ElectraModel(config=config)
model.to(torch_device)
model.eval()
- (sequence_output,) = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
- (sequence_output,) = model(input_ids, token_type_ids=token_type_ids)
- (sequence_output,) = model(input_ids)
-
- result = {
- "sequence_output": sequence_output,
- }
+ result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
+ result = model(input_ids, token_type_ids=token_type_ids)
+ result = model(input_ids)
self.parent.assertListEqual(
- list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
+ list(result["last_hidden_state"].size()), [self.batch_size, self.seq_length, self.hidden_size]
)
def create_and_check_electra_for_masked_lm(
@@ -152,16 +149,8 @@ def create_and_check_electra_for_masked_lm(
model = ElectraForMaskedLM(config=config)
model.to(torch_device)
model.eval()
- loss, prediction_scores = model(
- input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels
- )
- result = {
- "loss": loss,
- "prediction_scores": prediction_scores,
- }
- self.parent.assertListEqual(
- list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
- )
+ result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
+ self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size])
self.check_loss_output(result)
def create_and_check_electra_for_token_classification(
@@ -179,11 +168,7 @@ def create_and_check_electra_for_token_classification(
model = ElectraForTokenClassification(config=config)
model.to(torch_device)
model.eval()
- loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
- result = {
- "loss": loss,
- "logits": logits,
- }
+ result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels])
self.check_loss_output(result)
@@ -202,13 +187,7 @@ def create_and_check_electra_for_pretraining(
model = ElectraForPreTraining(config=config)
model.to(torch_device)
model.eval()
- loss, logits = model(
- input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=fake_token_labels
- )
- result = {
- "loss": loss,
- "logits": logits,
- }
+ result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=fake_token_labels)
self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length])
self.check_loss_output(result)
@@ -227,13 +206,7 @@ def create_and_check_electra_for_sequence_classification(
model = ElectraForSequenceClassification(config)
model.to(torch_device)
model.eval()
- loss, logits = model(
- input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels
- )
- result = {
- "loss": loss,
- "logits": logits,
- }
+ result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels])
self.check_loss_output(result)
@@ -251,18 +224,13 @@ def create_and_check_electra_for_question_answering(
model = ElectraForQuestionAnswering(config=config)
model.to(torch_device)
model.eval()
- loss, start_logits, end_logits = model(
+ result = model(
input_ids,
attention_mask=input_mask,
token_type_ids=token_type_ids,
start_positions=sequence_labels,
end_positions=sequence_labels,
)
- result = {
- "loss": loss,
- "start_logits": start_logits,
- "end_logits": end_logits,
- }
self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
self.check_loss_output(result)
@@ -285,16 +253,12 @@ def create_and_check_electra_for_multiple_choice(
multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
- loss, logits = model(
+ result = model(
multiple_choice_inputs_ids,
attention_mask=multiple_choice_input_mask,
token_type_ids=multiple_choice_token_type_ids,
labels=choice_labels,
)
- result = {
- "loss": loss,
- "logits": logits,
- }
self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_choices])
self.check_loss_output(result)
diff --git a/tests/test_modeling_flaubert.py b/tests/test_modeling_flaubert.py
index d4342e21843f..bba631831d21 100644
--- a/tests/test_modeling_flaubert.py
+++ b/tests/test_modeling_flaubert.py
@@ -110,6 +110,7 @@ def prepare_config_and_inputs(self):
initializer_range=self.initializer_range,
summary_type=self.summary_type,
use_proj=self.use_proj,
+ return_dict=True,
)
return (
@@ -142,15 +143,11 @@ def create_and_check_flaubert_model(
model = FlaubertModel(config=config)
model.to(torch_device)
model.eval()
- outputs = model(input_ids, lengths=input_lengths, langs=token_type_ids)
- outputs = model(input_ids, langs=token_type_ids)
- outputs = model(input_ids)
- sequence_output = outputs[0]
- result = {
- "sequence_output": sequence_output,
- }
+ result = model(input_ids, lengths=input_lengths, langs=token_type_ids)
+ result = model(input_ids, langs=token_type_ids)
+ result = model(input_ids)
self.parent.assertListEqual(
- list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
+ list(result["last_hidden_state"].size()), [self.batch_size, self.seq_length, self.hidden_size]
)
def create_and_check_flaubert_lm_head(
@@ -169,13 +166,7 @@ def create_and_check_flaubert_lm_head(
model.to(torch_device)
model.eval()
- loss, logits = model(input_ids, token_type_ids=token_type_ids, labels=token_labels)
-
- result = {
- "loss": loss,
- "logits": logits,
- }
-
+ result = model(input_ids, token_type_ids=token_type_ids, labels=token_labels)
self.parent.assertListEqual(list(result["loss"].size()), [])
self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size])
@@ -195,16 +186,9 @@ def create_and_check_flaubert_simple_qa(
model.to(torch_device)
model.eval()
- outputs = model(input_ids)
-
- outputs = model(input_ids, start_positions=sequence_labels, end_positions=sequence_labels)
- loss, start_logits, end_logits = outputs
+ result = model(input_ids)
- result = {
- "loss": loss,
- "start_logits": start_logits,
- "end_logits": end_logits,
- }
+ result = model(input_ids, start_positions=sequence_labels, end_positions=sequence_labels)
self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
self.check_loss_output(result)
@@ -225,10 +209,9 @@ def create_and_check_flaubert_qa(
model.to(torch_device)
model.eval()
- outputs = model(input_ids)
- start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits = outputs
+ result = model(input_ids)
- outputs = model(
+ result_with_labels = model(
input_ids,
start_positions=sequence_labels,
end_positions=sequence_labels,
@@ -237,7 +220,7 @@ def create_and_check_flaubert_qa(
p_mask=input_mask,
)
- outputs = model(
+ result_with_labels = model(
input_ids,
start_positions=sequence_labels,
end_positions=sequence_labels,
@@ -245,22 +228,13 @@ def create_and_check_flaubert_qa(
is_impossible=is_impossible_labels,
)
- (total_loss,) = outputs
+ (total_loss,) = result_with_labels.to_tuple()
- outputs = model(input_ids, start_positions=sequence_labels, end_positions=sequence_labels)
+ result_with_labels = model(input_ids, start_positions=sequence_labels, end_positions=sequence_labels)
- (total_loss,) = outputs
+ (total_loss,) = result_with_labels.to_tuple()
- result = {
- "loss": total_loss,
- "start_top_log_probs": start_top_log_probs,
- "start_top_index": start_top_index,
- "end_top_log_probs": end_top_log_probs,
- "end_top_index": end_top_index,
- "cls_logits": cls_logits,
- }
-
- self.parent.assertListEqual(list(result["loss"].size()), [])
+ self.parent.assertListEqual(list(result_with_labels["loss"].size()), [])
self.parent.assertListEqual(
list(result["start_top_log_probs"].size()), [self.batch_size, model.config.start_n_top]
)
@@ -292,13 +266,8 @@ def create_and_check_flaubert_sequence_classif(
model.to(torch_device)
model.eval()
- (logits,) = model(input_ids)
- loss, logits = model(input_ids, labels=sequence_labels)
-
- result = {
- "loss": loss,
- "logits": logits,
- }
+ result = model(input_ids)
+ result = model(input_ids, labels=sequence_labels)
self.parent.assertListEqual(list(result["loss"].size()), [])
self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.type_sequence_label_size])
@@ -320,11 +289,7 @@ def create_and_check_flaubert_token_classif(
model.to(torch_device)
model.eval()
- loss, logits = model(input_ids, attention_mask=input_mask, labels=token_labels)
- result = {
- "loss": loss,
- "logits": logits,
- }
+ result = model(input_ids, attention_mask=input_mask, labels=token_labels)
self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels])
self.check_loss_output(result)
@@ -347,16 +312,12 @@ def create_and_check_flaubert_multiple_choice(
multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
- loss, logits = model(
+ result = model(
multiple_choice_inputs_ids,
attention_mask=multiple_choice_input_mask,
token_type_ids=multiple_choice_token_type_ids,
labels=choice_labels,
)
- result = {
- "loss": loss,
- "logits": logits,
- }
self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_choices])
self.check_loss_output(result)
diff --git a/tests/test_modeling_gpt2.py b/tests/test_modeling_gpt2.py
index b97d9d385657..14ef2257c439 100644
--- a/tests/test_modeling_gpt2.py
+++ b/tests/test_modeling_gpt2.py
@@ -122,9 +122,10 @@ def prepare_config_and_inputs(self):
n_positions=self.max_position_embeddings,
n_ctx=self.max_position_embeddings,
# type_vocab_size=self.type_vocab_size,
- # initializer_range=self.initializer_range
+ # initializer_range=self.initializer_range,
bos_token_id=self.bos_token_id,
eos_token_id=self.eos_token_id,
+ return_dict=True,
)
head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
@@ -149,18 +150,14 @@ def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask,
model.to(torch_device)
model.eval()
- model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
- model(input_ids, token_type_ids=token_type_ids)
- sequence_output, presents = model(input_ids)
+ result = model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
+ result = model(input_ids, token_type_ids=token_type_ids)
+ result = model(input_ids)
- result = {
- "sequence_output": sequence_output,
- "presents": presents,
- }
self.parent.assertListEqual(
- list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size],
+ list(result["last_hidden_state"].size()), [self.batch_size, self.seq_length, self.hidden_size],
)
- self.parent.assertEqual(len(result["presents"]), config.n_layer)
+ self.parent.assertEqual(len(result["past_key_values"]), config.n_layer)
def create_and_check_gpt2_model_past(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
model = GPT2Model(config=config)
@@ -175,7 +172,7 @@ def create_and_check_gpt2_model_past(self, config, input_ids, input_mask, head_m
self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf))
self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1)
- output, past = outputs
+ output, past = outputs.to_tuple()
# create hypothetical next token and extent to next_input_ids
next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
@@ -185,8 +182,8 @@ def create_and_check_gpt2_model_past(self, config, input_ids, input_mask, head_m
next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
next_token_type_ids = torch.cat([token_type_ids, next_token_types], dim=-1)
- output_from_no_past, _ = model(next_input_ids, token_type_ids=next_token_type_ids)
- output_from_past, _ = model(next_tokens, token_type_ids=next_token_types, past=past)
+ output_from_no_past = model(next_input_ids, token_type_ids=next_token_type_ids)["last_hidden_state"]
+ output_from_past = model(next_tokens, token_type_ids=next_token_types, past=past)["last_hidden_state"]
# select random slice
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
@@ -209,7 +206,7 @@ def create_and_check_gpt2_model_attention_mask_past(
attn_mask[:, half_seq_length:] = 0
# first forward pass
- output, past = model(input_ids, attention_mask=attn_mask)
+ output, past = model(input_ids, attention_mask=attn_mask).to_tuple()
# create hypothetical next token and extent to next_input_ids
next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
@@ -226,8 +223,8 @@ def create_and_check_gpt2_model_attention_mask_past(
)
# get two different outputs
- output_from_no_past, _ = model(next_input_ids, attention_mask=attn_mask)
- output_from_past, _ = model(next_tokens, past=past, attention_mask=attn_mask)
+ output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"]
+ output_from_past = model(next_tokens, past=past, attention_mask=attn_mask)["last_hidden_state"]
# select random slice
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
@@ -242,13 +239,10 @@ def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mas
model.to(torch_device)
model.eval()
- loss, lm_logits, _ = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
-
- result = {"loss": loss, "lm_logits": lm_logits}
-
+ result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
self.parent.assertListEqual(list(result["loss"].size()), [])
self.parent.assertListEqual(
- list(result["lm_logits"].size()), [self.batch_size, self.seq_length, self.vocab_size],
+ list(result["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size],
)
def create_and_check_double_lm_head_model(
@@ -270,11 +264,8 @@ def create_and_check_double_lm_head_model(
"labels": multiple_choice_inputs_ids,
}
- loss, lm_logits, mc_logits, _ = model(**inputs)
-
- result = {"loss": loss, "lm_logits": lm_logits, "mc_logits": mc_logits}
-
- self.parent.assertListEqual(list(result["loss"].size()), [])
+ result = model(**inputs)
+ self.parent.assertListEqual(list(result["lm_loss"].size()), [])
self.parent.assertListEqual(
list(result["lm_logits"].size()), [self.batch_size, self.num_choices, self.seq_length, self.vocab_size],
)
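The GPT-2 changes also exercise the cache: `past_key_values` is read by name, fed back through the `past` argument on the next call, and `to_tuple()` recovers the old positional layout where a test still wants it. A rough sketch of that flow, mirroring the calls in the diff (arbitrary tiny config, same assumptions as the previous sketch):

```python
# Illustrative only, not part of the patch; arbitrary tiny config, assumes
# torch and a return_dict-capable transformers release (circa v3.x).
import torch
from transformers import GPT2Config, GPT2Model

config = GPT2Config(
    vocab_size=100, n_positions=64, n_ctx=64, n_embd=32, n_layer=2, n_head=4,
    return_dict=True,
)
model = GPT2Model(config).eval()

input_ids = torch.randint(0, 100, (1, 5))
with torch.no_grad():
    out = model(input_ids)

hidden = out["last_hidden_state"]   # shape (1, 5, n_embd)
past = out["past_key_values"]       # one cached entry per layer
assert len(past) == config.n_layer

# Feed the cache back in for the next step, as the *_model_past tests do.
next_token = torch.randint(0, 100, (1, 1))
with torch.no_grad():
    step = model(next_token, past=past)["last_hidden_state"]
assert step.shape == (1, 1, config.n_embd)

# to_tuple() still yields the old positional layout.
hidden_again, past_again = out.to_tuple()
```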
diff --git a/tests/test_modeling_longformer.py b/tests/test_modeling_longformer.py
index 8bfea34bf993..a98b9a7e3565 100644
--- a/tests/test_modeling_longformer.py
+++ b/tests/test_modeling_longformer.py
@@ -108,6 +108,7 @@ def prepare_config_and_inputs(self):
type_vocab_size=self.type_vocab_size,
initializer_range=self.initializer_range,
attention_window=self.attention_window,
+ return_dict=True,
)
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -123,8 +124,8 @@ def create_and_check_attention_mask_determinism(
model.eval()
attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)
- output_with_mask = model(input_ids, attention_mask=attention_mask)[0]
- output_without_mask = model(input_ids)[0]
+ output_with_mask = model(input_ids, attention_mask=attention_mask)["last_hidden_state"]
+ output_without_mask = model(input_ids)["last_hidden_state"]
self.parent.assertTrue(torch.allclose(output_with_mask[0, 0, :5], output_without_mask[0, 0, :5], atol=1e-4))
def create_and_check_longformer_model(
@@ -133,18 +134,13 @@ def create_and_check_longformer_model(
model = LongformerModel(config=config)
model.to(torch_device)
model.eval()
- sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
- sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
- sequence_output, pooled_output = model(input_ids)
-
- result = {
- "sequence_output": sequence_output,
- "pooled_output": pooled_output,
- }
+ result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
+ result = model(input_ids, token_type_ids=token_type_ids)
+ result = model(input_ids)
self.parent.assertListEqual(
- list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
+ list(result["last_hidden_state"].size()), [self.batch_size, self.seq_length, self.hidden_size]
)
- self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
+ self.parent.assertListEqual(list(result["pooler_output"].size()), [self.batch_size, self.hidden_size])
def create_and_check_longformer_model_with_global_attention_mask(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -156,25 +152,19 @@ def create_and_check_longformer_model_with_global_attention_mask(
global_attention_mask[:, input_mask.shape[-1] // 2] = 0
global_attention_mask = global_attention_mask.to(torch_device)
- sequence_output, pooled_output = model(
+ result = model(
input_ids,
attention_mask=input_mask,
global_attention_mask=global_attention_mask,
token_type_ids=token_type_ids,
)
- sequence_output, pooled_output = model(
- input_ids, token_type_ids=token_type_ids, global_attention_mask=global_attention_mask
- )
- sequence_output, pooled_output = model(input_ids, global_attention_mask=global_attention_mask)
+ result = model(input_ids, token_type_ids=token_type_ids, global_attention_mask=global_attention_mask)
+ result = model(input_ids, global_attention_mask=global_attention_mask)
- result = {
- "sequence_output": sequence_output,
- "pooled_output": pooled_output,
- }
self.parent.assertListEqual(
- list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
+ list(result["last_hidden_state"].size()), [self.batch_size, self.seq_length, self.hidden_size]
)
- self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
+ self.parent.assertListEqual(list(result["pooler_output"].size()), [self.batch_size, self.hidden_size])
def create_and_check_longformer_for_masked_lm(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -182,16 +172,8 @@ def create_and_check_longformer_for_masked_lm(
model = LongformerForMaskedLM(config=config)
model.to(torch_device)
model.eval()
- loss, prediction_scores = model(
- input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels
- )
- result = {
- "loss": loss,
- "prediction_scores": prediction_scores,
- }
- self.parent.assertListEqual(
- list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
- )
+ result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
+ self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size])
self.check_loss_output(result)
def create_and_check_longformer_for_question_answering(
@@ -200,7 +182,7 @@ def create_and_check_longformer_for_question_answering(
model = LongformerForQuestionAnswering(config=config)
model.to(torch_device)
model.eval()
- loss, start_logits, end_logits = model(
+ result = model(
input_ids,
attention_mask=input_mask,
global_attention_mask=input_mask,
@@ -208,11 +190,6 @@ def create_and_check_longformer_for_question_answering(
start_positions=sequence_labels,
end_positions=sequence_labels,
)
- result = {
- "loss": loss,
- "start_logits": start_logits,
- "end_logits": end_logits,
- }
self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
self.check_loss_output(result)
@@ -224,13 +201,7 @@ def create_and_check_longformer_for_sequence_classification(
model = LongformerForSequenceClassification(config)
model.to(torch_device)
model.eval()
- loss, logits = model(
- input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels
- )
- result = {
- "loss": loss,
- "logits": logits,
- }
+ result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels])
self.check_loss_output(result)
@@ -241,11 +212,7 @@ def create_and_check_longformer_for_token_classification(
model = LongformerForTokenClassification(config=config)
model.to(torch_device)
model.eval()
- loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
- result = {
- "loss": loss,
- "logits": logits,
- }
+ result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels])
self.check_loss_output(result)
@@ -260,17 +227,13 @@ def create_and_check_longformer_for_multiple_choice(
multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
- loss, logits = model(
+ result = model(
multiple_choice_inputs_ids,
attention_mask=multiple_choice_input_mask,
global_attention_mask=multiple_choice_input_mask,
token_type_ids=multiple_choice_token_type_ids,
labels=choice_labels,
)
- result = {
- "loss": loss,
- "logits": logits,
- }
self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_choices])
self.check_loss_output(result)
diff --git a/tests/test_modeling_mbart.py b/tests/test_modeling_mbart.py
index 159fc42976b6..5445555221be 100644
--- a/tests/test_modeling_mbart.py
+++ b/tests/test_modeling_mbart.py
@@ -114,13 +114,14 @@ def test_mbart_fast_forward(self):
decoder_ffn_dim=32,
max_position_embeddings=48,
add_final_layer_norm=True,
+ return_dict=True,
)
lm_model = BartForConditionalGeneration(config).to(torch_device)
context = torch.Tensor([[71, 82, 18, 33, 46, 91, 2], [68, 34, 26, 58, 30, 2, 1]]).long().to(torch_device)
summary = torch.Tensor([[82, 71, 82, 18, 2], [58, 68, 2, 1, 1]]).long().to(torch_device)
- loss, logits, enc_features = lm_model(input_ids=context, decoder_input_ids=summary, labels=summary)
+ result = lm_model(input_ids=context, decoder_input_ids=summary, labels=summary)
expected_shape = (*summary.shape, config.vocab_size)
- self.assertEqual(logits.shape, expected_shape)
+ self.assertEqual(result["logits"].shape, expected_shape)
@require_torch
diff --git a/tests/test_modeling_mobilebert.py b/tests/test_modeling_mobilebert.py
index 6d46c319c2be..2d85d7faf351 100644
--- a/tests/test_modeling_mobilebert.py
+++ b/tests/test_modeling_mobilebert.py
@@ -122,6 +122,7 @@ def prepare_config_and_inputs(self):
type_vocab_size=self.type_vocab_size,
is_decoder=False,
initializer_range=self.initializer_range,
+ return_dict=True,
)
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -162,18 +163,14 @@ def create_and_check_mobilebert_model(
model = MobileBertModel(config=config)
model.to(torch_device)
model.eval()
- sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
- sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
- sequence_output, pooled_output = model(input_ids)
-
- result = {
- "sequence_output": sequence_output,
- "pooled_output": pooled_output,
- }
+ result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
+ result = model(input_ids, token_type_ids=token_type_ids)
+ result = model(input_ids)
+
self.parent.assertListEqual(
- list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
+ list(result["last_hidden_state"].size()), [self.batch_size, self.seq_length, self.hidden_size]
)
- self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
+ self.parent.assertListEqual(list(result["pooler_output"].size()), [self.batch_size, self.hidden_size])
def create_and_check_mobilebert_model_as_decoder(
self,
@@ -190,29 +187,25 @@ def create_and_check_mobilebert_model_as_decoder(
model = MobileBertModel(config)
model.to(torch_device)
model.eval()
- sequence_output, pooled_output = model(
+ result = model(
input_ids,
attention_mask=input_mask,
token_type_ids=token_type_ids,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
)
- sequence_output, pooled_output = model(
+ result = model(
input_ids,
attention_mask=input_mask,
token_type_ids=token_type_ids,
encoder_hidden_states=encoder_hidden_states,
)
- sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
+ result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
- result = {
- "sequence_output": sequence_output,
- "pooled_output": pooled_output,
- }
self.parent.assertListEqual(
- list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
+ list(result["last_hidden_state"].size()), [self.batch_size, self.seq_length, self.hidden_size]
)
- self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
+ self.parent.assertListEqual(list(result["pooler_output"].size()), [self.batch_size, self.hidden_size])
def create_and_check_mobilebert_for_masked_lm(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -220,16 +213,8 @@ def create_and_check_mobilebert_for_masked_lm(
model = MobileBertForMaskedLM(config=config)
model.to(torch_device)
model.eval()
- loss, prediction_scores = model(
- input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels
- )
- result = {
- "loss": loss,
- "prediction_scores": prediction_scores,
- }
- self.parent.assertListEqual(
- list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
- )
+ result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
+ self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size])
self.check_loss_output(result)
def create_and_check_mobilebert_for_next_sequence_prediction(
@@ -238,14 +223,10 @@ def create_and_check_mobilebert_for_next_sequence_prediction(
model = MobileBertForNextSentencePrediction(config=config)
model.to(torch_device)
model.eval()
- loss, seq_relationship_score = model(
+ result = model(
input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, next_sentence_label=sequence_labels,
)
- result = {
- "loss": loss,
- "seq_relationship_score": seq_relationship_score,
- }
- self.parent.assertListEqual(list(result["seq_relationship_score"].size()), [self.batch_size, 2])
+ self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, 2])
self.check_loss_output(result)
def create_and_check_mobilebert_for_pretraining(
@@ -254,22 +235,17 @@ def create_and_check_mobilebert_for_pretraining(
model = MobileBertForPreTraining(config=config)
model.to(torch_device)
model.eval()
- loss, prediction_scores, seq_relationship_score = model(
+ result = model(
input_ids,
attention_mask=input_mask,
token_type_ids=token_type_ids,
labels=token_labels,
next_sentence_label=sequence_labels,
)
- result = {
- "loss": loss,
- "prediction_scores": prediction_scores,
- "seq_relationship_score": seq_relationship_score,
- }
self.parent.assertListEqual(
- list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
+ list(result["prediction_logits"].size()), [self.batch_size, self.seq_length, self.vocab_size]
)
- self.parent.assertListEqual(list(result["seq_relationship_score"].size()), [self.batch_size, 2])
+ self.parent.assertListEqual(list(result["seq_relationship_logits"].size()), [self.batch_size, 2])
self.check_loss_output(result)
def create_and_check_mobilebert_for_question_answering(
@@ -278,18 +254,13 @@ def create_and_check_mobilebert_for_question_answering(
model = MobileBertForQuestionAnswering(config=config)
model.to(torch_device)
model.eval()
- loss, start_logits, end_logits = model(
+ result = model(
input_ids,
attention_mask=input_mask,
token_type_ids=token_type_ids,
start_positions=sequence_labels,
end_positions=sequence_labels,
)
- result = {
- "loss": loss,
- "start_logits": start_logits,
- "end_logits": end_logits,
- }
self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
self.check_loss_output(result)
@@ -301,13 +272,7 @@ def create_and_check_mobilebert_for_sequence_classification(
model = MobileBertForSequenceClassification(config)
model.to(torch_device)
model.eval()
- loss, logits = model(
- input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels
- )
- result = {
- "loss": loss,
- "logits": logits,
- }
+ result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels])
self.check_loss_output(result)
@@ -318,11 +283,7 @@ def create_and_check_mobilebert_for_token_classification(
model = MobileBertForTokenClassification(config=config)
model.to(torch_device)
model.eval()
- loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
- result = {
- "loss": loss,
- "logits": logits,
- }
+ result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels])
self.check_loss_output(result)
@@ -336,16 +297,12 @@ def create_and_check_mobilebert_for_multiple_choice(
multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
- loss, logits = model(
+ result = model(
multiple_choice_inputs_ids,
attention_mask=multiple_choice_input_mask,
token_type_ids=multiple_choice_token_type_ids,
labels=choice_labels,
)
- result = {
- "loss": loss,
- "logits": logits,
- }
self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_choices])
self.check_loss_output(result)
diff --git a/tests/test_modeling_openai.py b/tests/test_modeling_openai.py
index 7818623672d6..5d39313da957 100644
--- a/tests/test_modeling_openai.py
+++ b/tests/test_modeling_openai.py
@@ -85,9 +85,10 @@ def prepare_config_and_inputs(self):
# hidden_dropout_prob=self.hidden_dropout_prob,
# attention_probs_dropout_prob=self.attention_probs_dropout_prob,
n_positions=self.max_position_embeddings,
- n_ctx=self.max_position_embeddings
+ n_ctx=self.max_position_embeddings,
# type_vocab_size=self.type_vocab_size,
# initializer_range=self.initializer_range
+ return_dict=True,
)
head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
@@ -110,13 +111,12 @@ def create_and_check_openai_gpt_model(self, config, input_ids, head_mask, token_
model.to(torch_device)
model.eval()
- model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
- model(input_ids, token_type_ids=token_type_ids)
- (sequence_output,) = model(input_ids)
+ result = model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
+ result = model(input_ids, token_type_ids=token_type_ids)
+ result = model(input_ids)
- result = {"sequence_output": sequence_output}
self.parent.assertListEqual(
- list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size],
+ list(result["last_hidden_state"].size()), [self.batch_size, self.seq_length, self.hidden_size],
)
def create_and_check_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
@@ -124,13 +124,10 @@ def create_and_check_lm_head_model(self, config, input_ids, head_mask, token_typ
model.to(torch_device)
model.eval()
- loss, lm_logits = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
-
- result = {"loss": loss, "lm_logits": lm_logits}
-
+ result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
self.parent.assertListEqual(list(result["loss"].size()), [])
self.parent.assertListEqual(
- list(result["lm_logits"].size()), [self.batch_size, self.seq_length, self.vocab_size],
+ list(result["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size],
)
def create_and_check_double_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
@@ -138,11 +135,8 @@ def create_and_check_double_lm_head_model(self, config, input_ids, head_mask, to
model.to(torch_device)
model.eval()
- loss, lm_logits, mc_logits = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
-
- result = {"loss": loss, "lm_logits": lm_logits}
-
- self.parent.assertListEqual(list(result["loss"].size()), [])
+ result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
+ self.parent.assertListEqual(list(result["lm_loss"].size()), [])
self.parent.assertListEqual(
list(result["lm_logits"].size()), [self.batch_size, self.seq_length, self.vocab_size],
)
diff --git a/tests/test_modeling_reformer.py b/tests/test_modeling_reformer.py
index b70ec98c8b56..b15f1d435565 100644
--- a/tests/test_modeling_reformer.py
+++ b/tests/test_modeling_reformer.py
@@ -165,6 +165,7 @@ def prepare_config_and_inputs(self):
attn_layers=self.attn_layers,
pad_token_id=self.pad_token_id,
hash_seed=self.hash_seed,
+ return_dict=True,
)
return (
@@ -181,15 +182,12 @@ def create_and_check_reformer_model(self, config, input_ids, input_mask, choice_
model = ReformerModel(config=config)
model.to(torch_device)
model.eval()
- sequence_output, _ = model(input_ids, attention_mask=input_mask)
- sequence_output, _ = model(input_ids)
+ result = model(input_ids, attention_mask=input_mask)
+ result = model(input_ids)
- result = {
- "sequence_output": sequence_output,
- }
# 2 * hidden_size because we use reversible resnet layers
self.parent.assertListEqual(
- list(result["sequence_output"].size()), [self.batch_size, self.seq_length, 2 * self.hidden_size],
+ list(result["last_hidden_state"].size()), [self.batch_size, self.seq_length, 2 * self.hidden_size],
)
def create_and_check_reformer_model_with_lm_backward(self, config, input_ids, input_mask, choice_labels):
@@ -198,7 +196,7 @@ def create_and_check_reformer_model_with_lm_backward(self, config, input_ids, in
model = ReformerForMaskedLM(config=config)
model.to(torch_device)
model.eval()
- loss = model(input_ids, attention_mask=input_mask, labels=input_ids)[0]
+ loss = model(input_ids, attention_mask=input_mask, labels=input_ids)["loss"]
loss.backward()
def create_and_check_reformer_with_lm(self, config, input_ids, input_mask, choice_labels):
@@ -207,13 +205,9 @@ def create_and_check_reformer_with_lm(self, config, input_ids, input_mask, choic
model = ReformerModelWithLMHead(config=config)
model.to(torch_device)
model.eval()
- loss, prediction_scores, _ = model(input_ids, attention_mask=input_mask, labels=input_ids)
- result = {
- "loss": loss,
- "prediction_scores": prediction_scores,
- }
+ result = model(input_ids, attention_mask=input_mask, labels=input_ids)
self.parent.assertListEqual(
- list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size],
+ list(result["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size],
)
self.check_loss_output(result)
@@ -222,13 +216,9 @@ def create_and_check_reformer_with_mlm(self, config, input_ids, input_mask, choi
model = ReformerForMaskedLM(config=config)
model.to(torch_device)
model.eval()
- loss, prediction_scores = model(input_ids, attention_mask=input_mask, labels=input_ids)
- result = {
- "loss": loss,
- "prediction_scores": prediction_scores,
- }
+ result = model(input_ids, attention_mask=input_mask, labels=input_ids)
self.parent.assertListEqual(
- list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size],
+ list(result["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size],
)
self.check_loss_output(result)
@@ -325,7 +315,7 @@ def create_and_check_reformer_feed_forward_chunking(self, config, input_ids, inp
model.to(torch_device)
model.eval()
- hidden_states_with_chunk = model(input_ids, attention_mask=input_mask)[0]
+ hidden_states_with_chunk = model(input_ids, attention_mask=input_mask)["last_hidden_state"]
self.parent.assertTrue(torch.allclose(hidden_states_no_chunk, hidden_states_with_chunk, atol=1e-3))
def create_and_check_reformer_feed_backward_chunking(self, config, input_ids, input_mask, choice_labels):
@@ -408,7 +398,7 @@ def create_and_check_reformer_model_fp16_forward(self, config, input_ids, input_
model.to(torch_device)
model.half()
model.eval()
- output = model(input_ids, attention_mask=input_mask)[0]
+ output = model(input_ids, attention_mask=input_mask)["last_hidden_state"]
self.parent.assertFalse(torch.isnan(output).any().item())
def create_and_check_reformer_model_generate(self, config, input_ids, input_mask, choice_labels):
@@ -444,21 +434,16 @@ def create_and_check_reformer_no_chunking(self, config, input_ids, input_mask, c
model = ReformerForMaskedLM(config=config)
model.to(torch_device)
model.eval()
- output_logits = model(input_ids, attention_mask=input_mask)[0]
+ output_logits = model(input_ids, attention_mask=input_mask)["logits"]
self.parent.assertTrue(output_logits.shape[1] == input_ids.shape[-1])
def create_and_check_reformer_for_question_answering(self, config, input_ids, input_mask, choice_labels):
model = ReformerForQuestionAnswering(config=config)
model.to(torch_device)
model.eval()
- loss, start_logits, end_logits = model(
+ result = model(
input_ids, attention_mask=input_mask, start_positions=choice_labels, end_positions=choice_labels,
)
- result = {
- "loss": loss,
- "start_logits": start_logits,
- "end_logits": end_logits,
- }
self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
self.check_loss_output(result)
@@ -474,11 +459,11 @@ def create_and_check_past_buckets_states(self, config, input_ids, input_mask, ch
input_ids_second = input_ids[:, -1:]
# return saved cache
- _, past_buckets_states = model(input_ids_first, use_cache=True)
+ past_buckets_states = model(input_ids_first, use_cache=True)["past_buckets_states"]
# calculate last output with and without cache
- outputs_with_cache, _ = model(input_ids_second, past_buckets_states=past_buckets_states, use_cache=True)
- outputs_without_cache = model(input_ids)[0][:, -1]
+ outputs_with_cache = model(input_ids_second, past_buckets_states=past_buckets_states, use_cache=True)["logits"]
+ outputs_without_cache = model(input_ids)["logits"][:, -1]
# select random slice idx
random_slice_idx = torch.randint(outputs_without_cache.shape[-1], (1, 1), device=torch_device).item()
@@ -504,11 +489,7 @@ def create_and_check_reformer_for_sequence_classification(
model = ReformerForSequenceClassification(config)
model.to(torch_device)
model.eval()
- loss, logits = model(input_ids, attention_mask=input_mask, labels=sequence_labels)
- result = {
- "loss": loss,
- "logits": logits,
- }
+ result = model(input_ids, attention_mask=input_mask, labels=sequence_labels)
self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels])
self.check_loss_output(result)
diff --git a/tests/test_modeling_roberta.py b/tests/test_modeling_roberta.py
index e408a6b4a7f6..82de9241919f 100644
--- a/tests/test_modeling_roberta.py
+++ b/tests/test_modeling_roberta.py
@@ -96,6 +96,7 @@ def prepare_config_and_inputs(self):
max_position_embeddings=self.max_position_embeddings,
type_vocab_size=self.type_vocab_size,
initializer_range=self.initializer_range,
+ return_dict=True,
)
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -109,18 +110,14 @@ def create_and_check_roberta_model(
model = RobertaModel(config=config)
model.to(torch_device)
model.eval()
- sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
- sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
- sequence_output, pooled_output = model(input_ids)
-
- result = {
- "sequence_output": sequence_output,
- "pooled_output": pooled_output,
- }
+ result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
+ result = model(input_ids, token_type_ids=token_type_ids)
+ result = model(input_ids)
+
self.parent.assertListEqual(
- list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
+ list(result["last_hidden_state"].size()), [self.batch_size, self.seq_length, self.hidden_size]
)
- self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
+ self.parent.assertListEqual(list(result["pooler_output"].size()), [self.batch_size, self.hidden_size])
def create_and_check_roberta_for_masked_lm(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -128,16 +125,8 @@ def create_and_check_roberta_for_masked_lm(
model = RobertaForMaskedLM(config=config)
model.to(torch_device)
model.eval()
- loss, prediction_scores = model(
- input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels
- )
- result = {
- "loss": loss,
- "prediction_scores": prediction_scores,
- }
- self.parent.assertListEqual(
- list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
- )
+ result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
+ self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size])
self.check_loss_output(result)
def create_and_check_roberta_for_token_classification(
@@ -147,11 +136,7 @@ def create_and_check_roberta_for_token_classification(
model = RobertaForTokenClassification(config=config)
model.to(torch_device)
model.eval()
- loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
- result = {
- "loss": loss,
- "logits": logits,
- }
+ result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels])
self.check_loss_output(result)
@@ -165,16 +150,12 @@ def create_and_check_roberta_for_multiple_choice(
multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
- loss, logits = model(
+ result = model(
multiple_choice_inputs_ids,
attention_mask=multiple_choice_input_mask,
token_type_ids=multiple_choice_token_type_ids,
labels=choice_labels,
)
- result = {
- "loss": loss,
- "logits": logits,
- }
self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_choices])
self.check_loss_output(result)
@@ -184,18 +165,13 @@ def create_and_check_roberta_for_question_answering(
model = RobertaForQuestionAnswering(config=config)
model.to(torch_device)
model.eval()
- loss, start_logits, end_logits = model(
+ result = model(
input_ids,
attention_mask=input_mask,
token_type_ids=token_type_ids,
start_positions=sequence_labels,
end_positions=sequence_labels,
)
- result = {
- "loss": loss,
- "start_logits": start_logits,
- "end_logits": end_logits,
- }
self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
self.check_loss_output(result)
diff --git a/tests/test_modeling_t5.py b/tests/test_modeling_t5.py
index 09949f07b248..9177e2cd5438 100644
--- a/tests/test_modeling_t5.py
+++ b/tests/test_modeling_t5.py
@@ -83,6 +83,7 @@ def prepare_config_and_inputs(self):
bos_token_id=self.pad_token_id,
pad_token_id=self.pad_token_id,
decoder_start_token_id=self.decoder_start_token_id,
+ return_dict=True,
)
return (
@@ -136,13 +137,17 @@ def create_and_check_t5_model(
model = T5Model(config=config)
model.to(torch_device)
model.eval()
- decoder_output, decoder_past, encoder_output = model(
+ result = model(
input_ids=input_ids,
decoder_input_ids=decoder_input_ids,
attention_mask=attention_mask,
decoder_attention_mask=decoder_attention_mask,
)
- decoder_output, decoder_past, encoder_output = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
+ result = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
+ decoder_output = result["last_hidden_state"]
+ decoder_past = result["decoder_past_key_values"]
+ encoder_output = result["encoder_last_hidden_state"]
+
self.parent.assertEqual(encoder_output.size(), (self.batch_size, self.encoder_seq_length, self.hidden_size))
self.parent.assertEqual(decoder_output.size(), (self.batch_size, self.decoder_seq_length, self.hidden_size))
self.parent.assertEqual(len(decoder_past), 2)
@@ -162,10 +167,9 @@ def create_and_check_t5_with_lm_head(
decoder_attention_mask=decoder_attention_mask,
labels=lm_labels,
)
- loss, prediction_scores, _, _ = outputs
self.parent.assertEqual(len(outputs), 4)
- self.parent.assertEqual(prediction_scores.size(), (self.batch_size, self.decoder_seq_length, self.vocab_size))
- self.parent.assertEqual(loss.size(), ())
+ self.parent.assertEqual(outputs["logits"].size(), (self.batch_size, self.decoder_seq_length, self.vocab_size))
+ self.parent.assertEqual(outputs["loss"].size(), ())
def create_and_check_t5_decoder_model_past(
self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
@@ -179,7 +183,7 @@ def create_and_check_t5_decoder_model_past(
self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf))
self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1)
- output, past_key_value_states = outputs
+ output, past_key_value_states = outputs.to_tuple()
# create hypothetical next token and extent to next_input_ids
next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
@@ -187,8 +191,8 @@ def create_and_check_t5_decoder_model_past(
# append to next input_ids and
next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
- output_from_no_past = model(next_input_ids)[0]
- output_from_past = model(next_tokens, past_key_value_states=past_key_value_states)[0]
+ output_from_no_past = model(next_input_ids)["last_hidden_state"]
+ output_from_past = model(next_tokens, past_key_value_states=past_key_value_states)["last_hidden_state"]
# select random slice
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
@@ -212,7 +216,7 @@ def create_and_check_t5_decoder_model_attention_mask_past(
attn_mask[:, half_seq_length:] = 0
# first forward pass
- output, past_key_value_states = model(input_ids, attention_mask=attn_mask, use_cache=True)
+ output, past_key_value_states = model(input_ids, attention_mask=attn_mask, use_cache=True).to_tuple()
# create hypothetical next token and extent to next_input_ids
next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
@@ -229,8 +233,10 @@ def create_and_check_t5_decoder_model_attention_mask_past(
)
# get two different outputs
- output_from_no_past = model(next_input_ids, attention_mask=attn_mask)[0]
- output_from_past = model(next_tokens, past_key_value_states=past_key_value_states, attention_mask=attn_mask)[0]
+ output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"]
+ output_from_past = model(next_tokens, past_key_value_states=past_key_value_states, attention_mask=attn_mask)[
+ "last_hidden_state"
+ ]
# select random slice
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
@@ -256,7 +262,7 @@ def create_and_check_t5_model_fp16_forward(
self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
):
model = T5Model(config=config).to(torch_device).half().eval()
- output = model(input_ids, decoder_input_ids=input_ids, attention_mask=attention_mask)[0]
+ output = model(input_ids, decoder_input_ids=input_ids, attention_mask=attention_mask)["last_hidden_state"]
self.parent.assertFalse(torch.isnan(output).any().item())
def prepare_config_and_inputs_for_common(self):
diff --git a/tests/test_modeling_transfo_xl.py b/tests/test_modeling_transfo_xl.py
index 73a8036f2475..1d67c8403193 100644
--- a/tests/test_modeling_transfo_xl.py
+++ b/tests/test_modeling_transfo_xl.py
@@ -75,6 +75,7 @@ def prepare_config_and_inputs(self):
div_val=self.div_val,
n_layer=self.num_hidden_layers,
eos_token_id=self.eos_token_id,
+ return_dict=True,
)
return (config, input_ids_1, input_ids_2, lm_labels)
@@ -88,13 +89,13 @@ def create_transfo_xl_model(self, config, input_ids_1, input_ids_2, lm_labels):
model.to(torch_device)
model.eval()
- hidden_states_1, mems_1 = model(input_ids_1)
- hidden_states_2, mems_2 = model(input_ids_2, mems_1)
+ outputs1 = model(input_ids_1)
+ outputs2 = model(input_ids_2, outputs1["mems"])
outputs = {
- "hidden_states_1": hidden_states_1,
- "mems_1": mems_1,
- "hidden_states_2": hidden_states_2,
- "mems_2": mems_2,
+ "hidden_states_1": outputs1["last_hidden_state"],
+ "mems_1": outputs1["mems"],
+ "hidden_states_2": outputs2["last_hidden_state"],
+ "mems_2": outputs2["mems"],
}
return outputs
@@ -119,17 +120,17 @@ def create_transfo_xl_lm_head(self, config, input_ids_1, input_ids_2, lm_labels)
model.to(torch_device)
model.eval()
- lm_logits_1, mems_1 = model(input_ids_1)
- loss_1, _, mems_1 = model(input_ids_1, labels=lm_labels)
- lm_logits_2, mems_2 = model(input_ids_2, mems=mems_1)
- loss_2, _, mems_2 = model(input_ids_2, labels=lm_labels, mems=mems_1)
+ lm_logits_1 = model(input_ids_1)["prediction_scores"]
+ outputs1 = model(input_ids_1, labels=lm_labels)
+ lm_logits_2 = model(input_ids_2, mems=outputs1["mems"])["prediction_scores"]
+ outputs2 = model(input_ids_2, labels=lm_labels, mems=outputs1["mems"])
outputs = {
- "loss_1": loss_1,
- "mems_1": mems_1,
+ "loss_1": outputs1["losses"],
+ "mems_1": outputs1["mems"],
"lm_logits_1": lm_logits_1,
- "loss_2": loss_2,
- "mems_2": mems_2,
+ "loss_2": outputs2["losses"],
+ "mems_2": outputs2["mems"],
"lm_logits_2": lm_logits_2,
}
return outputs
diff --git a/tests/test_modeling_xlm.py b/tests/test_modeling_xlm.py
index efa9346cee51..30e98d8dd1da 100644
--- a/tests/test_modeling_xlm.py
+++ b/tests/test_modeling_xlm.py
@@ -113,6 +113,7 @@ def prepare_config_and_inputs(self):
use_proj=self.use_proj,
num_labels=self.num_labels,
bos_token_id=self.bos_token_id,
+ return_dict=True,
)
return (
@@ -145,15 +146,11 @@ def create_and_check_xlm_model(
model = XLMModel(config=config)
model.to(torch_device)
model.eval()
- outputs = model(input_ids, lengths=input_lengths, langs=token_type_ids)
- outputs = model(input_ids, langs=token_type_ids)
- outputs = model(input_ids)
- sequence_output = outputs[0]
- result = {
- "sequence_output": sequence_output,
- }
+ result = model(input_ids, lengths=input_lengths, langs=token_type_ids)
+ result = model(input_ids, langs=token_type_ids)
+ result = model(input_ids)
self.parent.assertListEqual(
- list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
+ list(result["last_hidden_state"].size()), [self.batch_size, self.seq_length, self.hidden_size]
)
def create_and_check_xlm_lm_head(
@@ -172,13 +169,7 @@ def create_and_check_xlm_lm_head(
model.to(torch_device)
model.eval()
- loss, logits = model(input_ids, token_type_ids=token_type_ids, labels=token_labels)
-
- result = {
- "loss": loss,
- "logits": logits,
- }
-
+ result = model(input_ids, token_type_ids=token_type_ids, labels=token_labels)
self.parent.assertListEqual(list(result["loss"].size()), [])
self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size])
@@ -201,13 +192,7 @@ def create_and_check_xlm_simple_qa(
outputs = model(input_ids)
outputs = model(input_ids, start_positions=sequence_labels, end_positions=sequence_labels)
- loss, start_logits, end_logits = outputs
-
- result = {
- "loss": loss,
- "start_logits": start_logits,
- "end_logits": end_logits,
- }
+ result = outputs
self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
self.check_loss_output(result)
@@ -228,10 +213,9 @@ def create_and_check_xlm_qa(
model.to(torch_device)
model.eval()
- outputs = model(input_ids)
- start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits = outputs
+ result = model(input_ids)
- outputs = model(
+ result_with_labels = model(
input_ids,
start_positions=sequence_labels,
end_positions=sequence_labels,
@@ -240,7 +224,7 @@ def create_and_check_xlm_qa(
p_mask=input_mask,
)
- outputs = model(
+ result_with_labels = model(
input_ids,
start_positions=sequence_labels,
end_positions=sequence_labels,
@@ -248,22 +232,13 @@ def create_and_check_xlm_qa(
is_impossible=is_impossible_labels,
)
- (total_loss,) = outputs
+ (total_loss,) = result_with_labels.to_tuple()
- outputs = model(input_ids, start_positions=sequence_labels, end_positions=sequence_labels)
-
- (total_loss,) = outputs
+ result_with_labels = model(input_ids, start_positions=sequence_labels, end_positions=sequence_labels)
- result = {
- "loss": total_loss,
- "start_top_log_probs": start_top_log_probs,
- "start_top_index": start_top_index,
- "end_top_log_probs": end_top_log_probs,
- "end_top_index": end_top_index,
- "cls_logits": cls_logits,
- }
+ (total_loss,) = result_with_labels.to_tuple()
- self.parent.assertListEqual(list(result["loss"].size()), [])
+ self.parent.assertListEqual(list(result_with_labels["loss"].size()), [])
self.parent.assertListEqual(
list(result["start_top_log_probs"].size()), [self.batch_size, model.config.start_n_top]
)
@@ -295,14 +270,8 @@ def create_and_check_xlm_sequence_classif(
model.to(torch_device)
model.eval()
- (logits,) = model(input_ids)
- loss, logits = model(input_ids, labels=sequence_labels)
-
- result = {
- "loss": loss,
- "logits": logits,
- }
-
+ result = model(input_ids)
+ result = model(input_ids, labels=sequence_labels)
self.parent.assertListEqual(list(result["loss"].size()), [])
self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.type_sequence_label_size])
@@ -323,11 +292,7 @@ def create_and_check_xlm_token_classif(
model.to(torch_device)
model.eval()
- loss, logits = model(input_ids, attention_mask=input_mask, labels=token_labels)
- result = {
- "loss": loss,
- "logits": logits,
- }
+ result = model(input_ids, attention_mask=input_mask, labels=token_labels)
self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels])
self.check_loss_output(result)
@@ -350,16 +315,12 @@ def create_and_check_xlm_for_multiple_choice(
multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
- loss, logits = model(
+ result = model(
multiple_choice_inputs_ids,
attention_mask=multiple_choice_input_mask,
token_type_ids=multiple_choice_token_type_ids,
labels=choice_labels,
)
- result = {
- "loss": loss,
- "logits": logits,
- }
self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_choices])
self.check_loss_output(result)
diff --git a/tests/test_modeling_xlm_roberta.py b/tests/test_modeling_xlm_roberta.py
index f4098118cb8f..7e109856d0ae 100644
--- a/tests/test_modeling_xlm_roberta.py
+++ b/tests/test_modeling_xlm_roberta.py
@@ -28,7 +28,7 @@
class XLMRobertaModelIntegrationTest(unittest.TestCase):
@slow
def test_xlm_roberta_base(self):
- model = XLMRobertaModel.from_pretrained("xlm-roberta-base")
+ model = XLMRobertaModel.from_pretrained("xlm-roberta-base", return_dict=True)
input_ids = torch.tensor([[0, 581, 10269, 83, 99942, 136, 60742, 23, 70, 80583, 18276, 2]])
# The dog is cute and lives in the garden house
@@ -40,14 +40,14 @@ def test_xlm_roberta_base(self):
# xlmr.eval()
# expected_output_values_last_dim = xlmr.extract_features(input_ids[0])[:, :, -1]
- output = model(input_ids)[0].detach()
+ output = model(input_ids)["last_hidden_state"].detach()
self.assertEqual(output.shape, expected_output_shape)
# compare the actual values for a slice of last dim
self.assertTrue(torch.allclose(output[:, :, -1], expected_output_values_last_dim, atol=1e-3))
@slow
def test_xlm_roberta_large(self):
- model = XLMRobertaModel.from_pretrained("xlm-roberta-large")
+ model = XLMRobertaModel.from_pretrained("xlm-roberta-large", return_dict=True)
input_ids = torch.tensor([[0, 581, 10269, 83, 99942, 136, 60742, 23, 70, 80583, 18276, 2]])
# The dog is cute and lives in the garden house
@@ -59,7 +59,7 @@ def test_xlm_roberta_large(self):
# xlmr.eval()
# expected_output_values_last_dim = xlmr.extract_features(input_ids[0])[:, :, -1]
- output = model(input_ids)[0].detach()
+ output = model(input_ids)["last_hidden_state"].detach()
self.assertEqual(output.shape, expected_output_shape)
# compare the actual values for a slice of last dim
self.assertTrue(torch.allclose(output[:, :, -1], expected_output_values_last_dim, atol=1e-3))
diff --git a/tests/test_modeling_xlnet.py b/tests/test_modeling_xlnet.py
index bf0d3bf6ed77..e0d9479503c5 100644
--- a/tests/test_modeling_xlnet.py
+++ b/tests/test_modeling_xlnet.py
@@ -137,6 +137,7 @@ def prepare_config_and_inputs(self):
bos_token_id=self.bos_token_id,
pad_token_id=self.pad_token_id,
eos_token_id=self.eos_token_id,
+ return_dict=True,
)
return (
@@ -177,15 +178,10 @@ def create_and_check_xlnet_base_model(
model.to(torch_device)
model.eval()
- _, _ = model(input_ids_1, input_mask=input_mask)
- _, _ = model(input_ids_1, attention_mask=input_mask)
- _, _ = model(input_ids_1, token_type_ids=segment_ids)
- outputs, mems_1 = model(input_ids_1)
-
- result = {
- "mems_1": mems_1,
- "outputs": outputs,
- }
+ result = model(input_ids_1, input_mask=input_mask)
+ result = model(input_ids_1, attention_mask=input_mask)
+ result = model(input_ids_1, token_type_ids=segment_ids)
+ result = model(input_ids_1)
config.mem_len = 0
model = XLNetModel(config)
@@ -195,10 +191,10 @@ def create_and_check_xlnet_base_model(
self.parent.assertEqual(len(base_model_output), 2)
self.parent.assertListEqual(
- list(result["outputs"].size()), [self.batch_size, self.seq_length, self.hidden_size],
+ list(result["last_hidden_state"].size()), [self.batch_size, self.seq_length, self.hidden_size],
)
self.parent.assertListEqual(
- list(list(mem.size()) for mem in result["mems_1"]),
+ list(list(mem.size()) for mem in result["mems"]),
[[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
)
@@ -233,7 +229,7 @@ def create_and_check_xlnet_model_use_cache(
self.parent.assertTrue(len(outputs_cache) == len(outputs_conf))
self.parent.assertTrue(len(outputs_cache) == len(outputs_no_cache) + 1)
- output, mems = outputs_cache
+ output, mems = outputs_cache.to_tuple()
# create hypothetical next token and extent to next_input_ids
next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
@@ -253,8 +249,8 @@ def create_and_check_xlnet_model_use_cache(
single_mask = torch.ones(input_ids_1.shape[0], 1, 1, dtype=torch.float, device=torch_device)
# second forward pass
- output_from_no_past, _ = model(next_input_ids, perm_mask=causal_mask)
- output_from_past, _ = model(next_tokens, mems=mems, perm_mask=single_mask)
+ output_from_no_past = model(next_input_ids, perm_mask=causal_mask)["last_hidden_state"]
+ output_from_past = model(next_tokens, mems=mems, perm_mask=single_mask)["last_hidden_state"]
# select random slice
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
@@ -283,7 +279,7 @@ def create_and_check_xlnet_base_model_with_att_output(
model.to(torch_device)
model.eval()
- _, _, attentions = model(input_ids_1, target_mapping=target_mapping, output_attentions=True)
+ attentions = model(input_ids_1, target_mapping=target_mapping, output_attentions=True)["attentions"]
self.parent.assertEqual(len(attentions), config.n_layer)
self.parent.assertIsInstance(attentions[0], tuple)
@@ -309,36 +305,27 @@ def create_and_check_xlnet_lm_head(
model.to(torch_device)
model.eval()
- loss_1, all_logits_1, mems_1 = model(input_ids_1, token_type_ids=segment_ids, labels=lm_labels)
-
- loss_2, all_logits_2, mems_2 = model(input_ids_2, token_type_ids=segment_ids, labels=lm_labels, mems=mems_1)
+ result1 = model(input_ids_1, token_type_ids=segment_ids, labels=lm_labels)
- logits, _ = model(input_ids_q, perm_mask=perm_mask, target_mapping=target_mapping)
+ result2 = model(input_ids_2, token_type_ids=segment_ids, labels=lm_labels, mems=result1["mems"])
- result = {
- "loss_1": loss_1,
- "mems_1": mems_1,
- "all_logits_1": all_logits_1,
- "loss_2": loss_2,
- "mems_2": mems_2,
- "all_logits_2": all_logits_2,
- }
+ _ = model(input_ids_q, perm_mask=perm_mask, target_mapping=target_mapping)
- self.parent.assertListEqual(list(result["loss_1"].size()), [])
+ self.parent.assertListEqual(list(result1["loss"].size()), [])
self.parent.assertListEqual(
- list(result["all_logits_1"].size()), [self.batch_size, self.seq_length, self.vocab_size],
+ list(result1["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size],
)
self.parent.assertListEqual(
- list(list(mem.size()) for mem in result["mems_1"]),
+ list(list(mem.size()) for mem in result1["mems"]),
[[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
)
- self.parent.assertListEqual(list(result["loss_2"].size()), [])
+ self.parent.assertListEqual(list(result2["loss"].size()), [])
self.parent.assertListEqual(
- list(result["all_logits_2"].size()), [self.batch_size, self.seq_length, self.vocab_size],
+ list(result2["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size],
)
self.parent.assertListEqual(
- list(list(mem.size()) for mem in result["mems_2"]),
+ list(list(mem.size()) for mem in result2["mems"]),
[[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
)
@@ -361,10 +348,9 @@ def create_and_check_xlnet_qa(
model.to(torch_device)
model.eval()
- outputs = model(input_ids_1)
- (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits, mems,) = outputs
+ result = model(input_ids_1)
- outputs = model(
+ result_with_labels = model(
input_ids_1,
start_positions=sequence_labels,
end_positions=sequence_labels,
@@ -373,7 +359,7 @@ def create_and_check_xlnet_qa(
p_mask=input_mask,
)
- outputs = model(
+ result_with_labels = model(
input_ids_1,
start_positions=sequence_labels,
end_positions=sequence_labels,
@@ -381,23 +367,13 @@ def create_and_check_xlnet_qa(
is_impossible=is_impossible_labels,
)
- total_loss, mems = outputs
+ total_loss, mems = result_with_labels.to_tuple()
- outputs = model(input_ids_1, start_positions=sequence_labels, end_positions=sequence_labels,)
+ result_with_labels = model(input_ids_1, start_positions=sequence_labels, end_positions=sequence_labels,)
- total_loss, mems = outputs
+ total_loss, mems = result_with_labels.to_tuple()
- result = {
- "loss": total_loss,
- "start_top_log_probs": start_top_log_probs,
- "start_top_index": start_top_index,
- "end_top_log_probs": end_top_log_probs,
- "end_top_index": end_top_index,
- "cls_logits": cls_logits,
- "mems": mems,
- }
-
- self.parent.assertListEqual(list(result["loss"].size()), [])
+ self.parent.assertListEqual(list(result_with_labels["loss"].size()), [])
self.parent.assertListEqual(
list(result["start_top_log_probs"].size()), [self.batch_size, model.config.start_n_top],
)
@@ -436,21 +412,15 @@ def create_and_check_xlnet_token_classif(
model.to(torch_device)
model.eval()
- logits, mems_1 = model(input_ids_1)
- loss, logits, mems_1 = model(input_ids_1, labels=token_labels)
-
- result = {
- "loss": loss,
- "mems_1": mems_1,
- "logits": logits,
- }
+ result = model(input_ids_1)
+ result = model(input_ids_1, labels=token_labels)
self.parent.assertListEqual(list(result["loss"].size()), [])
self.parent.assertListEqual(
list(result["logits"].size()), [self.batch_size, self.seq_length, self.type_sequence_label_size],
)
self.parent.assertListEqual(
- list(list(mem.size()) for mem in result["mems_1"]),
+ list(list(mem.size()) for mem in result["mems"]),
[[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
)
@@ -473,21 +443,15 @@ def create_and_check_xlnet_sequence_classif(
model.to(torch_device)
model.eval()
- logits, mems_1 = model(input_ids_1)
- loss, logits, mems_1 = model(input_ids_1, labels=sequence_labels)
-
- result = {
- "loss": loss,
- "mems_1": mems_1,
- "logits": logits,
- }
+ result = model(input_ids_1)
+ result = model(input_ids_1, labels=sequence_labels)
self.parent.assertListEqual(list(result["loss"].size()), [])
self.parent.assertListEqual(
list(result["logits"].size()), [self.batch_size, self.type_sequence_label_size],
)
self.parent.assertListEqual(
- list(list(mem.size()) for mem in result["mems_1"]),
+ list(list(mem.size()) for mem in result["mems"]),
[[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
)
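
The test refactors in the patch above all migrate to dictionary-style model outputs. As a minimal, hedged sketch (assuming a transformers version with `return_dict` support), the access pattern these updated tests rely on looks roughly like this; the checkpoint name and input ids are only examples:

```python
import torch
from transformers import XLMRobertaModel

# minimal sketch of the return_dict access pattern used by the updated tests above
model = XLMRobertaModel.from_pretrained("xlm-roberta-base", return_dict=True)
input_ids = torch.tensor([[0, 581, 10269, 83, 2]])

outputs = model(input_ids)
last_hidden = outputs["last_hidden_state"]  # dict-style access by output name
as_tuple = outputs.to_tuple()               # tuple-style access, matching the legacy unpacking
```
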
From 8edfaaa81b9995cedea2f8805e4c18c2b6cb5bfc Mon Sep 17 00:00:00 2001
From: Joe Davison
Date: Fri, 31 Jul 2020 10:56:32 -0400
Subject: [PATCH 076/127] bart-large-mnli-yahoo-answers model card (#6133)
* Add bart-large-mnli-yahoo-answers model card
* Add examples
* Add widget example
* Rm bart tag
Co-authored-by: Julien Chaumond
Co-authored-by: Julien Chaumond
---
.../bart-large-mnli-yahoo-answers/README.md | 72 +++++++++++++++++++
1 file changed, 72 insertions(+)
create mode 100644 model_cards/joeddav/bart-large-mnli-yahoo-answers/README.md
diff --git a/model_cards/joeddav/bart-large-mnli-yahoo-answers/README.md b/model_cards/joeddav/bart-large-mnli-yahoo-answers/README.md
new file mode 100644
index 000000000000..4918cc7cb1ff
--- /dev/null
+++ b/model_cards/joeddav/bart-large-mnli-yahoo-answers/README.md
@@ -0,0 +1,72 @@
+---
+language: en
+tags:
+- text-classification
+- pytorch
+datasets:
+- yahoo-answers
+widget:
+- text: "Who are you voting for in 2020? This text is about politics."
+---
+
+# bart-large-mnli-yahoo-answers
+
+## Model Description
+
+This model takes [facebook/bart-large-mnli](https://huggingface.co/facebook/bart-large-mnli) and fine-tunes it on Yahoo Answers topic classification. It can be used to predict whether a topic label can be assigned to a given sequence, regardless of whether that label was seen during fine-tuning.
+
+You can play with an interactive demo of this zero-shot technique with this model, as well as the non-finetuned [facebook/bart-large-mnli](https://huggingface.co/facebook/bart-large-mnli), [here](https://huggingface.co/zero-shot/).
+
+## Intended Usage
+
+This model was fine-tuned on topic classification and will perform best at zero-shot topic classification. Use `hypothesis_template="This text is about {}."` as this is the template used during fine-tuning.
+
+For settings other than topic classification, you can use any model pre-trained on MNLI such as [facebook/bart-large-mnli](https://huggingface.co/facebook/bart-large-mnli) or [roberta-large-mnli](https://huggingface.co/roberta-large-mnli) with the same code as written below.
+
+#### With the zero-shot classification pipeline
+
+The model can be used with the `zero-shot-classification` pipeline like so:
+
+```python
+from transformers import pipeline
+nlp = pipeline("zero-shot-classification", model="joeddav/bart-large-mnli-yahoo-answers")
+
+sequence_to_classify = "Who are you voting for in 2020?"
+candidate_labels = ["Europe", "public health", "politics", "elections"]
+hypothesis_template = "This text is about {}."
+nlp(sequence_to_classify, candidate_labels, multi_class=True, hypothesis_template=hypothesis_template)
+```
+
+#### With manual PyTorch
+
+```python
+# pose sequence as an NLI premise and label as a hypothesis
+import torch
+from transformers import BartForSequenceClassification, BartTokenizer
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+nli_model = BartForSequenceClassification.from_pretrained('joeddav/bart-large-mnli-yahoo-answers').to(device)
+tokenizer = BartTokenizer.from_pretrained('joeddav/bart-large-mnli-yahoo-answers')
+
+# example inputs
+sequence = "Who are you voting for in 2020?"
+label = "politics"
+premise = sequence
+hypothesis = f'This text is about {label}.'
+
+# run through model pre-trained on MNLI
+x = tokenizer.encode(premise, hypothesis, return_tensors='pt',
+ max_length=tokenizer.max_len,
+ truncation_strategy='only_first')
+logits = nli_model(x.to(device))[0]
+
+# we throw away "neutral" (dim 1) and take the probability of
+# "entailment" (2) as the probability of the label being true
+entail_contradiction_logits = logits[:,[0,2]]
+probs = entail_contradiction_logits.softmax(dim=1)
+prob_label_is_true = probs[:,1]
+```
+
+## Training
+
+The model is a pre-trained MNLI classifier further fine-tuned on Yahoo Answers topic classification in the manner originally described in [Yin et al. 2019](https://arxiv.org/abs/1909.00161) and [this blog post](https://joeddav.github.io/blog/2020/05/29/ZSL.html). That is, each sequence is fed to the pre-trained NLI model in place of the premise and each candidate label as the hypothesis, formatted like so: `This text is about {class name}.` For each example in the training set, a true and a randomly-selected false label hypothesis are fed to the model which must predict which labels are valid and which are false.
+
+Since this method studies the ability to classify unseen labels after being trained on a different set of labels, the model is only trained on 5 out of the 10 labels in Yahoo Answers. These are "Society & Culture", "Health", "Computers & Internet", "Business & Finance", and "Family & Relationships".
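+
+As a rough, illustrative sketch (not the actual fine-tuning code), a single training example could be turned into one entailment pair and one contradiction pair like this:
+
+```python
+import random
+
+# hypothetical helper, shown only to illustrate the pairing scheme described above
+seen_labels = ["Society & Culture", "Health", "Computers & Internet", "Business & Finance", "Family & Relationships"]
+
+def make_nli_pairs(text, true_label):
+    false_label = random.choice([l for l in seen_labels if l != true_label])
+    return [
+        (text, f"This text is about {true_label}.", "entailment"),
+        (text, f"This text is about {false_label}.", "contradiction"),
+    ]
+```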
+
+## Evaluation Results
+
+This model was evaluated with the label-weighted F1 of the _seen_ and _unseen_ labels. That is, for each example the model must predict from one of the 10 corpus labels. The F1 is reported for the labels seen during training as well as the labels unseen during training. We found an F1 score of `.68` and `.72` for the unseen and seen labels, respectively. In order to adjust for the in-vs-out of distribution labels, we subtract a fixed amount of 30% from the normalized probabilities of the _seen_ labels, as described in [Yin et al. 2019](https://arxiv.org/abs/1909.00161) and [our blog post](https://joeddav.github.io/blog/2020/05/29/ZSL.html).
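+
+As a rough sketch of that adjustment (illustrative only, with a hypothetical `probs` dict mapping each of the 10 labels to its normalized probability):
+
+```python
+SEEN = {"Society & Culture", "Health", "Computers & Internet", "Business & Finance", "Family & Relationships"}
+
+def predict_with_seen_penalty(probs, penalty=0.3):
+    # subtract a fixed amount from the seen labels before taking the argmax
+    adjusted = {label: p - penalty if label in SEEN else p for label, p in probs.items()}
+    return max(adjusted, key=adjusted.get)
+```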
From a39dfe4fb122c11be98a563fb8ca43b322e01036 Mon Sep 17 00:00:00 2001
From: Faiaz Rahman <42232624+faiazrahman@users.noreply.github.com>
Date: Sat, 1 Aug 2020 03:20:48 -0700
Subject: [PATCH 077/127] Fixed typo in Longformer (#6180)
---
docs/source/model_doc/longformer.rst | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docs/source/model_doc/longformer.rst b/docs/source/model_doc/longformer.rst
index badfb4c091ef..c2d44a60b07e 100644
--- a/docs/source/model_doc/longformer.rst
+++ b/docs/source/model_doc/longformer.rst
@@ -16,7 +16,7 @@ Longformer Self Attention
~~~~~~~~~~~~~~~~~~~~~~~~~~
Longformer self attention employs self attention on both a "local" context and a "global" context.
Most tokens only attend "locally" to each other meaning that each token attends to its :math:`\frac{1}{2} w` previous tokens and :math:`\frac{1}{2} w` succeeding tokens with :math:`w` being the window length as defined in `config.attention_window`. Note that `config.attention_window` can be of type ``list`` to define a different :math:`w` for each layer.
-A selecetd few tokens attend "globally" to all other tokens, as it is conventionally done for all tokens in *e.g.* `BertSelfAttention`.
+A selected few tokens attend "globally" to all other tokens, as it is conventionally done for all tokens in *e.g.* `BertSelfAttention`.
Note that "locally" and "globally" attending tokens are projected by different query, key and value matrices.
Also note that every "locally" attending token not only attends to tokens within its window :math:`w`, but also to all "globally" attending tokens so that global attention is *symmetric*.
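
As a small, hedged illustration of the per-layer window option mentioned above (the window sizes here are made up for the example):

```python
from transformers import LongformerConfig, LongformerModel

# a different local window w for each of the 12 layers, growing with depth
config = LongformerConfig(attention_window=[32, 32, 64, 64, 128, 128, 256, 256, 512, 512, 512, 512])
model = LongformerModel(config)
```
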
From d8dbf3b75d58667e2ecaf42b4aa076e83d034d26 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Sat, 1 Aug 2020 11:51:07 -0700
Subject: [PATCH 078/127] [s2s] clean up + doc (#6184)
Co-authored-by: Sam Shleifer
---
examples/seq2seq/README.md | 9 +++++++++
examples/seq2seq/finetune.sh | 5 +----
2 files changed, 10 insertions(+), 4 deletions(-)
diff --git a/examples/seq2seq/README.md b/examples/seq2seq/README.md
index 58ba4fce118f..9d12dc33481f 100644
--- a/examples/seq2seq/README.md
+++ b/examples/seq2seq/README.md
@@ -66,6 +66,15 @@ Summarization Tips:
Datasets: Seq2SeqDataset will be used for all models besides MBart, for which MBartDataset will be used.**
A new dataset is needed to support multilingual tasks.
+
+### Command Line Options
+
+To see all the possible command line options, run:
+
+```bash
+./finetune.sh --help # this calls python finetune.py --help
+```
+
### Finetuning Training Params
To override the pretrained model's training params, you can pass them to `./finetune.sh`:
diff --git a/examples/seq2seq/finetune.sh b/examples/seq2seq/finetune.sh
index fa7e9bf63cb1..54c89289a065 100755
--- a/examples/seq2seq/finetune.sh
+++ b/examples/seq2seq/finetune.sh
@@ -2,6 +2,7 @@
export PYTHONPATH="../":"${PYTHONPATH}"
# the proper usage is documented in the README, you need to specify data_dir, output_dir and model_name_or_path
+# run ./finetune.sh --help to see all the possible options
python finetune.py \
--learning_rate=3e-5 \
--fp16 \
@@ -10,8 +11,4 @@ python finetune.py \
--do_predict \
--n_val 1000 \
--val_check_interval 0.1 \
- --encoder_layerdrop 0.1 \
- --decoder_layerdrop 0.1 \
- --dropout 0.1 \
- --attention_dropout 0.1 \
$@
From 82a0e2b67ec94d28b20e24b3393644002bbd0d4b Mon Sep 17 00:00:00 2001
From: Philip May
Date: Sun, 2 Aug 2020 09:58:26 +0200
Subject: [PATCH 079/127] Fix docstring for BertTokenizerFast (#6185)
- remove duplicate doc-entry for tokenize_chinese_chars
- add doc for strip_accents and wordpieces_prefix
---
src/transformers/tokenization_bert.py | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/src/transformers/tokenization_bert.py b/src/transformers/tokenization_bert.py
index 5b5bd311e000..96c71d0d9fde 100644
--- a/src/transformers/tokenization_bert.py
+++ b/src/transformers/tokenization_bert.py
@@ -577,10 +577,6 @@ class BertTokenizerFast(PreTrainedTokenizerFast):
mask_token (:obj:`string`, `optional`, defaults to "[MASK]"):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
- tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
- Whether to tokenize Chinese characters.
- This should likely be deactivated for Japanese:
- see: https://github.com/huggingface/transformers/issues/328
clean_text (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to clean the text before tokenization by removing any control characters and
replacing all whitespaces by the classic one.
@@ -588,6 +584,11 @@ class BertTokenizerFast(PreTrainedTokenizerFast):
Whether to tokenize Chinese characters.
This should likely be deactivated for Japanese:
see: https://github.com/huggingface/transformers/issues/328
+ strip_accents: (:obj:`bool`, `optional`, defaults to :obj:`None`):
+ Whether to strip all accents. If this option is not specified (i.e. is :obj:`None`),
+ then it will be determined by the value for `lowercase` (as in the original BERT).
+ wordpieces_prefix: (:obj:`string`, `optional`, defaults to "##"):
+ The prefix for subwords.
"""
vocab_files_names = VOCAB_FILES_NAMES
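
A minimal sketch of passing the newly documented options to `BertTokenizerFast` (the checkpoint name and values are only examples, assuming the options are accepted as keyword arguments by `from_pretrained` as the docstring above suggests):

```python
from transformers import BertTokenizerFast

# leaving strip_accents unset (None) would follow the `lowercase` setting, per the docstring above
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased", strip_accents=False, wordpieces_prefix="##")
print(tokenizer.tokenize("Héllo wörld"))
```
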
From 16c2240164d8303dd0abd1483461432fd7b9c416 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20M=C3=BCller?=
Date: Mon, 3 Aug 2020 09:53:38 +0200
Subject: [PATCH 080/127] Add script to convert tf2.x checkpoint to PyTorch
(#5791)
* Add script to convert tf2.x checkpoint to pytorch
The script converts the newer TF2.x checkpoints (as published on their official GitHub: https://github.com/tensorflow/models/tree/master/official/nlp/bert) to PyTorch.
* rename file in order to stay consistent with naming convention
---
...bert_original_tf2_checkpoint_to_pytorch.py | 226 ++++++++++++++++++
1 file changed, 226 insertions(+)
create mode 100644 src/transformers/convert_bert_original_tf2_checkpoint_to_pytorch.py
diff --git a/src/transformers/convert_bert_original_tf2_checkpoint_to_pytorch.py b/src/transformers/convert_bert_original_tf2_checkpoint_to_pytorch.py
new file mode 100644
index 000000000000..0e37b24246ff
--- /dev/null
+++ b/src/transformers/convert_bert_original_tf2_checkpoint_to_pytorch.py
@@ -0,0 +1,226 @@
+"""
+This script can be used to convert a headless TF2.x BERT model to PyTorch,
+as published on the official GitHub: https://github.com/tensorflow/models/tree/master/official/nlp/bert
+
+TF2.x uses different variable names from the original BERT (TF 1.4) implementation.
+The script re-maps the TF2.x BERT weight names to the original names, so the model can be imported with Hugging Face Transformers.
+
+You may adapt this script to include classification/MLM/NSP/etc. heads.
+"""
+import argparse
+import logging
+import os
+import re
+
+import tensorflow as tf
+import torch
+
+from transformers import BertConfig, BertModel
+
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+def load_tf2_weights_in_bert(model, tf_checkpoint_path, config):
+ tf_path = os.path.abspath(tf_checkpoint_path)
+ logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
+ # Load weights from TF model
+ init_vars = tf.train.list_variables(tf_path)
+ names = []
+ arrays = []
+ layer_depth = []
+ for full_name, shape in init_vars:
+ # logger.info("Loading TF weight {} with shape {}".format(name, shape))
+ name = full_name.split("/")
+ if full_name == "_CHECKPOINTABLE_OBJECT_GRAPH" or name[0] in ["global_step", "save_counter"]:
+ logger.info(f"Skipping non-model layer {full_name}")
+ continue
+ if "optimizer" in full_name:
+ logger.info(f"Skipping optimization layer {full_name}")
+ continue
+ if name[0] == "model":
+ # ignore initial 'model'
+ name = name[1:]
+ # figure out how many levels deep the name is
+ depth = 0
+ for _name in name:
+ if _name.startswith("layer_with_weights"):
+ depth += 1
+ else:
+ break
+ layer_depth.append(depth)
+ # read data
+ array = tf.train.load_variable(tf_path, full_name)
+ names.append("/".join(name))
+ arrays.append(array)
+ logger.info(f"Read a total of {len(arrays):,} layers")
+
+ # Sanity check
+ if len(set(layer_depth)) != 1:
+ raise ValueError(f"Found layer names with different depths (layer depth {list(set(layer_depth))})")
+ layer_depth = list(set(layer_depth))[0]
+ if layer_depth != 1:
+ raise ValueError(
+ "The model contains more than just the embedding/encoder layers. This script does not handle MLM/NSP heads."
+ )
+
+ # convert layers
+ logger.info("Converting weights...")
+ for full_name, array in zip(names, arrays):
+ name = full_name.split("/")
+ pointer = model
+ trace = []
+ for i, m_name in enumerate(name):
+ if m_name == ".ATTRIBUTES":
+ # variable names end with .ATTRIBUTES/VARIABLE_VALUE
+ break
+ if m_name.startswith("layer_with_weights"):
+ layer_num = int(m_name.split("-")[-1])
+ if layer_num <= 2:
+ # embedding layers
+ # layer_num 0: word_embeddings
+ # layer_num 1: position_embeddings
+ # layer_num 2: token_type_embeddings
+ continue
+ elif layer_num == 3:
+ # embedding LayerNorm
+ trace.extend(["embeddings", "LayerNorm"])
+ pointer = getattr(pointer, "embeddings")
+ pointer = getattr(pointer, "LayerNorm")
+ elif layer_num > 3 and layer_num < config.num_hidden_layers + 4:
+ # encoder layers
+ trace.extend(["encoder", "layer", str(layer_num - 4)])
+ pointer = getattr(pointer, "encoder")
+ pointer = getattr(pointer, "layer")
+ pointer = pointer[layer_num - 4]
+ elif layer_num == config.num_hidden_layers + 4:
+ # pooler layer
+ trace.extend(["pooler", "dense"])
+ pointer = getattr(pointer, "pooler")
+ pointer = getattr(pointer, "dense")
+ elif m_name == "embeddings":
+ trace.append("embeddings")
+ pointer = getattr(pointer, "embeddings")
+ if layer_num == 0:
+ trace.append("word_embeddings")
+ pointer = getattr(pointer, "word_embeddings")
+ elif layer_num == 1:
+ trace.append("position_embeddings")
+ pointer = getattr(pointer, "position_embeddings")
+ elif layer_num == 2:
+ trace.append("token_type_embeddings")
+ pointer = getattr(pointer, "token_type_embeddings")
+ else:
+ raise ValueError(f"Unknown embedding layer with name {full_name}")
+ trace.append("weight")
+ pointer = getattr(pointer, "weight")
+ elif m_name == "_attention_layer":
+ # self-attention layer
+ trace.extend(["attention", "self"])
+ pointer = getattr(pointer, "attention")
+ pointer = getattr(pointer, "self")
+ elif m_name == "_attention_layer_norm":
+ # output attention norm
+ trace.extend(["attention", "output", "LayerNorm"])
+ pointer = getattr(pointer, "attention")
+ pointer = getattr(pointer, "output")
+ pointer = getattr(pointer, "LayerNorm")
+ elif m_name == "_attention_output_dense":
+ # output attention dense
+ trace.extend(["attention", "output", "dense"])
+ pointer = getattr(pointer, "attention")
+ pointer = getattr(pointer, "output")
+ pointer = getattr(pointer, "dense")
+ elif m_name == "_output_dense":
+ # output dense
+ trace.extend(["output", "dense"])
+ pointer = getattr(pointer, "output")
+ pointer = getattr(pointer, "dense")
+ elif m_name == "_output_layer_norm":
+ # output dense
+ trace.extend(["output", "LayerNorm"])
+ pointer = getattr(pointer, "output")
+ pointer = getattr(pointer, "LayerNorm")
+ elif m_name == "_key_dense":
+ # attention key
+ trace.append("key")
+ pointer = getattr(pointer, "key")
+ elif m_name == "_query_dense":
+ # attention query
+ trace.append("query")
+ pointer = getattr(pointer, "query")
+ elif m_name == "_value_dense":
+ # attention value
+ trace.append("value")
+ pointer = getattr(pointer, "value")
+ elif m_name == "_intermediate_dense":
+ # attention intermediate dense
+ trace.extend(["intermediate", "dense"])
+ pointer = getattr(pointer, "intermediate")
+ pointer = getattr(pointer, "dense")
+ elif m_name == "_output_layer_norm":
+ # output layer norm
+ trace.append("output")
+ pointer = getattr(pointer, "output")
+ # weights & biases
+ elif m_name in ["bias", "beta"]:
+ trace.append("bias")
+ pointer = getattr(pointer, "bias")
+ elif m_name in ["kernel", "gamma"]:
+ trace.append("weight")
+ pointer = getattr(pointer, "weight")
+ else:
+ logger.warning(f"Ignored {m_name}")
+ # for certain layers reshape is necessary
+ trace = ".".join(trace)
+ if re.match(r"(\S+)\.attention\.self\.(key|value|query)\.(bias|weight)", trace) or re.match(
+ r"(\S+)\.attention\.output\.dense\.weight", trace
+ ):
+ array = array.reshape(pointer.data.shape)
+ if "kernel" in full_name:
+ array = array.transpose()
+ if pointer.shape == array.shape:
+ pointer.data = torch.from_numpy(array)
+ else:
+ raise ValueError(
+ f"Shape mismatch in layer {full_name}: Model expects shape {pointer.shape} but layer contains shape: {array.shape}"
+ )
+ logger.info(f"Successfully set variable {full_name} to PyTorch layer {trace}")
+ return model
+
+
+def convert_tf2_checkpoint_to_pytorch(tf_checkpoint_path, config_path, pytorch_dump_path):
+ # Instantiate model
+ logger.info(f"Loading model based on config from {config_path}...")
+ config = BertConfig.from_json_file(config_path)
+ model = BertModel(config)
+
+ # Load weights from checkpoint
+ logger.info(f"Loading weights from checkpoint {tf_checkpoint_path}...")
+ load_tf2_weights_in_bert(model, tf_checkpoint_path, config)
+
+ # Save pytorch-model
+ logger.info(f"Saving PyTorch model to {pytorch_dump_path}...")
+ torch.save(model.state_dict(), pytorch_dump_path)
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--tf_checkpoint_path", type=str, required=True, help="Path to the TensorFlow 2.x checkpoint path."
+ )
+ parser.add_argument(
+ "--bert_config_file",
+ type=str,
+ required=True,
+ help="The config json file corresponding to the BERT model. This specifies the model architecture.",
+ )
+ parser.add_argument(
+ "--pytorch_dump_path",
+ type=str,
+ required=True,
+ help="Path to the output PyTorch model (must include filename).",
+ )
+ args = parser.parse_args()
+ convert_tf2_checkpoint_to_pytorch(args.tf_checkpoint_path, args.bert_config_file, args.pytorch_dump_path)
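
A hedged usage sketch for the new conversion script (the file paths are hypothetical placeholders; the real entry point is the CLI defined by the argparse block above):

```python
from transformers.convert_bert_original_tf2_checkpoint_to_pytorch import convert_tf2_checkpoint_to_pytorch

# convert a headless TF2.x BERT checkpoint into a PyTorch state dict
convert_tf2_checkpoint_to_pytorch(
    tf_checkpoint_path="tf2_bert/bert_model.ckpt",        # hypothetical path to the TF2.x checkpoint
    config_path="tf2_bert/bert_config.json",              # hypothetical path to the matching BERT config
    pytorch_dump_path="pytorch_bert/pytorch_model.bin",   # hypothetical output path
)
```
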
From 5a0dac53bfd6e69ae64fb3119d607445e1a308d8 Mon Sep 17 00:00:00 2001
From: Teven
Date: Mon, 3 Aug 2020 10:19:03 +0200
Subject: [PATCH 081/127] Empty assert hunt (#6056)
* Fixed empty asserts
* black-reformatted stragglers in templates
* More code quality checks
* Update src/transformers/convert_marian_to_pytorch.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
* Update src/transformers/convert_marian_to_pytorch.py
Co-authored-by: Sam Shleifer
* removed unused line as per @sshleifer
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Sam Shleifer
---
src/transformers/commands/train.py | 1 -
src/transformers/convert_marian_to_pytorch.py | 27 ++++++++++++-------
.../data/datasets/language_modeling.py | 4 +--
src/transformers/data/metrics/__init__.py | 8 ++++--
.../data/metrics/squad_metrics.py | 8 +++---
src/transformers/data/processors/utils.py | 8 ++++--
src/transformers/data/processors/xnli.py | 8 ++++--
src/transformers/modeling_albert.py | 4 ++-
src/transformers/modeling_bert.py | 4 ++-
src/transformers/modeling_electra.py | 4 ++-
src/transformers/modeling_gpt2.py | 4 ++-
src/transformers/modeling_mobilebert.py | 4 ++-
src/transformers/modeling_openai.py | 8 ++++--
src/transformers/modeling_t5.py | 4 ++-
src/transformers/modeling_tf_albert.py | 4 ++-
src/transformers/modeling_tf_distilbert.py | 12 +++++----
src/transformers/modeling_tf_flaubert.py | 12 ++++++---
src/transformers/modeling_tf_openai.py | 4 ++-
src/transformers/modeling_tf_xlnet.py | 3 +--
src/transformers/modeling_transfo_xl.py | 4 ++-
src/transformers/modeling_xlnet.py | 12 ++++++---
src/transformers/tokenization_transfo_xl.py | 4 +--
src/transformers/tokenization_utils_base.py | 5 ++--
src/transformers/trainer.py | 14 +++++++---
.../adding_a_new_example_script/utils_xxx.py | 20 +++++++++-----
templates/adding_a_new_model/modeling_xxx.py | 4 ++-
26 files changed, 131 insertions(+), 63 deletions(-)
diff --git a/src/transformers/commands/train.py b/src/transformers/commands/train.py
index 483da8748bc5..36d88318069f 100644
--- a/src/transformers/commands/train.py
+++ b/src/transformers/commands/train.py
@@ -81,7 +81,6 @@ def __init__(self, args: Namespace):
self.framework = "tf" if is_tf_available() else "torch"
os.makedirs(args.output, exist_ok=True)
- assert os.path.isdir(args.output)
self.output = args.output
self.column_label = args.column_label
diff --git a/src/transformers/convert_marian_to_pytorch.py b/src/transformers/convert_marian_to_pytorch.py
index bd58534ed3ea..9498d8c2ef94 100644
--- a/src/transformers/convert_marian_to_pytorch.py
+++ b/src/transformers/convert_marian_to_pytorch.py
@@ -166,7 +166,7 @@ def write_model_card(
extra_markdown = f"### {hf_model_name}\n\n* source languages: {s}\n* target languages: {t}\n* OPUS readme: [{opus_name}]({readme_url})\n"
# combine with opus markdown
opus_readme_path = Path(f"{repo_path}{opus_name}/README.md")
- assert opus_readme_path.exists(), opus_readme_path
+ assert opus_readme_path.exists(), f"Readme file {opus_readme_path} not found"
content = opus_readme_path.open().read()
content = content.split("\n# ")[-1] # Get the lowest level 1 header in the README -- the most recent model.
content = "*".join(content.split("*")[1:])
@@ -231,7 +231,9 @@ def fetch_test_set(test_set_url):
src = lmap(str.strip, lns[::4])
gold = lmap(str.strip, lns[1::4])
mar_model = lmap(str.strip, lns[2::4])
- assert len(gold) == len(mar_model) == len(src)
+ assert (
+ len(gold) == len(mar_model) == len(src)
+ ), f"Gold, marian and source lengths {len(gold)}, {len(mar_model)}, {len(src)} mismatched"
os.remove(fname)
return src, mar_model, gold
@@ -374,20 +376,21 @@ def __init__(self, source_dir):
self.state_dict = np.load(npz_path)
cfg = load_config_from_state_dict(self.state_dict)
assert cfg["dim-vocabs"][0] == cfg["dim-vocabs"][1]
- assert "Wpos" not in self.state_dict
+ assert "Wpos" not in self.state_dict, "Wpos key in state dictionary"
self.state_dict = dict(self.state_dict)
self.wemb, self.final_bias = add_emb_entries(self.state_dict["Wemb"], self.state_dict[BIAS_KEY], 1)
self.pad_token_id = self.wemb.shape[0] - 1
cfg["vocab_size"] = self.pad_token_id + 1
# self.state_dict['Wemb'].sha
self.state_keys = list(self.state_dict.keys())
- if "Wtype" in self.state_dict:
- raise ValueError("found Wtype key")
+ assert "Wtype" not in self.state_dict, "Wtype key in state dictionary"
self._check_layer_entries()
self.source_dir = source_dir
self.cfg = cfg
hidden_size, intermediate_shape = self.state_dict["encoder_l1_ffn_W1"].shape
- assert hidden_size == cfg["dim-emb"] == 512
+ assert (
+ hidden_size == cfg["dim-emb"] == 512
+ ), f"Hidden size {hidden_size} and configured size {cfg['dim_emb']} mismatched or not 512"
# Process decoder.yml
decoder_yml = cast_marian_config(load_yaml(source_dir / "decoder.yml"))
@@ -448,7 +451,7 @@ def sub_keys(self, layer_prefix):
def load_marian_model(self) -> MarianMTModel:
state_dict, cfg = self.state_dict, self.hf_config
- assert cfg.static_position_embeddings
+ assert cfg.static_position_embeddings, "config.static_position_embeddings should be True"
model = MarianMTModel(cfg)
assert "hidden_size" not in cfg.to_dict()
@@ -476,7 +479,9 @@ def load_marian_model(self) -> MarianMTModel:
raise NotImplementedError("Need to convert layernorm_embedding")
assert not self.extra_keys, f"Failed to convert {self.extra_keys}"
- assert model.model.shared.padding_idx == self.pad_token_id
+ assert (
+ model.model.shared.padding_idx == self.pad_token_id
+ ), f"Padding tokens {model.model.shared.padding_idx} and {self.pad_token_id} mismatched"
return model
@@ -500,7 +505,9 @@ def convert(source_dir: Path, dest_dir):
save_tokenizer(tokenizer, dest_dir)
opus_state = OpusState(source_dir)
- assert opus_state.cfg["vocab_size"] == len(tokenizer.encoder)
+ assert opus_state.cfg["vocab_size"] == len(
+ tokenizer.encoder
+ ), f"Original vocab size {opus_state.cfg['vocab_size']} and new vocab size {len(tokenizer.encoder)} mismatched"
# save_json(opus_state.cfg, dest_dir / "marian_original_config.json")
# ^^ Save human readable marian config for debugging
@@ -517,7 +524,7 @@ def convert(source_dir: Path, dest_dir):
args = parser.parse_args()
source_dir = Path(args.src)
- assert source_dir.exists()
+ assert source_dir.exists(), f"Source directory {source_dir} not found"
dest_dir = f"converted-{source_dir.name}" if args.dest is None else args.dest
convert(source_dir, dest_dir)
diff --git a/src/transformers/data/datasets/language_modeling.py b/src/transformers/data/datasets/language_modeling.py
index 94988a859b66..5a9aeb2225b5 100644
--- a/src/transformers/data/datasets/language_modeling.py
+++ b/src/transformers/data/datasets/language_modeling.py
@@ -22,7 +22,7 @@ class TextDataset(Dataset):
def __init__(
self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, overwrite_cache=False,
):
- assert os.path.isfile(file_path)
+ assert os.path.isfile(file_path), f"Input file path {file_path} not found"
block_size = block_size - tokenizer.num_special_tokens_to_add(pair=False)
@@ -82,7 +82,7 @@ class LineByLineTextDataset(Dataset):
"""
def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int):
- assert os.path.isfile(file_path)
+ assert os.path.isfile(file_path), f"Input file path {file_path} not found"
# Here, we do not cache the features, operating under the assumption
# that we will soon use fast multithreaded tokenizers from the
# `tokenizers` repo everywhere =)
diff --git a/src/transformers/data/metrics/__init__.py b/src/transformers/data/metrics/__init__.py
index 59ffdc8db1d9..3bb437602dc7 100644
--- a/src/transformers/data/metrics/__init__.py
+++ b/src/transformers/data/metrics/__init__.py
@@ -51,7 +51,9 @@ def pearson_and_spearman(preds, labels):
}
def glue_compute_metrics(task_name, preds, labels):
- assert len(preds) == len(labels)
+ assert len(preds) == len(
+ labels
+ ), f"Predictions and labels have mismatched lengths {len(preds)} and {len(labels)}"
if task_name == "cola":
return {"mcc": matthews_corrcoef(labels, preds)}
elif task_name == "sst-2":
@@ -78,7 +80,9 @@ def glue_compute_metrics(task_name, preds, labels):
raise KeyError(task_name)
def xnli_compute_metrics(task_name, preds, labels):
- assert len(preds) == len(labels)
+ assert len(preds) == len(
+ labels
+ ), f"Predictions and labels have mismatched lengths {len(preds)} and {len(labels)}"
if task_name == "xnli":
return {"acc": simple_accuracy(preds, labels)}
else:
diff --git a/src/transformers/data/metrics/squad_metrics.py b/src/transformers/data/metrics/squad_metrics.py
index c467fee71b35..d01c34bf08d1 100644
--- a/src/transformers/data/metrics/squad_metrics.py
+++ b/src/transformers/data/metrics/squad_metrics.py
@@ -523,7 +523,7 @@ def compute_predictions_logits(
if not nbest:
nbest.append(_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
- assert len(nbest) >= 1
+ assert len(nbest) >= 1, "No valid predictions"
total_scores = []
best_non_null_entry = None
@@ -544,7 +544,7 @@ def compute_predictions_logits(
output["end_logit"] = entry.end_logit
nbest_json.append(output)
- assert len(nbest_json) >= 1
+ assert len(nbest_json) >= 1, "No valid predictions"
if not version_2_with_negative:
all_predictions[example.qas_id] = nbest_json[0]["text"]
@@ -739,8 +739,8 @@ def compute_predictions_log_probs(
output["end_log_prob"] = entry.end_log_prob
nbest_json.append(output)
- assert len(nbest_json) >= 1
- assert best_non_null_entry is not None
+ assert len(nbest_json) >= 1, "No valid predictions"
+ assert best_non_null_entry is not None, "No valid predictions"
score_diff = score_null
scores_diff_json[example.qas_id] = score_diff
diff --git a/src/transformers/data/processors/utils.py b/src/transformers/data/processors/utils.py
index 4550e5756bbd..7df0471608b7 100644
--- a/src/transformers/data/processors/utils.py
+++ b/src/transformers/data/processors/utils.py
@@ -194,8 +194,12 @@ def add_examples_from_csv(
def add_examples(
self, texts_or_text_and_labels, labels=None, ids=None, overwrite_labels=False, overwrite_examples=False
):
- assert labels is None or len(texts_or_text_and_labels) == len(labels)
- assert ids is None or len(texts_or_text_and_labels) == len(ids)
+ assert labels is None or len(texts_or_text_and_labels) == len(
+ labels
+ ), f"Text and labels have mismatched lengths {len(texts_or_text_and_labels)} and {len(labels)}"
+ assert ids is None or len(texts_or_text_and_labels) == len(
+ ids
+ ), f"Text and ids have mismatched lengths {len(texts_or_text_and_labels)} and {len(ids)}"
if ids is None:
ids = [None] * len(texts_or_text_and_labels)
if labels is None:
diff --git a/src/transformers/data/processors/xnli.py b/src/transformers/data/processors/xnli.py
index 6a744c628014..8564634e0df7 100644
--- a/src/transformers/data/processors/xnli.py
+++ b/src/transformers/data/processors/xnli.py
@@ -45,7 +45,9 @@ def get_train_examples(self, data_dir):
text_a = line[0]
text_b = line[1]
label = "contradiction" if line[2] == "contradictory" else line[2]
- assert isinstance(text_a, str) and isinstance(text_b, str) and isinstance(label, str)
+ assert isinstance(text_a, str), f"Training input {text_a} is not a string"
+ assert isinstance(text_b, str), f"Training input {text_b} is not a string"
+ assert isinstance(label, str), f"Training label {label} is not a string"
examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
@@ -63,7 +65,9 @@ def get_test_examples(self, data_dir):
text_a = line[6]
text_b = line[7]
label = line[1]
- assert isinstance(text_a, str) and isinstance(text_b, str) and isinstance(label, str)
+ assert isinstance(text_a, str), f"Test input {text_a} is not a string"
+ assert isinstance(text_b, str), f"Test input {text_b} is not a string"
+ assert isinstance(label, str), f"Test label {label} is not a string"
examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
diff --git a/src/transformers/modeling_albert.py b/src/transformers/modeling_albert.py
index ef96228b5ba1..cbd94ce47f66 100644
--- a/src/transformers/modeling_albert.py
+++ b/src/transformers/modeling_albert.py
@@ -179,7 +179,9 @@ def load_tf_weights_in_albert(model, config, tf_checkpoint_path):
elif m_name == "kernel":
array = np.transpose(array)
try:
- assert pointer.shape == array.shape
+ assert (
+ pointer.shape == array.shape
+ ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
except AssertionError as e:
e.args += (pointer.shape, array.shape)
raise
diff --git a/src/transformers/modeling_bert.py b/src/transformers/modeling_bert.py
index 11dd8f8b36d0..74c5acafbe66 100644
--- a/src/transformers/modeling_bert.py
+++ b/src/transformers/modeling_bert.py
@@ -146,7 +146,9 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
elif m_name == "kernel":
array = np.transpose(array)
try:
- assert pointer.shape == array.shape
+ assert (
+ pointer.shape == array.shape
+ ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
except AssertionError as e:
e.args += (pointer.shape, array.shape)
raise
diff --git a/src/transformers/modeling_electra.py b/src/transformers/modeling_electra.py
index 1f2cb118c0f0..5e4e1286d28f 100644
--- a/src/transformers/modeling_electra.py
+++ b/src/transformers/modeling_electra.py
@@ -114,7 +114,9 @@ def load_tf_weights_in_electra(model, config, tf_checkpoint_path, discriminator_
elif m_name == "kernel":
array = np.transpose(array)
try:
- assert pointer.shape == array.shape, original_name
+ assert (
+ pointer.shape == array.shape
+ ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
except AssertionError as e:
e.args += (pointer.shape, array.shape)
raise
diff --git a/src/transformers/modeling_gpt2.py b/src/transformers/modeling_gpt2.py
index a2168726ccd2..3a8d104d8941 100644
--- a/src/transformers/modeling_gpt2.py
+++ b/src/transformers/modeling_gpt2.py
@@ -106,7 +106,9 @@ def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
num = int(scope_names[1])
pointer = pointer[num]
try:
- assert pointer.shape == array.shape
+ assert (
+ pointer.shape == array.shape
+ ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
except AssertionError as e:
e.args += (pointer.shape, array.shape)
raise
diff --git a/src/transformers/modeling_mobilebert.py b/src/transformers/modeling_mobilebert.py
index d3a4cd8e32ba..f0b01cfa618b 100644
--- a/src/transformers/modeling_mobilebert.py
+++ b/src/transformers/modeling_mobilebert.py
@@ -130,7 +130,9 @@ def load_tf_weights_in_mobilebert(model, config, tf_checkpoint_path):
elif m_name == "kernel":
array = np.transpose(array)
try:
- assert pointer.shape == array.shape
+ assert (
+ pointer.shape == array.shape
+ ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
except AssertionError as e:
e.args += (pointer.shape, array.shape)
raise
diff --git a/src/transformers/modeling_openai.py b/src/transformers/modeling_openai.py
index e3406bc291e1..071b86662f87 100644
--- a/src/transformers/modeling_openai.py
+++ b/src/transformers/modeling_openai.py
@@ -121,12 +121,16 @@ def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path):
num = int(scope_names[1])
pointer = pointer[num]
try:
- assert pointer.shape == array.shape
+ assert (
+ pointer.shape == array.shape
+ ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
except AssertionError as e:
e.args += (pointer.shape, array.shape)
raise
try:
- assert pointer.shape == array.shape
+ assert (
+ pointer.shape == array.shape
+ ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
except AssertionError as e:
e.args += (pointer.shape, array.shape)
raise
diff --git a/src/transformers/modeling_t5.py b/src/transformers/modeling_t5.py
index d7665ba2017d..03a0827e1d2a 100644
--- a/src/transformers/modeling_t5.py
+++ b/src/transformers/modeling_t5.py
@@ -131,7 +131,9 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
logger.info("Transposing numpy weight of shape {} for {}".format(array.shape, name))
array = np.transpose(array)
try:
- assert pointer.shape == array.shape
+ assert (
+ pointer.shape == array.shape
+ ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
except AssertionError as e:
e.args += (pointer.shape, array.shape)
raise
diff --git a/src/transformers/modeling_tf_albert.py b/src/transformers/modeling_tf_albert.py
index 0d9b699ddda2..6facc33c22d2 100644
--- a/src/transformers/modeling_tf_albert.py
+++ b/src/transformers/modeling_tf_albert.py
@@ -170,7 +170,9 @@ def __init__(self, config, **kwargs):
)
self.num_attention_heads = config.num_attention_heads
- assert config.hidden_size % config.num_attention_heads == 0
+ assert (
+ config.hidden_size % config.num_attention_heads == 0
+ ), f"Hidden size {config.hidden_size} not dividable by number of heads {config.num_attention_heads}"
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
diff --git a/src/transformers/modeling_tf_distilbert.py b/src/transformers/modeling_tf_distilbert.py
index a22328d743b0..577be0b6fc9c 100644
--- a/src/transformers/modeling_tf_distilbert.py
+++ b/src/transformers/modeling_tf_distilbert.py
@@ -195,7 +195,7 @@ def __init__(self, config, **kwargs):
self.dim = config.dim
self.dropout = tf.keras.layers.Dropout(config.attention_dropout)
- assert self.dim % self.n_heads == 0
+ assert self.dim % self.n_heads == 0, f"Hidden size {self.dim} not divisible by number of heads {self.n_heads}"
self.q_lin = tf.keras.layers.Dense(
config.dim, kernel_initializer=get_initializer(config.initializer_range), name="q_lin"
@@ -311,7 +311,9 @@ def __init__(self, config, **kwargs):
self.dropout = tf.keras.layers.Dropout(config.dropout)
self.activation = config.activation
- assert config.dim % config.n_heads == 0
+ assert (
+ config.dim % config.n_heads == 0
+ ), f"Hidden size {config.dim} not dividable by number of heads {config.n_heads}"
self.attention = TFMultiHeadSelfAttention(config, name="attention")
self.sa_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="sa_layer_norm")
@@ -395,11 +397,11 @@ def call(self, inputs, training=False):
hidden_state = layer_outputs[-1]
if cast_bool_to_primitive(output_attentions) is True:
- assert len(layer_outputs) == 2
+ assert len(layer_outputs) == 2, f"Incorrect number of outputs {len(layer_outputs)} instead of 2"
attentions = layer_outputs[0]
all_attentions = all_attentions + (attentions,)
else:
- assert len(layer_outputs) == 1
+ assert len(layer_outputs) == 1, f"Incorrect number of outputs {len(layer_outputs)} instead of 1"
# Add last layer
if cast_bool_to_primitive(output_hidden_states) is True:
@@ -1024,7 +1026,7 @@ def __init__(self, config, *inputs, **kwargs):
self.qa_outputs = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
)
- assert config.num_labels == 2
+ assert config.num_labels == 2, f"Incorrect number of labels {config.num_labels} instead of 2"
self.dropout = tf.keras.layers.Dropout(config.qa_dropout)
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
diff --git a/src/transformers/modeling_tf_flaubert.py b/src/transformers/modeling_tf_flaubert.py
index cf721be25ccd..9885004586a9 100644
--- a/src/transformers/modeling_tf_flaubert.py
+++ b/src/transformers/modeling_tf_flaubert.py
@@ -193,7 +193,9 @@ def call(
# check inputs
# assert shape_list(lengths)[0] == bs
- tf.debugging.assert_equal(shape_list(lengths)[0], bs)
+ tf.debugging.assert_equal(
+ shape_list(lengths)[0], bs, message=f"Expected batch size {shape_list(lengths)[0]} and received batch size {bs} mismatched"
+ )
# assert lengths.max().item() <= slen
# input_ids = input_ids.transpose(0, 1) # batch size as dimension 0
# assert (src_enc is None) == (src_len is None)
@@ -211,13 +213,17 @@ def call(
position_ids = tf.expand_dims(tf.range(slen), axis=0)
else:
# assert shape_list(position_ids) == [bs, slen] # (slen, bs)
- tf.debugging.assert_equal(shape_list(position_ids), [bs, slen])
+ tf.debugging.assert_equal(
+ shape_list(position_ids), [bs, slen], message=f"Position id shape {shape_list(position_ids)} and input shape {[bs, slen]} mismatched"
+ )
# position_ids = position_ids.transpose(0, 1)
# langs
if langs is not None:
# assert shape_list(langs) == [bs, slen] # (slen, bs)
- tf.debugging.assert_equal(shape_list(langs), [bs, slen])
+ tf.debugging.assert_equal(
+ shape_list(langs), [bs, slen], message=f"Lang shape {shape_list(langs)} and input shape {[bs, slen]} mismatched"
+ )
# langs = langs.transpose(0, 1)
# Prepare head mask if needed
diff --git a/src/transformers/modeling_tf_openai.py b/src/transformers/modeling_tf_openai.py
index ef6805abcc9a..7d7adb1407ea 100644
--- a/src/transformers/modeling_tf_openai.py
+++ b/src/transformers/modeling_tf_openai.py
@@ -77,7 +77,9 @@ def __init__(self, nx, n_ctx, config, scale=False, **kwargs):
n_state = nx # in Attention: n_state=768 (nx=n_embd)
# [switch nx => n_state from Block to Attention to keep identical to TF implem]
- assert n_state % config.n_head == 0
+ assert (
+ n_state % config.n_head == 0
+ ), f"Hidden dimension {n_state} not dividable by number of heads {config.n_head}"
self.n_ctx = n_ctx
self.n_head = config.n_head
self.split_size = n_state
diff --git a/src/transformers/modeling_tf_xlnet.py b/src/transformers/modeling_tf_xlnet.py
index 769a997c4947..1c8cba4e9801 100644
--- a/src/transformers/modeling_tf_xlnet.py
+++ b/src/transformers/modeling_tf_xlnet.py
@@ -493,8 +493,7 @@ def relative_positional_encoding(self, qlen, klen, bsz=None, dtype=None):
bwd_pos_seq = tf.clip_by_value(bwd_pos_seq, -self.clamp_len, self.clamp_len)
if bsz is not None:
- # With bi_data, the batch size should be divisible by 2.
- assert bsz % 2 == 0
+ assert bsz % 2 == 0, f"With bi_data, the batch size {bsz} should be divisible by 2"
fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz // 2)
bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq, bsz // 2)
else:
diff --git a/src/transformers/modeling_transfo_xl.py b/src/transformers/modeling_transfo_xl.py
index bdad2f406d93..f17855ce1d91 100644
--- a/src/transformers/modeling_transfo_xl.py
+++ b/src/transformers/modeling_transfo_xl.py
@@ -155,7 +155,9 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path):
p_i.data = torch.from_numpy(arr_i)
else:
try:
- assert pointer.shape == array.shape
+ assert (
+ pointer.shape == array.shape
+ ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
except AssertionError as e:
e.args += (pointer.shape, array.shape)
raise
diff --git a/src/transformers/modeling_xlnet.py b/src/transformers/modeling_xlnet.py
index e0892661ffbb..9746fb008f2a 100644
--- a/src/transformers/modeling_xlnet.py
+++ b/src/transformers/modeling_xlnet.py
@@ -169,11 +169,15 @@ def load_tf_weights_in_xlnet(model, config, tf_path):
array = np.transpose(array)
if isinstance(pointer, list):
# Here we will split the TF weights
- assert len(pointer) == array.shape[0]
+ assert (
+ len(pointer) == array.shape[0]
+ ), f"Pointer length {len(pointer)} and array length {array.shape[0]} mismatched"
for i, p_i in enumerate(pointer):
arr_i = array[i, ...]
try:
- assert p_i.shape == arr_i.shape
+ assert (
+ p_i.shape == arr_i.shape
+ ), f"Pointer shape {p_i.shape} and array shape {arr_i.shape} mismatched"
except AssertionError as e:
e.args += (p_i.shape, arr_i.shape)
raise
@@ -181,7 +185,9 @@ def load_tf_weights_in_xlnet(model, config, tf_path):
p_i.data = torch.from_numpy(arr_i)
else:
try:
- assert pointer.shape == array.shape
+ assert (
+ pointer.shape == array.shape
+ ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
except AssertionError as e:
e.args += (pointer.shape, array.shape)
raise
diff --git a/src/transformers/tokenization_transfo_xl.py b/src/transformers/tokenization_transfo_xl.py
index cd1635841a72..9a2827217765 100644
--- a/src/transformers/tokenization_transfo_xl.py
+++ b/src/transformers/tokenization_transfo_xl.py
@@ -147,7 +147,7 @@ def _compile_space_around_punctuation_pattern(self):
def count_file(self, path, verbose=False, add_eos=False):
if verbose:
logger.info("counting file {} ...".format(path))
- assert os.path.exists(path)
+ assert os.path.exists(path), f"Input file {path} not found"
sents = []
with open(path, "r", encoding="utf-8") as f:
@@ -233,7 +233,7 @@ def build_vocab(self):
def encode_file(self, path, ordered=False, verbose=False, add_eos=True, add_double_eos=False):
if verbose:
logger.info("encoding file {} ...".format(path))
- assert os.path.exists(path)
+ assert os.path.exists(path), f"Output file {path} not found"
encoded = []
with open(path, "r", encoding="utf-8") as f:
for idx, line in enumerate(f):
diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index d63c4bb5545a..f4cab85c71ff 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -683,7 +683,8 @@ def __init__(self, verbose=True, **kwargs):
for key, value in kwargs.items():
if key in self.SPECIAL_TOKENS_ATTRIBUTES:
if key == "additional_special_tokens":
- assert isinstance(value, (list, tuple)) and all(isinstance(t, str) for t in value)
+ assert isinstance(value, (list, tuple)), f"Value {value} is not a list or tuple"
+ assert all(isinstance(t, str) for t in value), "One of the tokens is not a string"
setattr(self, key, value)
elif isinstance(value, (str, AddedToken)):
setattr(self, key, value)
@@ -752,7 +753,7 @@ def add_special_tokens(self, special_tokens_dict: Dict[str, Union[str, AddedToke
added_tokens = 0
for key, value in special_tokens_dict.items():
- assert key in self.SPECIAL_TOKENS_ATTRIBUTES
+ assert key in self.SPECIAL_TOKENS_ATTRIBUTES, f"Key {key} is not a special token"
if self.verbose:
logger.info("Assigning %s to the %s key of the tokenizer", value, key)
diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index f449dd138b17..8a3209355ae9 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -124,11 +124,15 @@ def __iter__(self):
# add extra samples to make it evenly divisible
indices += indices[: (self.total_size - len(indices))]
- assert len(indices) == self.total_size
+ assert (
+ len(indices) == self.total_size
+ ), f"Indices length {len(indices)} and total size {self.total_size} mismatched"
# subsample
indices = indices[self.rank * self.num_samples : (self.rank + 1) * self.num_samples]
- assert len(indices) == self.num_samples
+ assert (
+ len(indices) == self.num_samples
+ ), f"Indices length {len(indices)} and and sample number {self.num_samples} mismatched"
return iter(indices)
@@ -566,9 +570,11 @@ def train(self, model_path: Optional[str] = None):
# In all cases (even distributed/parallel), self.model is always a reference
# to the model we want to save.
if hasattr(model, "module"):
- assert model.module is self.model
+ assert (
+ model.module is self.model
+ ), f"Module {model.module} should be a reference to self.model"
else:
- assert model is self.model
+ assert model is self.model, f"Model {model} should be a reference to self.model"
# Save model checkpoint
output_dir = os.path.join(self.args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{self.global_step}")
diff --git a/templates/adding_a_new_example_script/utils_xxx.py b/templates/adding_a_new_example_script/utils_xxx.py
index b8f8cdf2b962..bc2a219f9d7e 100644
--- a/templates/adding_a_new_example_script/utils_xxx.py
+++ b/templates/adding_a_new_example_script/utils_xxx.py
@@ -327,9 +327,15 @@ def convert_examples_to_features(
segment_ids.append(pad_token_segment_id)
p_mask.append(1)
- assert len(input_ids) == max_seq_length
- assert len(input_mask) == max_seq_length
- assert len(segment_ids) == max_seq_length
+ assert (
+ len(input_ids) == max_seq_length
+ ), f"Input ids and sequence have mismatched lengths {len(input_ids)} and {max_seq_length}"
+ assert (
+ len(input_mask) == max_seq_length
+ ), f"Input mask and sequence have mismatched lengths {len(input_mask)} and {max_seq_length}"
+ assert (
+ len(segment_ids) == max_seq_length
+ ), f"Segment ids and sequence have mismatched lengths {len(segment_ids)} and {max_seq_length}"
span_is_impossible = example.is_impossible
start_position = None
@@ -626,7 +632,7 @@ def write_predictions(
if not nbest:
nbest.append(_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
- assert len(nbest) >= 1
+ assert len(nbest) >= 1, "No valid predictions"
total_scores = []
best_non_null_entry = None
@@ -647,7 +653,7 @@ def write_predictions(
output["end_logit"] = entry.end_logit
nbest_json.append(output)
- assert len(nbest_json) >= 1
+ assert len(nbest_json) >= 1, "No valid predictions"
if not version_2_with_negative:
all_predictions[example.qas_id] = nbest_json[0]["text"]
@@ -843,8 +849,8 @@ def write_predictions_extended(
output["end_log_prob"] = entry.end_log_prob
nbest_json.append(output)
- assert len(nbest_json) >= 1
- assert best_non_null_entry is not None
+ assert len(nbest_json) >= 1, "No valid predictions"
+ assert best_non_null_entry is not None, "No valid predictions"
score_diff = score_null
scores_diff_json[example.qas_id] = score_diff
diff --git a/templates/adding_a_new_model/modeling_xxx.py b/templates/adding_a_new_model/modeling_xxx.py
index 73676ed249fe..fb54dd7b456b 100644
--- a/templates/adding_a_new_model/modeling_xxx.py
+++ b/templates/adding_a_new_model/modeling_xxx.py
@@ -121,7 +121,9 @@ def load_tf_weights_in_xxx(model, config, tf_checkpoint_path):
elif m_name == "kernel":
array = np.transpose(array)
try:
- assert pointer.shape == array.shape
+ assert (
+ pointer.shape == array.shape
+ ), f"Pointer and array have mismatched shapes {pointer.shape} and {array.shape}"
except AssertionError as e:
e.args += (pointer.shape, array.shape)
raise
From 9996f697e3ed7a0d6fe4348953723ad6b9d51477 Mon Sep 17 00:00:00 2001
From: Julien Plu
Date: Mon, 3 Aug 2020 14:10:40 +0200
Subject: [PATCH 082/127] Fix saved model creation (#5468)
* Fix TF Serving when output_hidden_states and output_attentions are True
* Add tests for saved model creation + bug fix for multiple choices models
* remove unused import
* Fix the input for several layers
* Fix test
* Fix conflict printing
* Apply style
* Fix XLM and Flaubert for TensorFlow
* Apply style
* Fix the TF version check
* Apply style
* Trigger CI
---
src/transformers/modeling_tf_albert.py | 104 ++++++------
src/transformers/modeling_tf_bert.py | 122 ++++++-------
src/transformers/modeling_tf_ctrl.py | 63 ++++---
src/transformers/modeling_tf_distilbert.py | 65 ++++---
src/transformers/modeling_tf_electra.py | 29 +++-
src/transformers/modeling_tf_flaubert.py | 16 +-
src/transformers/modeling_tf_gpt2.py | 57 +++----
src/transformers/modeling_tf_mobilebert.py | 131 ++++++++------
src/transformers/modeling_tf_openai.py | 46 ++---
src/transformers/modeling_tf_roberta.py | 17 +-
src/transformers/modeling_tf_t5.py | 13 +-
src/transformers/modeling_tf_transfo_xl.py | 37 ++--
.../modeling_tf_transfo_xl_utilities.py | 3 +-
src/transformers/modeling_tf_utils.py | 47 +++--
src/transformers/modeling_tf_xlm.py | 30 ++--
src/transformers/modeling_tf_xlnet.py | 160 ++++++++++++------
src/transformers/trainer_tf.py | 15 +-
tests/test_modeling_tf_common.py | 57 ++++++-
tests/test_modeling_tf_xlnet.py | 8 +-
19 files changed, 572 insertions(+), 448 deletions(-)
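The changes below are largely mechanical, but the failure they fix is easy to reproduce: exporting a TF model as a SavedModel broke when `output_hidden_states` / `output_attentions` were enabled, because those flags flowed through the graph as tensors. A hedged sketch of the kind of check the new saved-model tests perform (the tiny configuration values and the export path are illustrative, not taken from the patch):

```python
import tensorflow as tf
from transformers import BertConfig, TFBertModel

# Illustrative, deliberately tiny configuration with both output flags enabled.
config = BertConfig(
    num_hidden_layers=2,
    hidden_size=32,
    num_attention_heads=2,
    intermediate_size=64,
    output_hidden_states=True,
    output_attentions=True,
)
model = TFBertModel(config)

dummy_input_ids = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0]])
_ = model(dummy_input_ids)  # one forward pass so all variables are built

# Before this patch, exporting with the two output flags active failed.
tf.saved_model.save(model, "/tmp/tf_bert_saved_model")
```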
diff --git a/src/transformers/modeling_tf_albert.py b/src/transformers/modeling_tf_albert.py
index 6facc33c22d2..4159a6cb08c9 100644
--- a/src/transformers/modeling_tf_albert.py
+++ b/src/transformers/modeling_tf_albert.py
@@ -35,7 +35,6 @@
TFQuestionAnsweringLoss,
TFSequenceClassificationLoss,
TFTokenClassificationLoss,
- cast_bool_to_primitive,
get_initializer,
keras_serializable,
shape_list,
@@ -99,7 +98,15 @@ def build(self, input_shape):
)
super().build(input_shape)
- def call(self, inputs, mode="embedding", training=False):
+ def call(
+ self,
+ input_ids=None,
+ position_ids=None,
+ token_type_ids=None,
+ inputs_embeds=None,
+ mode="embedding",
+ training=False,
+ ):
"""Get token embeddings of inputs.
Args:
inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids)
@@ -115,15 +122,15 @@ def call(self, inputs, mode="embedding", training=False):
https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
"""
if mode == "embedding":
- return self._embedding(inputs, training=training)
+ return self._embedding(input_ids, position_ids, token_type_ids, inputs_embeds, training=training)
elif mode == "linear":
- return self._linear(inputs)
+ return self._linear(input_ids)
else:
raise ValueError("mode {} is not valid.".format(mode))
- def _embedding(self, inputs, training=False):
+ def _embedding(self, input_ids, position_ids, token_type_ids, inputs_embeds, training=False):
"""Applies embedding based on inputs tensor."""
- input_ids, position_ids, token_type_ids, inputs_embeds = inputs
+ assert not (input_ids is None and inputs_embeds is None)
if input_ids is not None:
input_shape = shape_list(input_ids)
@@ -175,6 +182,7 @@ def __init__(self, config, **kwargs):
), f"Hidden size {config.hidden_size} not dividable by number of heads {config.num_attention_heads}"
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
+ self.output_attentions = config.output_attentions
self.query = tf.keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
@@ -192,9 +200,7 @@ def transpose_for_scores(self, x, batch_size):
x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size))
return tf.transpose(x, perm=[0, 2, 1, 3])
- def call(self, inputs, training=False):
- hidden_states, attention_mask, head_mask, output_attentions = inputs
-
+ def call(self, hidden_states, attention_mask, head_mask, output_attentions, training=False):
batch_size = shape_list(hidden_states)[0]
mixed_query_layer = self.query(hidden_states)
mixed_key_layer = self.key(hidden_states)
@@ -233,9 +239,7 @@ def call(self, inputs, training=False):
context_layer, (batch_size, -1, self.all_head_size)
) # (batch_size, seq_len_q, all_head_size)
- outputs = (
- (context_layer, attention_probs) if cast_bool_to_primitive(output_attentions) is True else (context_layer,)
- )
+ outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
@@ -248,9 +252,7 @@ def __init__(self, config, **kwargs):
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
- def call(self, inputs, training=False):
- hidden_states, input_tensor = inputs
-
+ def call(self, hidden_states, input_tensor, training=False):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states, training=training)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
@@ -262,6 +264,7 @@ def __init__(self, config, **kwargs):
super().__init__(config, **kwargs)
self.hidden_size = config.hidden_size
+ self.output_attentions = config.output_attentions
self.dense = tf.keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
@@ -271,9 +274,7 @@ def __init__(self, config, **kwargs):
def prune_heads(self, heads):
raise NotImplementedError
- def call(self, inputs, training=False):
- input_tensor, attention_mask, head_mask, output_attentions = inputs
-
+ def call(self, input_tensor, attention_mask, head_mask, output_attentions, training=False):
batch_size = shape_list(input_tensor)[0]
mixed_query_layer = self.query(input_tensor)
mixed_key_layer = self.key(input_tensor)
@@ -312,9 +313,7 @@ def call(self, inputs, training=False):
context_layer, (batch_size, -1, self.all_head_size)
) # (batch_size, seq_len_q, all_head_size)
- self_outputs = (
- (context_layer, attention_probs) if cast_bool_to_primitive(output_attentions) is True else (context_layer,)
- )
+ self_outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
hidden_states = self_outputs[0]
@@ -349,11 +348,9 @@ def __init__(self, config, **kwargs):
)
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
- def call(self, inputs, training=False):
- hidden_states, attention_mask, head_mask, output_attentions = inputs
-
+ def call(self, hidden_states, attention_mask, head_mask, output_attentions, training=False):
attention_outputs = self.attention(
- [hidden_states, attention_mask, head_mask, output_attentions], training=training
+ hidden_states, attention_mask, head_mask, output_attentions, training=training
)
ffn_output = self.ffn(attention_outputs[0])
ffn_output = self.activation(ffn_output)
@@ -371,32 +368,32 @@ class TFAlbertLayerGroup(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
+ self.output_attentions = config.output_attentions
+ self.output_hidden_states = config.output_hidden_states
self.albert_layers = [
TFAlbertLayer(config, name="albert_layers_._{}".format(i)) for i in range(config.inner_group_num)
]
- def call(self, inputs, training=False):
- hidden_states, attention_mask, head_mask, output_attentions, output_hidden_states = inputs
-
+ def call(self, hidden_states, attention_mask, head_mask, output_attentions, output_hidden_states, training=False):
layer_hidden_states = ()
layer_attentions = ()
for layer_index, albert_layer in enumerate(self.albert_layers):
layer_output = albert_layer(
- [hidden_states, attention_mask, head_mask[layer_index], output_attentions], training=training
+ hidden_states, attention_mask, head_mask[layer_index], output_attentions, training=training
)
hidden_states = layer_output[0]
- if cast_bool_to_primitive(output_attentions) is True:
+ if output_attentions:
layer_attentions = layer_attentions + (layer_output[1],)
- if cast_bool_to_primitive(output_hidden_states) is True:
+ if output_hidden_states:
layer_hidden_states = layer_hidden_states + (hidden_states,)
outputs = (hidden_states,)
- if cast_bool_to_primitive(output_hidden_states) is True:
+ if output_hidden_states:
outputs = outputs + (layer_hidden_states,)
- if cast_bool_to_primitive(output_attentions) is True:
+ if output_attentions:
outputs = outputs + (layer_attentions,)
# last-layer hidden state, (layer hidden states), (layer attentions)
return outputs
@@ -417,13 +414,11 @@ def __init__(self, config, **kwargs):
for i in range(config.num_hidden_groups)
]
- def call(self, inputs, training=False):
- hidden_states, attention_mask, head_mask, output_attentions, output_hidden_states = inputs
-
+ def call(self, hidden_states, attention_mask, head_mask, output_attentions, output_hidden_states, training=False):
hidden_states = self.embedding_hidden_mapping_in(hidden_states)
all_attentions = ()
- if cast_bool_to_primitive(output_hidden_states) is True:
+ if output_hidden_states:
all_hidden_states = (hidden_states,)
for i in range(self.config.num_hidden_layers):
@@ -434,27 +429,25 @@ def call(self, inputs, training=False):
group_idx = int(i / (self.config.num_hidden_layers / self.config.num_hidden_groups))
layer_group_output = self.albert_layer_groups[group_idx](
- [
- hidden_states,
- attention_mask,
- head_mask[group_idx * layers_per_group : (group_idx + 1) * layers_per_group],
- output_attentions,
- output_hidden_states,
- ],
+ hidden_states,
+ attention_mask,
+ head_mask[group_idx * layers_per_group : (group_idx + 1) * layers_per_group],
+ output_attentions,
+ output_hidden_states,
training=training,
)
hidden_states = layer_group_output[0]
- if cast_bool_to_primitive(output_attentions) is True:
+ if output_attentions:
all_attentions = all_attentions + layer_group_output[-1]
- if cast_bool_to_primitive(output_hidden_states) is True:
+ if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
outputs = (hidden_states,)
- if cast_bool_to_primitive(output_hidden_states) is True:
+ if output_hidden_states:
outputs = outputs + (all_hidden_states,)
- if cast_bool_to_primitive(output_attentions) is True:
+ if output_attentions:
outputs = outputs + (all_attentions,)
# last-layer hidden state, (all hidden states), (all attentions)
@@ -619,9 +612,13 @@ def call(
head_mask = [None] * self.num_hidden_layers
# head_mask = tf.constant([0] * self.num_hidden_layers)
- embedding_output = self.embeddings([input_ids, position_ids, token_type_ids, inputs_embeds], training=training)
+ embedding_output = self.embeddings(input_ids, position_ids, token_type_ids, inputs_embeds, training=training)
encoder_outputs = self.encoder(
- [embedding_output, extended_attention_mask, head_mask, output_attentions, output_hidden_states],
+ embedding_output,
+ extended_attention_mask,
+ head_mask,
+ output_attentions,
+ output_hidden_states,
training=training,
)
@@ -1274,7 +1271,7 @@ def call(
flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
- flat_inputs = [
+ outputs = self.albert(
flat_input_ids,
flat_attention_mask,
flat_token_type_ids,
@@ -1283,9 +1280,8 @@ def call(
inputs_embeds,
output_attentions,
output_hidden_states,
- ]
-
- outputs = self.albert(flat_inputs, training=training)
+ training=training,
+ )
pooled_output = outputs[1]
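The same refactoring is repeated file by file below: sub-layers now receive explicit parameters instead of a single packed list, and `output_attentions` / `output_hidden_states` are plain Python booleans read from the config rather than tensors passed through `cast_bool_to_primitive`. A toy layer (hypothetical, not taken from the patch) showing the shape of the new convention:

```python
import tensorflow as tf

# Hypothetical mini-layer following the calling convention introduced here:
# arguments are explicit parameters, and output_attentions is an ordinary bool.
class ToyAttention(tf.keras.layers.Layer):
    def call(self, hidden_states, attention_mask=None, output_attentions=False, training=False):
        context_layer = hidden_states                   # stand-in for the real attention output
        attention_probs = tf.ones_like(hidden_states)   # stand-in for the attention weights
        return (context_layer, attention_probs) if output_attentions else (context_layer,)

layer = ToyAttention()
outputs = layer(tf.zeros((2, 4, 8)), output_attentions=True)  # -> (context, attentions)
```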
diff --git a/src/transformers/modeling_tf_bert.py b/src/transformers/modeling_tf_bert.py
index b829b3b8ae69..4c76120f5401 100644
--- a/src/transformers/modeling_tf_bert.py
+++ b/src/transformers/modeling_tf_bert.py
@@ -36,7 +36,6 @@
TFQuestionAnsweringLoss,
TFSequenceClassificationLoss,
TFTokenClassificationLoss,
- cast_bool_to_primitive,
get_initializer,
keras_serializable,
shape_list,
@@ -81,6 +80,7 @@ def gelu(x):
Also see https://arxiv.org/abs/1606.08415
"""
cdf = 0.5 * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0)))
+
return x * cdf
@@ -94,6 +94,7 @@ def gelu_new(x):
`x` with the GELU activation applied.
"""
cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
+
return x * cdf
@@ -118,7 +119,6 @@ def __init__(self, config, **kwargs):
self.vocab_size = config.vocab_size
self.hidden_size = config.hidden_size
self.initializer_range = config.initializer_range
-
self.position_embeddings = tf.keras.layers.Embedding(
config.max_position_embeddings,
config.hidden_size,
@@ -149,7 +149,15 @@ def build(self, input_shape):
)
super().build(input_shape)
- def call(self, inputs, mode="embedding", training=False):
+ def call(
+ self,
+ input_ids=None,
+ position_ids=None,
+ token_type_ids=None,
+ inputs_embeds=None,
+ mode="embedding",
+ training=False,
+ ):
"""Get token embeddings of inputs.
Args:
inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids)
@@ -165,15 +173,15 @@ def call(self, inputs, mode="embedding", training=False):
https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
"""
if mode == "embedding":
- return self._embedding(inputs, training=training)
+ return self._embedding(input_ids, position_ids, token_type_ids, inputs_embeds, training=training)
elif mode == "linear":
- return self._linear(inputs)
+ return self._linear(input_ids)
else:
raise ValueError("mode {} is not valid.".format(mode))
- def _embedding(self, inputs, training=False):
+ def _embedding(self, input_ids, position_ids, token_type_ids, inputs_embeds, training=False):
"""Applies embedding based on inputs tensor."""
- input_ids, position_ids, token_type_ids, inputs_embeds = inputs
+ assert not (input_ids is None and inputs_embeds is None)
if input_ids is not None:
input_shape = shape_list(input_ids)
@@ -181,19 +189,22 @@ def _embedding(self, inputs, training=False):
input_shape = shape_list(inputs_embeds)[:-1]
seq_length = input_shape[1]
+
if position_ids is None:
position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]
+
if token_type_ids is None:
token_type_ids = tf.fill(input_shape, 0)
if inputs_embeds is None:
inputs_embeds = tf.gather(self.word_embeddings, input_ids)
+
position_embeddings = self.position_embeddings(position_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
-
embeddings = inputs_embeds + position_embeddings + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings, training=training)
+
return embeddings
def _linear(self, inputs):
@@ -205,7 +216,6 @@ def _linear(self, inputs):
"""
batch_size = shape_list(inputs)[0]
length = shape_list(inputs)[1]
-
x = tf.reshape(inputs, [-1, self.hidden_size])
logits = tf.matmul(x, self.word_embeddings, transpose_b=True)
@@ -215,6 +225,7 @@ def _linear(self, inputs):
class TFBertSelfAttention(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
+
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
"The hidden size (%d) is not a multiple of the number of attention "
@@ -225,7 +236,6 @@ def __init__(self, config, **kwargs):
assert config.hidden_size % config.num_attention_heads == 0
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
-
self.query = tf.keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
)
@@ -235,21 +245,18 @@ def __init__(self, config, **kwargs):
self.value = tf.keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
)
-
self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob)
def transpose_for_scores(self, x, batch_size):
x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size))
- return tf.transpose(x, perm=[0, 2, 1, 3])
- def call(self, inputs, training=False):
- hidden_states, attention_mask, head_mask, output_attentions = inputs
+ return tf.transpose(x, perm=[0, 2, 1, 3])
+ def call(self, hidden_states, attention_mask, head_mask, output_attentions, training=False):
batch_size = shape_list(hidden_states)[0]
mixed_query_layer = self.query(hidden_states)
mixed_key_layer = self.key(hidden_states)
mixed_value_layer = self.value(hidden_states)
-
query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
key_layer = self.transpose_for_scores(mixed_key_layer, batch_size)
value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)
@@ -277,15 +284,11 @@ def call(self, inputs, training=False):
attention_probs = attention_probs * head_mask
context_layer = tf.matmul(attention_probs, value_layer)
-
context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
context_layer = tf.reshape(
context_layer, (batch_size, -1, self.all_head_size)
) # (batch_size, seq_len_q, all_head_size)
-
- outputs = (
- (context_layer, attention_probs) if cast_bool_to_primitive(output_attentions) is True else (context_layer,)
- )
+ outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
@@ -299,12 +302,11 @@ def __init__(self, config, **kwargs):
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
- def call(self, inputs, training=False):
- hidden_states, input_tensor = inputs
-
+ def call(self, hidden_states, input_tensor, training=False):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states, training=training)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
+
return hidden_states
@@ -317,14 +319,13 @@ def __init__(self, config, **kwargs):
def prune_heads(self, heads):
raise NotImplementedError
- def call(self, inputs, training=False):
- input_tensor, attention_mask, head_mask, output_attentions = inputs
-
+ def call(self, input_tensor, attention_mask, head_mask, output_attentions, training=False):
self_outputs = self.self_attention(
- [input_tensor, attention_mask, head_mask, output_attentions], training=training
+ input_tensor, attention_mask, head_mask, output_attentions, training=training
)
- attention_output = self.dense_output([self_outputs[0], input_tensor], training=training)
+ attention_output = self.dense_output(self_outputs[0], input_tensor, training=training)
outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
+
return outputs
@@ -334,6 +335,7 @@ def __init__(self, config, **kwargs):
self.dense = tf.keras.layers.Dense(
config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
+
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
@@ -342,6 +344,7 @@ def __init__(self, config, **kwargs):
def call(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
+
return hidden_states
@@ -354,12 +357,11 @@ def __init__(self, config, **kwargs):
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
- def call(self, inputs, training=False):
- hidden_states, input_tensor = inputs
-
+ def call(self, hidden_states, input_tensor, training=False):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states, training=training)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
+
return hidden_states
@@ -370,16 +372,15 @@ def __init__(self, config, **kwargs):
self.intermediate = TFBertIntermediate(config, name="intermediate")
self.bert_output = TFBertOutput(config, name="output")
- def call(self, inputs, training=False):
- hidden_states, attention_mask, head_mask, output_attentions = inputs
-
+ def call(self, hidden_states, attention_mask, head_mask, output_attentions, training=False):
attention_outputs = self.attention(
- [hidden_states, attention_mask, head_mask, output_attentions], training=training
+ hidden_states, attention_mask, head_mask, output_attentions, training=training
)
attention_output = attention_outputs[0]
intermediate_output = self.intermediate(attention_output)
- layer_output = self.bert_output([intermediate_output, attention_output], training=training)
+ layer_output = self.bert_output(intermediate_output, attention_output, training=training)
outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them
+
return outputs
@@ -388,32 +389,34 @@ def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.layer = [TFBertLayer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)]
- def call(self, inputs, training=False):
- hidden_states, attention_mask, head_mask, output_attentions, output_hidden_states = inputs
-
+ def call(self, hidden_states, attention_mask, head_mask, output_attentions, output_hidden_states, training=False):
all_hidden_states = ()
all_attentions = ()
+
for i, layer_module in enumerate(self.layer):
- if cast_bool_to_primitive(output_hidden_states) is True:
+ if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_outputs = layer_module(
- [hidden_states, attention_mask, head_mask[i], output_attentions], training=training
+ hidden_states, attention_mask, head_mask[i], output_attentions, training=training
)
hidden_states = layer_outputs[0]
- if cast_bool_to_primitive(output_attentions) is True:
+ if output_attentions:
all_attentions = all_attentions + (layer_outputs[1],)
# Add last layer
- if cast_bool_to_primitive(output_hidden_states) is True:
+ if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
outputs = (hidden_states,)
- if cast_bool_to_primitive(output_hidden_states) is True:
+
+ if output_hidden_states:
outputs = outputs + (all_hidden_states,)
- if cast_bool_to_primitive(output_attentions) is True:
+
+ if output_attentions:
outputs = outputs + (all_attentions,)
+
return outputs # outputs, (hidden states), (attentions)
@@ -432,6 +435,7 @@ def call(self, hidden_states):
# to the first token.
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
+
return pooled_output
@@ -441,16 +445,19 @@ def __init__(self, config, **kwargs):
self.dense = tf.keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
+
if isinstance(config.hidden_act, str):
self.transform_act_fn = ACT2FN[config.hidden_act]
else:
self.transform_act_fn = config.hidden_act
+
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
def call(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.transform_act_fn(hidden_states)
hidden_states = self.LayerNorm(hidden_states)
+
return hidden_states
@@ -472,6 +479,7 @@ def call(self, hidden_states):
hidden_states = self.transform(hidden_states)
hidden_states = self.input_embeddings(hidden_states, mode="linear")
hidden_states = hidden_states + self.bias
+
return hidden_states
@@ -482,6 +490,7 @@ def __init__(self, config, input_embeddings, **kwargs):
def call(self, sequence_output):
prediction_scores = self.predictions(sequence_output)
+
return prediction_scores
@@ -494,6 +503,7 @@ def __init__(self, config, **kwargs):
def call(self, pooled_output):
seq_relationship_score = self.seq_relationship(pooled_output)
+
return seq_relationship_score
@@ -507,7 +517,6 @@ def __init__(self, config, **kwargs):
self.initializer_range = config.initializer_range
self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
-
self.embeddings = TFBertEmbeddings(config, name="embeddings")
self.encoder = TFBertEncoder(config, name="encoder")
self.pooler = TFBertPooler(config, name="pooler")
@@ -605,18 +614,22 @@ def call(
head_mask = [None] * self.num_hidden_layers
# head_mask = tf.constant([0] * self.num_hidden_layers)
- embedding_output = self.embeddings([input_ids, position_ids, token_type_ids, inputs_embeds], training=training)
+ embedding_output = self.embeddings(input_ids, position_ids, token_type_ids, inputs_embeds, training=training)
encoder_outputs = self.encoder(
- [embedding_output, extended_attention_mask, head_mask, output_attentions, output_hidden_states],
+ embedding_output,
+ extended_attention_mask,
+ head_mask,
+ output_attentions,
+ output_hidden_states,
training=training,
)
sequence_output = encoder_outputs[0]
pooled_output = self.pooler(sequence_output)
-
outputs = (sequence_output, pooled_output,) + encoder_outputs[
1:
] # add hidden_states and attentions if they are here
+
return outputs # sequence_output, pooled_output, (hidden_states), (attentions)
@@ -1211,8 +1224,7 @@ def call(
if inputs_embeds is not None
else None
)
-
- flat_inputs = [
+ outputs = self.bert(
flat_input_ids,
flat_attention_mask,
flat_token_type_ids,
@@ -1221,16 +1233,12 @@ def call(
flat_inputs_embeds,
output_attentions,
output_hidden_states,
- ]
-
- outputs = self.bert(flat_inputs, training=training)
-
+ training=training,
+ )
pooled_output = outputs[1]
-
pooled_output = self.dropout(pooled_output, training=training)
logits = self.classifier(pooled_output)
reshaped_logits = tf.reshape(logits, (-1, num_choices))
-
outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here
if labels is not None:
diff --git a/src/transformers/modeling_tf_ctrl.py b/src/transformers/modeling_tf_ctrl.py
index 033e100d7233..caeec80f6d01 100644
--- a/src/transformers/modeling_tf_ctrl.py
+++ b/src/transformers/modeling_tf_ctrl.py
@@ -27,7 +27,6 @@
TFCausalLanguageModelingLoss,
TFPreTrainedModel,
TFSharedEmbeddings,
- cast_bool_to_primitive,
keras_serializable,
shape_list,
)
@@ -87,10 +86,11 @@ def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=N
class TFMultiHeadAttention(tf.keras.layers.Layer):
- def __init__(self, d_model_size, num_heads, **kwargs):
+ def __init__(self, d_model_size, num_heads, output_attentions=False, **kwargs):
super().__init__(**kwargs)
self.num_heads = num_heads
self.d_model_size = d_model_size
+ self.output_attentions = output_attentions
self.depth = int(d_model_size / self.num_heads)
@@ -104,8 +104,7 @@ def split_into_heads(self, x, batch_size):
x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
return tf.transpose(x, perm=[0, 2, 1, 3])
- def call(self, inputs, training=False):
- v, k, q, mask, layer_past, attention_mask, head_mask, use_cache, output_attentions = inputs
+ def call(self, v, k, q, mask, layer_past, attention_mask, head_mask, use_cache, output_attentions, training=False):
batch_size = shape_list(q)[0]
q = self.Wq(q)
@@ -121,10 +120,7 @@ def call(self, inputs, training=False):
k = tf.concat((past_key, k), axis=-2)
v = tf.concat((past_value, v), axis=-2)
- # to cope with keras serialization
- use_cache = cast_bool_to_primitive(use_cache, True)
-
- if use_cache is True:
+ if use_cache:
present = tf.stack((k, v), axis=0)
else:
present = (None,)
@@ -134,10 +130,11 @@ def call(self, inputs, training=False):
attn = output[1]
original_size_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model_size))
output = self.dense(original_size_attention)
-
outputs = (output, present)
- if cast_bool_to_primitive(output_attentions) is True:
+
+ if output_attentions:
outputs = outputs + (attn,)
+
return outputs
@@ -156,10 +153,16 @@ def call(self, inputs, trainable=False):
class TFEncoderLayer(tf.keras.layers.Layer):
- def __init__(self, d_model_size, num_heads, dff, rate=0.1, layer_norm_epsilon=1e-6, **kwargs):
+ def __init__(
+ self, d_model_size, num_heads, dff, rate=0.1, layer_norm_epsilon=1e-6, output_attentions=False, **kwargs
+ ):
super().__init__(**kwargs)
- self.multi_head_attention = TFMultiHeadAttention(d_model_size, num_heads, name="multi_head_attention")
+ self.output_attentions = output_attentions
+
+ self.multi_head_attention = TFMultiHeadAttention(
+ d_model_size, num_heads, output_attentions=self.output_attentions, name="multi_head_attention"
+ )
self.ffn = TFPointWiseFeedForwardLayer(d_model_size, dff, name="ffn")
self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layernorm1")
@@ -168,11 +171,18 @@ def __init__(self, d_model_size, num_heads, dff, rate=0.1, layer_norm_epsilon=1e
self.dropout1 = tf.keras.layers.Dropout(rate)
self.dropout2 = tf.keras.layers.Dropout(rate)
- def call(self, inputs, training=False):
- x, mask, layer_past, attention_mask, head_mask, use_cache, output_attentions = inputs
+ def call(self, x, mask, layer_past, attention_mask, head_mask, use_cache, output_attentions, training=False):
normed = self.layernorm1(x)
attn_outputs = self.multi_head_attention(
- [normed, normed, normed, mask, layer_past, attention_mask, head_mask, use_cache, output_attentions],
+ normed,
+ normed,
+ normed,
+ mask,
+ layer_past,
+ attention_mask,
+ head_mask,
+ use_cache,
+ output_attentions,
training=training,
)
attn_output = attn_outputs[0]
@@ -215,6 +225,7 @@ def __init__(self, config, **kwargs):
config.dff,
config.resid_pdrop,
config.layer_norm_epsilon,
+ self.output_attentions,
name="h_._{}".format(i),
)
for i in range(config.n_layer)
@@ -367,31 +378,37 @@ def call(
all_hidden_states = ()
all_attentions = []
for i, (h, layer_past) in enumerate(zip(self.h, past)):
- if cast_bool_to_primitive(output_hidden_states) is True:
+ if output_hidden_states:
all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),)
outputs = h(
- [hidden_states, mask, layer_past, attention_mask, head_mask[i], use_cache, output_attentions],
+ hidden_states,
+ mask,
+ layer_past,
+ attention_mask,
+ head_mask[i],
+ use_cache,
+ output_attentions,
training=training,
)
hidden_states, present = outputs[:2]
- if use_cache is True:
+ if use_cache:
presents = presents + (present,)
- if cast_bool_to_primitive(output_attentions) is True:
+ if output_attentions:
all_attentions.append(outputs[2])
hidden_states = self.layernorm(hidden_states)
hidden_states = tf.reshape(hidden_states, output_shape)
- if cast_bool_to_primitive(output_hidden_states) is True:
+ if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
outputs = (hidden_states,)
- if use_cache is True:
+ if use_cache:
outputs = outputs + (presents,)
- if cast_bool_to_primitive(output_hidden_states) is True:
+ if output_hidden_states:
outputs = outputs + (all_hidden_states,)
- if cast_bool_to_primitive(output_attentions) is True:
+ if output_attentions:
# let the number of heads free (-1) so we can extract attention even after head pruning
attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:]
all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions)
diff --git a/src/transformers/modeling_tf_distilbert.py b/src/transformers/modeling_tf_distilbert.py
index 577be0b6fc9c..892417627f44 100644
--- a/src/transformers/modeling_tf_distilbert.py
+++ b/src/transformers/modeling_tf_distilbert.py
@@ -37,7 +37,6 @@
TFSequenceClassificationLoss,
TFSharedEmbeddings,
TFTokenClassificationLoss,
- cast_bool_to_primitive,
get_initializer,
keras_serializable,
shape_list,
@@ -114,7 +113,7 @@ def build(self, input_shape):
)
super().build(input_shape)
- def call(self, inputs, inputs_embeds=None, mode="embedding", training=False):
+ def call(self, input_ids=None, position_ids=None, inputs_embeds=None, mode="embedding", training=False):
"""Get token embeddings of inputs.
Args:
inputs: list of two int64 tensors with shape [batch_size, length]: (input_ids, position_ids)
@@ -130,13 +129,13 @@ def call(self, inputs, inputs_embeds=None, mode="embedding", training=False):
https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
"""
if mode == "embedding":
- return self._embedding(inputs, inputs_embeds=inputs_embeds, training=training)
+ return self._embedding(input_ids, position_ids, inputs_embeds, training=training)
elif mode == "linear":
- return self._linear(inputs)
+ return self._linear(input_ids)
else:
raise ValueError("mode {} is not valid.".format(mode))
- def _embedding(self, inputs, inputs_embeds=None, training=False):
+ def _embedding(self, input_ids, position_ids, inputs_embeds, training=False):
"""
Parameters
----------
@@ -148,11 +147,7 @@ def _embedding(self, inputs, inputs_embeds=None, training=False):
embeddings: tf.Tensor(bs, max_seq_length, dim)
The embedded tokens (plus position embeddings, no token_type embeddings)
"""
- if not isinstance(inputs, (tuple, list)):
- input_ids = inputs
- position_ids = None
- else:
- input_ids, position_ids = inputs
+ assert not (input_ids is None and inputs_embeds is None)
if input_ids is not None:
seq_length = shape_list(input_ids)[1]
@@ -194,6 +189,7 @@ def __init__(self, config, **kwargs):
self.n_heads = config.n_heads
self.dim = config.dim
self.dropout = tf.keras.layers.Dropout(config.attention_dropout)
+ self.output_attentions = config.output_attentions
assert self.dim % self.n_heads == 0, f"Hidden size {self.dim} not dividable by number of heads {self.n_heads}"
@@ -215,7 +211,7 @@ def __init__(self, config, **kwargs):
def prune_heads(self, heads):
raise NotImplementedError
- def call(self, inputs, training=False):
+ def call(self, query, key, value, mask, head_mask, output_attentions, training=False):
"""
Parameters
----------
@@ -231,7 +227,6 @@ def call(self, inputs, training=False):
context: tf.Tensor(bs, seq_length, dim)
Contextualized layer. Optional: only if `output_attentions=True`
"""
- query, key, value, mask, head_mask, output_attentions = inputs
bs, q_length, dim = shape_list(query)
k_length = shape_list(key)[1]
# assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim)
@@ -270,7 +265,7 @@ def unshape(x):
context = unshape(context) # (bs, q_length, dim)
context = self.out_lin(context) # (bs, q_length, dim)
- if cast_bool_to_primitive(output_attentions) is True:
+ if output_attentions:
return (context, weights)
else:
return (context,)
@@ -310,6 +305,7 @@ def __init__(self, config, **kwargs):
self.hidden_dim = config.hidden_dim
self.dropout = tf.keras.layers.Dropout(config.dropout)
self.activation = config.activation
+ self.output_attentions = config.output_attentions
assert (
config.dim % config.n_heads == 0
@@ -321,7 +317,7 @@ def __init__(self, config, **kwargs):
self.ffn = TFFFN(config, name="ffn")
self.output_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="output_layer_norm")
- def call(self, inputs, training=False): # removed: src_enc=None, src_len=None
+ def call(self, x, attn_mask, head_mask, output_attentions, training=False): # removed: src_enc=None, src_len=None
"""
Parameters
----------
@@ -335,11 +331,9 @@ def call(self, inputs, training=False): # removed: src_enc=None, src_len=None
ffn_output: tf.Tensor(bs, seq_length, dim)
The output of the transformer block contextualization.
"""
- x, attn_mask, head_mask, output_attentions = inputs
-
# Self-Attention
- sa_output = self.attention([x, x, x, attn_mask, head_mask, output_attentions], training=training)
- if cast_bool_to_primitive(output_attentions) is True:
+ sa_output = self.attention(x, x, x, attn_mask, head_mask, output_attentions, training=training)
+ if output_attentions:
sa_output, sa_weights = sa_output # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length)
else: # To handle these `output_attention` or `output_hidden_states` cases returning tuples
# assert type(sa_output) == tuple
@@ -351,7 +345,7 @@ def call(self, inputs, training=False): # removed: src_enc=None, src_len=None
ffn_output = self.output_layer_norm(ffn_output + sa_output) # (bs, seq_length, dim)
output = (ffn_output,)
- if cast_bool_to_primitive(output_attentions) is True:
+ if output_attentions:
output = (sa_weights,) + output
return output
@@ -360,10 +354,12 @@ class TFTransformer(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.n_layers = config.n_layers
+ self.output_hidden_states = config.output_hidden_states
+ self.output_attentions = config.output_attentions
self.layer = [TFTransformerBlock(config, name="layer_._{}".format(i)) for i in range(config.n_layers)]
- def call(self, inputs, training=False):
+ def call(self, x, attn_mask, head_mask, output_attentions, output_hidden_states, training=False):
"""
Parameters
----------
@@ -383,34 +379,32 @@ def call(self, inputs, training=False):
Tuple of length n_layers with the attention weights from each layer
Optional: only if output_attentions=True
"""
- x, attn_mask, head_mask, output_attentions, output_hidden_states = inputs
-
all_hidden_states = ()
all_attentions = ()
hidden_state = x
for i, layer_module in enumerate(self.layer):
- if cast_bool_to_primitive(output_hidden_states) is True:
+ if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_state,)
- layer_outputs = layer_module([hidden_state, attn_mask, head_mask[i], output_attentions], training=training)
+ layer_outputs = layer_module(hidden_state, attn_mask, head_mask[i], output_attentions, training=training)
hidden_state = layer_outputs[-1]
- if cast_bool_to_primitive(output_attentions) is True:
- assert len(layer_outputs) == 2, f"Incorrect number of outputs {len(layer_outputs)} instead of 2"
+ if output_attentions:
+ assert len(layer_outputs) == 2, f"Incorrect number of outputs {len(layer_outputs)} instead of 2"
attentions = layer_outputs[0]
all_attentions = all_attentions + (attentions,)
else:
assert len(layer_outputs) == 1, f"Incorrect number of outputs {len(layer_outputs)} instead of 1"
# Add last layer
- if cast_bool_to_primitive(output_hidden_states) is True:
+ if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_state,)
outputs = (hidden_state,)
- if cast_bool_to_primitive(output_hidden_states) is True:
+ if output_hidden_states:
outputs = outputs + (all_hidden_states,)
- if cast_bool_to_primitive(output_attentions) is True:
+ if output_attentions:
outputs = outputs + (all_attentions,)
return outputs # last-layer hidden state, (all hidden states), (all attentions)
@@ -481,6 +475,7 @@ def call(
if attention_mask is None:
attention_mask = tf.ones(input_shape) # (bs, seq_length)
+
attention_mask = tf.cast(attention_mask, dtype=tf.float32)
# Prepare head mask if needed
@@ -491,11 +486,12 @@ def call(
if head_mask is not None:
raise NotImplementedError
else:
+
head_mask = [None] * self.num_hidden_layers
embedding_output = self.embeddings(input_ids, inputs_embeds=inputs_embeds) # (bs, seq_length, dim)
tfmr_output = self.transformer(
- [embedding_output, attention_mask, head_mask, output_attentions, output_hidden_states], training=training
+ embedding_output, attention_mask, head_mask, output_attentions, output_hidden_states, training=training
)
return tfmr_output # last-layer hidden-state, (all hidden_states), (all attentions)
@@ -986,24 +982,21 @@ def call(
if inputs_embeds is not None
else None
)
-
- flat_inputs = [
+ distilbert_output = self.distilbert(
flat_input_ids,
flat_attention_mask,
head_mask,
flat_inputs_embeds,
output_attentions,
output_hidden_states,
- ]
-
- distilbert_output = self.distilbert(flat_inputs, training=training)
+ training=training,
+ )
hidden_state = distilbert_output[0] # (bs, seq_len, dim)
pooled_output = hidden_state[:, 0] # (bs, dim)
pooled_output = self.pre_classifier(pooled_output) # (bs, dim)
pooled_output = self.dropout(pooled_output, training=training) # (bs, dim)
logits = self.classifier(pooled_output)
reshaped_logits = tf.reshape(logits, (-1, num_choices))
-
outputs = (reshaped_logits,) + distilbert_output[1:] # add hidden states and attention if they are here
if labels is not None:
diff --git a/src/transformers/modeling_tf_electra.py b/src/transformers/modeling_tf_electra.py
index 3d04e22a2a95..48f598adaa32 100644
--- a/src/transformers/modeling_tf_electra.py
+++ b/src/transformers/modeling_tf_electra.py
@@ -2,7 +2,8 @@
import tensorflow as tf
-from .configuration_electra import ElectraConfig
+from transformers import ElectraConfig
+
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_bert import ACT2FN, TFBertEncoder, TFBertPreTrainedModel
from .modeling_tf_utils import (
@@ -71,7 +72,15 @@ def build(self, input_shape):
)
super().build(input_shape)
- def call(self, inputs, mode="embedding", training=False):
+ def call(
+ self,
+ input_ids=None,
+ position_ids=None,
+ token_type_ids=None,
+ inputs_embeds=None,
+ mode="embedding",
+ training=False,
+ ):
"""Get token embeddings of inputs.
Args:
inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids)
@@ -87,15 +96,15 @@ def call(self, inputs, mode="embedding", training=False):
https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
"""
if mode == "embedding":
- return self._embedding(inputs, training=training)
+ return self._embedding(input_ids, position_ids, token_type_ids, inputs_embeds, training=training)
elif mode == "linear":
- return self._linear(inputs)
+ return self._linear(input_ids)
else:
raise ValueError("mode {} is not valid.".format(mode))
- def _embedding(self, inputs, training=False):
+ def _embedding(self, input_ids, position_ids, token_type_ids, inputs_embeds, training=False):
"""Applies embedding based on inputs tensor."""
- input_ids, position_ids, token_type_ids, inputs_embeds = inputs
+ assert not (input_ids is None and inputs_embeds is None)
if input_ids is not None:
input_shape = shape_list(input_ids)
@@ -289,13 +298,17 @@ def call(
extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)
head_mask = self.get_head_mask(head_mask)
- hidden_states = self.embeddings([input_ids, position_ids, token_type_ids, inputs_embeds], training=training)
+ hidden_states = self.embeddings(input_ids, position_ids, token_type_ids, inputs_embeds, training=training)
if hasattr(self, "embeddings_project"):
hidden_states = self.embeddings_project(hidden_states, training=training)
hidden_states = self.encoder(
- [hidden_states, extended_attention_mask, head_mask, output_attentions, output_hidden_states],
+ hidden_states,
+ extended_attention_mask,
+ head_mask,
+ output_attentions,
+ output_hidden_states,
training=training,
)
diff --git a/src/transformers/modeling_tf_flaubert.py b/src/transformers/modeling_tf_flaubert.py
index 9885004586a9..77aa4f249f5b 100644
--- a/src/transformers/modeling_tf_flaubert.py
+++ b/src/transformers/modeling_tf_flaubert.py
@@ -22,7 +22,7 @@
from .configuration_flaubert import FlaubertConfig
from .file_utils import add_start_docstrings
-from .modeling_tf_utils import cast_bool_to_primitive, keras_serializable, shape_list
+from .modeling_tf_utils import keras_serializable, shape_list
from .modeling_tf_xlm import (
TFXLMForMultipleChoice,
TFXLMForQuestionAnsweringSimple,
@@ -274,10 +274,10 @@ def call(
# self attention
if not self.pre_norm:
attn_outputs = self.attentions[i](
- [tensor, attn_mask, None, cache, head_mask[i], output_attentions], training=training
+ tensor, attn_mask, None, cache, head_mask[i], output_attentions, training=training
)
attn = attn_outputs[0]
- if cast_bool_to_primitive(output_attentions, self.output_attentions) is True:
+ if output_attentions:
attentions = attentions + (attn_outputs[1],)
attn = self.dropout(attn, training=training)
tensor = tensor + attn
@@ -285,10 +285,10 @@ def call(
else:
tensor_normalized = self.layer_norm1[i](tensor)
attn_outputs = self.attentions[i](
- [tensor_normalized, attn_mask, None, cache, head_mask[i]], training=training
+ tensor_normalized, attn_mask, None, cache, head_mask[i], training=training
)
attn = attn_outputs[0]
- if cast_bool_to_primitive(output_attentions, self.output_attentions) is True:
+ if output_attentions:
attentions = attentions + (attn_outputs[1],)
attn = self.dropout(attn, training=training)
tensor = tensor + attn
@@ -311,7 +311,7 @@ def call(
tensor = tensor * mask[..., tf.newaxis]
# Add last hidden state
- if cast_bool_to_primitive(output_hidden_states, self.output_hidden_states) is True:
+ if output_hidden_states:
hidden_states = hidden_states + (tensor,)
# update cache length
@@ -322,9 +322,9 @@ def call(
# tensor = tensor.transpose(0, 1)
outputs = (tensor,)
- if cast_bool_to_primitive(output_hidden_states, self.output_hidden_states) is True:
+ if output_hidden_states:
outputs = outputs + (hidden_states,)
- if cast_bool_to_primitive(output_attentions, self.output_attentions) is True:
+ if output_attentions:
outputs = outputs + (attentions,)
return outputs # outputs, (hidden_states), (attentions)
diff --git a/src/transformers/modeling_tf_gpt2.py b/src/transformers/modeling_tf_gpt2.py
index de7dc4c3577f..8adaafb35ebf 100644
--- a/src/transformers/modeling_tf_gpt2.py
+++ b/src/transformers/modeling_tf_gpt2.py
@@ -29,7 +29,6 @@
TFPreTrainedModel,
TFSequenceSummary,
TFSharedEmbeddings,
- cast_bool_to_primitive,
get_initializer,
keras_serializable,
shape_list,
@@ -75,6 +74,7 @@ def __init__(self, nx, n_ctx, config, scale=False, **kwargs):
self.n_head = config.n_head
self.split_size = n_state
self.scale = scale
+ self.output_attentions = config.output_attentions
self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name="c_attn")
self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_proj")
@@ -95,8 +95,7 @@ def causal_attention_mask(nd, ns, dtype):
m = i >= j - ns + nd
return tf.cast(m, dtype)
- def _attn(self, inputs, training=False):
- q, k, v, attention_mask, head_mask, output_attentions = inputs
+ def _attn(self, q, k, v, attention_mask, head_mask, output_attentions, training=False):
# q, k, v have shape [batch, heads, sequence, features]
w = tf.matmul(q, k, transpose_b=True)
if self.scale:
@@ -121,7 +120,7 @@ def _attn(self, inputs, training=False):
w = w * head_mask
outputs = [tf.matmul(w, v)]
- if cast_bool_to_primitive(output_attentions) is True:
+ if output_attentions:
outputs.append(w)
return outputs
@@ -137,9 +136,7 @@ def split_heads(self, x):
x = tf.reshape(x, new_x_shape)
return tf.transpose(x, (0, 2, 1, 3)) # (batch, head, seq_length, head_features)
- def call(self, inputs, training=False):
- x, layer_past, attention_mask, head_mask, use_cache, output_attentions = inputs
-
+ def call(self, x, layer_past, attention_mask, head_mask, use_cache, output_attentions, training=False):
x = self.c_attn(x)
query, key, value = tf.split(x, 3, axis=2)
query = self.split_heads(query)
@@ -151,12 +148,12 @@ def call(self, inputs, training=False):
value = tf.concat([past_value, value], axis=-2)
# to cope with keras serialization
- if cast_bool_to_primitive(use_cache, True) is True:
+ if use_cache:
present = tf.stack([key, value], axis=0)
else:
present = (None,)
- attn_outputs = self._attn([query, key, value, attention_mask, head_mask, output_attentions], training=training)
+ attn_outputs = self._attn(query, key, value, attention_mask, head_mask, output_attentions, training=training)
a = attn_outputs[0]
a = self.merge_heads(a)
@@ -192,12 +189,10 @@ def __init__(self, n_ctx, config, scale=False, **kwargs):
self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_2")
self.mlp = TFMLP(4 * nx, config, name="mlp")
- def call(self, inputs, training=False):
- x, layer_past, attention_mask, head_mask, use_cache, output_attentions = inputs
-
+ def call(self, x, layer_past, attention_mask, head_mask, use_cache, output_attentions, training=False):
a = self.ln_1(x)
output_attn = self.attn(
- [a, layer_past, attention_mask, head_mask, use_cache, output_attentions], training=training
+ a, layer_past, attention_mask, head_mask, use_cache, output_attentions, training=training
)
a = output_attn[0] # output_attn: a, present, (attentions)
x = x + a
@@ -223,6 +218,8 @@ def __init__(self, config, *inputs, **kwargs):
self.num_hidden_layers = config.n_layer
self.vocab_size = config.vocab_size
self.n_embd = config.n_embd
+ self.output_hidden_states = config.output_hidden_states
+ self.output_attentions = config.output_attentions
self.wte = TFSharedEmbeddings(
config.vocab_size, config.hidden_size, initializer_range=config.initializer_range, name="wte"
@@ -362,34 +359,39 @@ def call(
all_attentions = []
all_hidden_states = ()
for i, (block, layer_past) in enumerate(zip(self.h, past)):
- if cast_bool_to_primitive(output_hidden_states) is True:
+ if output_hidden_states:
all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),)
outputs = block(
- [hidden_states, layer_past, attention_mask, head_mask[i], use_cache, output_attentions],
+ hidden_states,
+ layer_past,
+ attention_mask,
+ head_mask[i],
+ use_cache,
+ output_attentions,
training=training,
)
hidden_states, present = outputs[:2]
presents = presents + (present,)
- if cast_bool_to_primitive(output_attentions) is True:
+ if output_attentions:
all_attentions.append(outputs[2])
hidden_states = self.ln_f(hidden_states)
hidden_states = tf.reshape(hidden_states, output_shape)
# Add last hidden state
- if cast_bool_to_primitive(output_hidden_states) is True:
+ if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
outputs = (hidden_states,)
- if use_cache is True:
+ if use_cache:
outputs = outputs + (presents,)
- if cast_bool_to_primitive(output_hidden_states) is True:
+ if output_hidden_states:
outputs = outputs + (all_hidden_states,)
- if cast_bool_to_primitive(output_attentions) is True:
+ if output_attentions:
# let the number of heads free (-1) so we can extract attention even after head pruning
attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:]
all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions)
@@ -738,13 +740,11 @@ def call(
input_shapes = shape_list(inputs_embeds)[:-1]
seq_length = input_shapes[-1]
-
flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
-
- flat_inputs = [
+ transformer_outputs = self.transformer(
flat_input_ids,
past,
flat_attention_mask,
@@ -755,18 +755,13 @@ def call(
use_cache,
output_attentions,
output_hidden_states,
- ]
-
- transformer_outputs = self.transformer(flat_inputs, training=training)
+ training=training,
+ )
hidden_states = transformer_outputs[0]
-
hidden_states = tf.reshape(hidden_states, input_shapes + shape_list(hidden_states)[-1:])
-
lm_logits = self.transformer.wte(hidden_states, mode="linear")
- mc_logits = self.multiple_choice_head([hidden_states, mc_token_ids], training=training)
-
+ mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids, training=training)
mc_logits = tf.squeeze(mc_logits, axis=-1)
-
outputs = (lm_logits, mc_logits) + transformer_outputs[1:]
return outputs # lm logits, mc logits, presents, (all hidden_states), (attentions)
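The commit message also mentions a bug fix for the multiple-choice models: their `call` methods used to rebuild a `flat_inputs` list and pass it as a single argument, which no longer works once the main layers expect explicit parameters. A hedged usage sketch for the GPT-2 double-heads model (the tiny config values are chosen only to keep the example small; a real checkpoint would be loaded with `from_pretrained`):

```python
import tensorflow as tf
from transformers import GPT2Config, TFGPT2DoubleHeadsModel

# Illustrative tiny model; not a trained checkpoint.
config = GPT2Config(vocab_size=100, n_positions=32, n_ctx=32, n_embd=32, n_layer=2, n_head=2)
model = TFGPT2DoubleHeadsModel(config)

input_ids = tf.constant([[[1, 2, 3, 4], [1, 2, 3, 5]]])  # (batch, num_choices, seq_len)
mc_token_ids = tf.constant([[3, 3]])                      # classification token index per choice

lm_logits, mc_logits = model(input_ids, mc_token_ids=mc_token_ids)[:2]
print(lm_logits.shape, mc_logits.shape)  # expected: (1, 2, 4, 100) and (1, 2)
```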
diff --git a/src/transformers/modeling_tf_mobilebert.py b/src/transformers/modeling_tf_mobilebert.py
index 98fd9c080cde..5cee9e764be7 100644
--- a/src/transformers/modeling_tf_mobilebert.py
+++ b/src/transformers/modeling_tf_mobilebert.py
@@ -35,7 +35,6 @@
TFQuestionAnsweringLoss,
TFSequenceClassificationLoss,
TFTokenClassificationLoss,
- cast_bool_to_primitive,
get_initializer,
keras_serializable,
shape_list,
@@ -130,7 +129,15 @@ def build(self, input_shape):
)
super().build(input_shape)
- def call(self, inputs, mode="embedding", training=False):
+ def call(
+ self,
+ input_ids=None,
+ position_ids=None,
+ token_type_ids=None,
+ inputs_embeds=None,
+ mode="embedding",
+ training=False,
+ ):
"""Get token embeddings of inputs.
Args:
inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids)
@@ -146,15 +153,15 @@ def call(self, inputs, mode="embedding", training=False):
https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
"""
if mode == "embedding":
- return self._embedding(inputs, training=training)
+ return self._embedding(input_ids, position_ids, token_type_ids, inputs_embeds, training=training)
elif mode == "linear":
- return self._linear(inputs)
+ return self._linear(input_ids)
else:
raise ValueError("mode {} is not valid.".format(mode))
- def _embedding(self, inputs, training=False):
+ def _embedding(self, input_ids, position_ids, token_type_ids, inputs_embeds, training=False):
"""Applies embedding based on inputs tensor."""
- input_ids, position_ids, token_type_ids, inputs_embeds = inputs
+ assert not (input_ids is None and inputs_embeds is None)
if input_ids is not None:
input_shape = shape_list(input_ids)
@@ -196,6 +203,7 @@ def _embedding(self, inputs, training=False):
embeddings = inputs_embeds + position_embeddings + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings, training=training)
+
return embeddings
def _linear(self, inputs):
@@ -224,6 +232,7 @@ def __init__(self, config, **kwargs):
)
self.num_attention_heads = config.num_attention_heads
+ self.output_attentions = config.output_attentions
assert config.hidden_size % config.num_attention_heads == 0
self.attention_head_size = int(config.true_hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
@@ -244,14 +253,13 @@ def transpose_for_scores(self, x, batch_size):
x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size))
return tf.transpose(x, perm=[0, 2, 1, 3])
- def call(self, inputs, training=False):
- query_tensor, key_tensor, value_tensor, attention_mask, head_mask, output_attentions = inputs
-
+ def call(
+ self, query_tensor, key_tensor, value_tensor, attention_mask, head_mask, output_attentions, training=False
+ ):
batch_size = shape_list(attention_mask)[0]
mixed_query_layer = self.query(query_tensor)
mixed_key_layer = self.key(key_tensor)
mixed_value_layer = self.value(value_tensor)
-
query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
key_layer = self.transpose_for_scores(mixed_key_layer, batch_size)
value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)
@@ -285,9 +293,7 @@ def call(self, inputs, training=False):
context_layer, (batch_size, -1, self.all_head_size)
) # (batch_size, seq_len_q, all_head_size)
- outputs = (
- (context_layer, attention_probs) if cast_bool_to_primitive(output_attentions) is True else (context_layer,)
- )
+ outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
@@ -305,8 +311,7 @@ def __init__(self, config, **kwargs):
if not self.use_bottleneck:
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
- def call(self, inputs, training=False):
- hidden_states, residual_tensor = inputs
+ def call(self, hidden_states, residual_tensor, training=False):
hidden_states = self.dense(hidden_states)
if not self.use_bottleneck:
hidden_states = self.dropout(hidden_states, training=training)
@@ -323,13 +328,22 @@ def __init__(self, config, **kwargs):
def prune_heads(self, heads):
raise NotImplementedError
- def call(self, inputs, training=False):
- query_tensor, key_tensor, value_tensor, layer_input, attention_mask, head_mask, output_attentions = inputs
-
+ def call(
+ self,
+ query_tensor,
+ key_tensor,
+ value_tensor,
+ layer_input,
+ attention_mask,
+ head_mask,
+ output_attentions,
+ training=False,
+ ):
self_outputs = self.self(
- [query_tensor, key_tensor, value_tensor, attention_mask, head_mask, output_attentions], training=training
+ query_tensor, key_tensor, value_tensor, attention_mask, head_mask, output_attentions, training=training
)
- attention_output = self.mobilebert_output([self_outputs[0], layer_input], training=training)
+
+ attention_output = self.mobilebert_output(self_outputs[0], layer_input, training=training)
outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
return outputs
@@ -349,8 +363,7 @@ def __init__(self, config, **kwargs):
)
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
- def call(self, inputs, training=False):
- hidden_states, residual_tensor = inputs
+ def call(self, hidden_states, residual_tensor, training=False):
layer_outputs = self.dense(hidden_states)
layer_outputs = self.dropout(layer_outputs, training=training)
layer_outputs = self.LayerNorm(layer_outputs + residual_tensor)
@@ -372,16 +385,14 @@ def __init__(self, config, **kwargs):
else:
self.bottleneck = TFOutputBottleneck(config, name="bottleneck")
- def call(self, inputs, training=False):
- hidden_states, residual_tensor_1, residual_tensor_2 = inputs
-
+ def call(self, hidden_states, residual_tensor_1, residual_tensor_2, training=False):
hidden_states = self.dense(hidden_states)
if not self.use_bottleneck:
hidden_states = self.dropout(hidden_states, training=training)
hidden_states = self.LayerNorm(hidden_states + residual_tensor_1)
else:
hidden_states = self.LayerNorm(hidden_states + residual_tensor_1)
- hidden_states = self.bottleneck([hidden_states, residual_tensor_2])
+ hidden_states = self.bottleneck(hidden_states, residual_tensor_2)
return hidden_states
@@ -466,7 +477,6 @@ def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.use_bottleneck = config.use_bottleneck
self.num_feedforward_networks = config.num_feedforward_networks
-
self.attention = TFMobileBertAttention(config, name="attention")
self.intermediate = TFMobileBertIntermediate(config, name="intermediate")
self.mobilebert_output = TFMobileBertOutput(config, name="output")
@@ -478,16 +488,20 @@ def __init__(self, config, **kwargs):
TFFFNLayer(config, name="ffn.{}".format(i)) for i in range(config.num_feedforward_networks - 1)
]
- def call(self, inputs, training=False):
- hidden_states, attention_mask, head_mask, output_attentions = inputs
-
+ def call(self, hidden_states, attention_mask, head_mask, output_attentions, training=False):
if self.use_bottleneck:
query_tensor, key_tensor, value_tensor, layer_input = self.bottleneck(hidden_states)
else:
query_tensor, key_tensor, value_tensor, layer_input = [hidden_states] * 4
attention_outputs = self.attention(
- [query_tensor, key_tensor, value_tensor, layer_input, attention_mask, head_mask, output_attentions],
+ query_tensor,
+ key_tensor,
+ value_tensor,
+ layer_input,
+ attention_mask,
+ head_mask,
+ output_attentions,
training=training,
)
@@ -500,48 +514,57 @@ def call(self, inputs, training=False):
s += (attention_output,)
intermediate_output = self.intermediate(attention_output)
- layer_output = self.mobilebert_output(
- [intermediate_output, attention_output, hidden_states], training=training
- )
+ layer_output = self.mobilebert_output(intermediate_output, attention_output, hidden_states, training=training)
+
outputs = (
(layer_output,)
+ attention_outputs[1:]
- + (0, query_tensor, key_tensor, value_tensor, layer_input, attention_output, intermediate_output)
+ + (
+ tf.constant(0),
+ query_tensor,
+ key_tensor,
+ value_tensor,
+ layer_input,
+ attention_output,
+ intermediate_output,
+ )
+ s
) # add attentions if we output them
+
return outputs
class TFMobileBertEncoder(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
+ self.output_attentions = config.output_attentions
+ self.output_hidden_states = config.output_hidden_states
self.layer = [TFMobileBertLayer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)]
- def call(self, inputs, training=False):
- hidden_states, attention_mask, head_mask, output_attentions, output_hidden_states = inputs
-
+ def call(self, hidden_states, attention_mask, head_mask, output_attentions, output_hidden_states, training=False):
all_hidden_states = ()
all_attentions = ()
for i, layer_module in enumerate(self.layer):
- if cast_bool_to_primitive(output_hidden_states) is True:
+ if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_outputs = layer_module(
- [hidden_states, attention_mask, head_mask[i], output_attentions], training=training
+ hidden_states, attention_mask, head_mask[i], output_attentions, training=training
)
+
hidden_states = layer_outputs[0]
- if cast_bool_to_primitive(output_attentions) is True:
+ if output_attentions:
all_attentions = all_attentions + (layer_outputs[1],)
# Add last layer
- if cast_bool_to_primitive(output_hidden_states) is True:
+ if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
outputs = (hidden_states,)
- if cast_bool_to_primitive(output_hidden_states) is True:
+ if output_hidden_states:
outputs = outputs + (all_hidden_states,)
- if cast_bool_to_primitive(output_attentions) is True:
+ if output_attentions:
outputs = outputs + (all_attentions,)
return outputs # outputs, (hidden states), (attentions)
@@ -732,11 +755,14 @@ def call(
raise NotImplementedError
else:
head_mask = [None] * self.num_hidden_layers
- # head_mask = tf.constant([0] * self.num_hidden_layers)
- embedding_output = self.embeddings([input_ids, position_ids, token_type_ids, inputs_embeds], training=training)
+ embedding_output = self.embeddings(input_ids, position_ids, token_type_ids, inputs_embeds, training=training)
encoder_outputs = self.encoder(
- [embedding_output, extended_attention_mask, head_mask, output_attentions, output_hidden_states],
+ embedding_output,
+ extended_attention_mask,
+ head_mask,
+ output_attentions,
+ output_hidden_states,
training=training,
)
@@ -1360,8 +1386,7 @@ def call(
if inputs_embeds is not None
else None
)
-
- flat_inputs = [
+ outputs = self.mobilebert(
flat_input_ids,
flat_attention_mask,
flat_token_type_ids,
@@ -1370,16 +1395,12 @@ def call(
flat_inputs_embeds,
output_attentions,
output_hidden_states,
- ]
-
- outputs = self.mobilebert(flat_inputs, training=training)
-
+ training=training,
+ )
pooled_output = outputs[1]
-
pooled_output = self.dropout(pooled_output, training=training)
logits = self.classifier(pooled_output)
reshaped_logits = tf.reshape(logits, (-1, num_choices))
-
outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here
if labels is not None:
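The embedding layers above stop unpacking a four-element list and instead take `input_ids`, `position_ids`, `token_type_ids` and `inputs_embeds` as separate arguments, with an assertion guarding the one invalid combination. A hedged, standalone sketch of that guard (the function name and message are illustrative):

```python
def resolve_embedding_inputs(input_ids=None, inputs_embeds=None):
    # At least one of the two must be provided; rejecting the empty case up
    # front gives a clearer error than failing later on a None tensor.
    assert not (input_ids is None and inputs_embeds is None), (
        "You have to specify either input_ids or inputs_embeds"
    )
    return input_ids if input_ids is not None else inputs_embeds

print(resolve_embedding_inputs(input_ids=[1, 2, 3]))    # [1, 2, 3]
print(resolve_embedding_inputs(inputs_embeds=[[0.1]]))  # [[0.1]]
```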
diff --git a/src/transformers/modeling_tf_openai.py b/src/transformers/modeling_tf_openai.py
index 7d7adb1407ea..d5174f142a8c 100644
--- a/src/transformers/modeling_tf_openai.py
+++ b/src/transformers/modeling_tf_openai.py
@@ -29,7 +29,6 @@
TFPreTrainedModel,
TFSequenceSummary,
TFSharedEmbeddings,
- cast_bool_to_primitive,
get_initializer,
keras_serializable,
shape_list,
@@ -84,6 +83,7 @@ def __init__(self, nx, n_ctx, config, scale=False, **kwargs):
self.n_head = config.n_head
self.split_size = n_state
self.scale = scale
+ self.output_attentions = config.output_attentions
self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name="c_attn")
self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_proj")
@@ -104,8 +104,7 @@ def causal_attention_mask(nd, ns, dtype):
m = i >= j - ns + nd
return tf.cast(m, dtype)
- def _attn(self, inputs, training=False):
- q, k, v, attention_mask, head_mask, output_attentions = inputs
+ def _attn(self, q, k, v, attention_mask, head_mask, output_attentions, training=False):
# q, k, v have shape [batch, heads, sequence, features]
w = tf.matmul(q, k, transpose_b=True)
if self.scale:
@@ -130,7 +129,7 @@ def _attn(self, inputs, training=False):
w = w * head_mask
outputs = [tf.matmul(w, v)]
- if cast_bool_to_primitive(output_attentions) is True:
+ if output_attentions:
outputs.append(w)
return outputs
@@ -146,16 +145,14 @@ def split_heads(self, x):
x = tf.reshape(x, new_x_shape)
return tf.transpose(x, (0, 2, 1, 3)) # (batch, head, seq_length, head_features)
- def call(self, inputs, training=False):
- x, attention_mask, head_mask, output_attentions = inputs
-
+ def call(self, x, attention_mask, head_mask, output_attentions, training=False):
x = self.c_attn(x)
query, key, value = tf.split(x, 3, axis=2)
query = self.split_heads(query)
key = self.split_heads(key)
value = self.split_heads(value)
- attn_outputs = self._attn([query, key, value, attention_mask, head_mask, output_attentions], training=training)
+ attn_outputs = self._attn(query, key, value, attention_mask, head_mask, output_attentions, training=training)
a = attn_outputs[0]
a = self.merge_heads(a)
@@ -191,10 +188,8 @@ def __init__(self, n_ctx, config, scale=False, **kwargs):
self.mlp = TFMLP(4 * nx, config, name="mlp")
self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_2")
- def call(self, inputs, training=False):
- x, attention_mask, head_mask, output_attentions = inputs
-
- output_attn = self.attn([x, attention_mask, head_mask, output_attentions], training=training)
+ def call(self, x, attention_mask, head_mask, output_attentions, training=False):
+ output_attn = self.attn(x, attention_mask, head_mask, output_attentions, training=training)
a = output_attn[0] # output_attn: a, (attentions)
n = self.ln_1(x + a)
@@ -341,23 +336,23 @@ def call(
all_attentions = []
all_hidden_states = ()
for i, block in enumerate(self.h):
- if cast_bool_to_primitive(output_hidden_states) is True:
+ if output_hidden_states:
all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),)
- outputs = block([hidden_states, attention_mask, head_mask[i], output_attentions], training=training)
+ outputs = block(hidden_states, attention_mask, head_mask[i], output_attentions, training=training)
hidden_states = outputs[0]
- if cast_bool_to_primitive(output_attentions) is True:
+ if output_attentions:
all_attentions.append(outputs[1])
hidden_states = tf.reshape(hidden_states, output_shape)
# Add last hidden state
- if cast_bool_to_primitive(output_hidden_states) is True:
+ if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
outputs = (hidden_states,)
- if cast_bool_to_primitive(output_hidden_states) is True:
+ if output_hidden_states:
outputs = outputs + (all_hidden_states,)
- if cast_bool_to_primitive(output_attentions) is True:
+ if output_attentions:
# let the number of heads free (-1) so we can extract attention even after head pruning
attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:]
all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions)
@@ -671,13 +666,11 @@ def call(
input_shapes = shape_list(inputs_embeds)[:-1]
seq_length = input_shapes[-1]
-
flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
-
- flat_inputs = [
+ transformer_outputs = self.transformer(
flat_input_ids,
flat_attention_mask,
flat_token_type_ids,
@@ -686,18 +679,13 @@ def call(
inputs_embeds,
output_attentions,
output_hidden_states,
- ]
-
- transformer_outputs = self.transformer(flat_inputs, training=training)
+ training=training,
+ )
hidden_states = transformer_outputs[0]
-
hidden_states = tf.reshape(hidden_states, input_shapes + shape_list(hidden_states)[-1:])
-
lm_logits = self.transformer.tokens_embed(hidden_states, mode="linear")
- mc_logits = self.multiple_choice_head([hidden_states, mc_token_ids], training=training)
-
+ mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids, training=training)
mc_logits = tf.squeeze(mc_logits, axis=-1)
-
outputs = (lm_logits, mc_logits) + transformer_outputs[1:]
return outputs # lm logits, mc logits, (all hidden_states), (attentions)
diff --git a/src/transformers/modeling_tf_roberta.py b/src/transformers/modeling_tf_roberta.py
index 99ef96bc477e..fbe1a4be5869 100644
--- a/src/transformers/modeling_tf_roberta.py
+++ b/src/transformers/modeling_tf_roberta.py
@@ -86,9 +86,9 @@ def create_position_ids_from_inputs_embeds(self, inputs_embeds):
position_ids = tf.range(self.padding_idx + 1, seq_length + self.padding_idx + 1, dtype=tf.int32)[tf.newaxis, :]
return position_ids
- def _embedding(self, inputs, training=False):
+ def _embedding(self, input_ids, position_ids, token_type_ids, inputs_embeds, training=False):
"""Applies embedding based on inputs tensor."""
- input_ids, position_ids, token_type_ids, inputs_embeds = inputs
+ assert not (input_ids is None and inputs_embeds is None)
if position_ids is None:
if input_ids is not None:
@@ -97,7 +97,7 @@ def _embedding(self, inputs, training=False):
else:
position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
- return super()._embedding([input_ids, position_ids, token_type_ids, inputs_embeds], training=training)
+ return super()._embedding(input_ids, position_ids, token_type_ids, inputs_embeds, training=training)
@keras_serializable
@@ -546,8 +546,7 @@ def call(
flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
-
- flat_inputs = [
+ outputs = self.roberta(
flat_input_ids,
flat_attention_mask,
flat_token_type_ids,
@@ -556,16 +555,12 @@ def call(
inputs_embeds,
output_attentions,
output_hidden_states,
- ]
-
- outputs = self.roberta(flat_inputs, training=training)
-
+ training=training,
+ )
pooled_output = outputs[1]
-
pooled_output = self.dropout(pooled_output, training=training)
logits = self.classifier(pooled_output)
reshaped_logits = tf.reshape(logits, (-1, num_choices))
-
outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here
if labels is not None:
diff --git a/src/transformers/modeling_tf_t5.py b/src/transformers/modeling_tf_t5.py
index 1dea84777023..9b885d496fbd 100644
--- a/src/transformers/modeling_tf_t5.py
+++ b/src/transformers/modeling_tf_t5.py
@@ -115,6 +115,7 @@ def __init__(self, config, has_relative_attention_bias=False, **kwargs):
self.is_decoder = config.is_decoder
self.use_cache = config.use_cache
self.has_relative_attention_bias = has_relative_attention_bias
+ self.output_attentions = config.output_attentions
self.relative_attention_num_buckets = config.relative_attention_num_buckets
self.d_model = config.d_model
@@ -296,7 +297,7 @@ def unshape(x):
outputs = (context,) + present_key_value_state
- if cast_bool_to_primitive(output_attentions, True) is True:
+ if output_attentions:
outputs = outputs + (weights,)
if self.has_relative_attention_bias:
outputs = outputs + (position_bias,)
@@ -699,7 +700,7 @@ def call(
hidden_states = self.dropout(inputs_embeds, training=training)
for i, (layer_module, past_key_value_state) in enumerate(zip(self.block, past_key_value_states)):
- if cast_bool_to_primitive(output_hidden_states) is True:
+ if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_outputs = layer_module(
@@ -727,23 +728,23 @@ def call(
# append next layer key value states
present_key_value_states = present_key_value_states + (present_key_value_state,)
- if cast_bool_to_primitive(output_attentions) is True:
+ if output_attentions:
all_attentions = all_attentions + (layer_outputs[2],)
hidden_states = self.final_layer_norm(hidden_states)
hidden_states = self.dropout(hidden_states, training=training)
# Add last layer
- if cast_bool_to_primitive(output_hidden_states) is True:
+ if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
outputs = (hidden_states,)
# need to check if is decoder here as well for special cases when using keras compile
if cast_bool_to_primitive(use_cache, self.use_cache) is True and self.is_decoder:
outputs = outputs + (present_key_value_states,)
- if cast_bool_to_primitive(output_hidden_states) is True:
+ if output_hidden_states:
outputs = outputs + (all_hidden_states,)
- if cast_bool_to_primitive(output_attentions) is True:
+ if output_attentions:
outputs = outputs + (all_attentions,)
return outputs # last-layer hidden state, (all hidden states), (all attentions)
diff --git a/src/transformers/modeling_tf_transfo_xl.py b/src/transformers/modeling_tf_transfo_xl.py
index 33fd3ba7ff7e..d1979174a3a9 100644
--- a/src/transformers/modeling_tf_transfo_xl.py
+++ b/src/transformers/modeling_tf_transfo_xl.py
@@ -24,13 +24,7 @@
from .configuration_transfo_xl import TransfoXLConfig
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_transfo_xl_utilities import TFAdaptiveSoftmaxMask
-from .modeling_tf_utils import (
- TFPreTrainedModel,
- cast_bool_to_primitive,
- get_initializer,
- keras_serializable,
- shape_list,
-)
+from .modeling_tf_utils import TFPreTrainedModel, get_initializer, keras_serializable, shape_list
from .tokenization_utils import BatchEncoding
@@ -119,6 +113,7 @@ def __init__(
r_w_bias=None,
layer_norm_epsilon=1e-5,
init_std=0.02,
+ output_attentions=False,
**kwargs
):
super().__init__(**kwargs)
@@ -127,6 +122,7 @@ def __init__(
self.d_model = d_model
self.d_head = d_head
self.dropout = dropout
+ self.output_attentions = output_attentions
self.qkv_net = tf.keras.layers.Dense(
3 * n_head * d_head, kernel_initializer=get_initializer(init_std), use_bias=False, name="qkv_net"
@@ -175,8 +171,7 @@ def _rel_shift(self, x):
return x
- def call(self, inputs, training=False):
- w, r, attn_mask, mems, head_mask, output_attentions = inputs
+ def call(self, w, r, attn_mask, mems, head_mask, output_attentions, training=False):
qlen, rlen, bsz = shape_list(w)[0], shape_list(r)[0], shape_list(w)[1]
if mems is not None:
@@ -249,7 +244,7 @@ def call(self, inputs, training=False):
# residual connection + layer normalization
outputs = [self.layer_norm(w + attn_out)]
- if cast_bool_to_primitive(output_attentions) is True:
+ if output_attentions:
outputs.append(attn_prob)
return outputs
@@ -272,6 +267,7 @@ def __init__(
r_r_bias=None,
layer_norm_epsilon=1e-5,
init_std=0.02,
+ output_attentions=False,
**kwargs
):
super().__init__(**kwargs)
@@ -290,6 +286,7 @@ def __init__(
r_r_bias=r_r_bias,
init_std=init_std,
layer_norm_epsilon=layer_norm_epsilon,
+ output_attentions=output_attentions,
name="dec_attn",
)
self.pos_ff = TFPositionwiseFF(
@@ -302,11 +299,8 @@ def __init__(
name="pos_ff",
)
- def call(self, inputs, training=False):
- dec_inp, r, dec_attn_mask, mems, head_mask, output_attentions = inputs
- attn_outputs = self.dec_attn(
- [dec_inp, r, dec_attn_mask, mems, head_mask, output_attentions], training=training
- )
+ def call(self, dec_inp, r, dec_attn_mask, mems, head_mask, output_attentions, training=False):
+ attn_outputs = self.dec_attn(dec_inp, r, dec_attn_mask, mems, head_mask, output_attentions, training=training)
ff_output = self.pos_ff(attn_outputs[0], training=training)
outputs = [ff_output] + attn_outputs[1:]
@@ -443,6 +437,7 @@ def __init__(self, config, **kwargs):
r_r_bias=None if self.untie_r else self.r_r_bias,
layer_norm_epsilon=config.layer_norm_epsilon,
init_std=config.init_std,
+ output_attentions=self.output_attentions,
name="layers_._{}".format(i),
)
)
@@ -625,10 +620,10 @@ def call(
hids.append(core_out)
mems_i = None if mems is None else mems[i]
layer_outputs = layer(
- [core_out, pos_emb, dec_attn_mask, mems_i, head_mask[i], output_attentions], training=training,
+ core_out, pos_emb, dec_attn_mask, mems_i, head_mask[i], output_attentions, training=training,
)
core_out = layer_outputs[0]
- if cast_bool_to_primitive(output_attentions) is True:
+ if output_attentions:
attentions.append(layer_outputs[1])
else: # learnable embeddings and absolute embeddings
raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint
@@ -639,12 +634,12 @@ def call(
# We transpose back here to shape [bsz, len, hidden_dim]
outputs = [tf.transpose(core_out, perm=(1, 0, 2)), new_mems]
- if cast_bool_to_primitive(output_hidden_states):
+ if output_hidden_states:
# Add last layer and transpose to library standard shape [bsz, len, hidden_dim]
hids.append(core_out)
hids = list(tf.transpose(t, perm=(1, 0, 2)) for t in hids)
outputs.append(hids)
- if cast_bool_to_primitive(output_attentions) is True:
+ if output_attentions:
# Transpose to library standard shape [bsz, n_heads, query_seq_len, key_seq_len]
attentions = list(tf.transpose(t, perm=(2, 3, 0, 1)) for t in attentions)
outputs.append(attentions)
@@ -860,14 +855,14 @@ def call(
bsz, tgt_len = shape_list(inputs_embeds)[:2]
transformer_outputs = self.transformer(
- [input_ids, mems, head_mask, inputs_embeds, output_attentions, output_hidden_states], training=training
+ input_ids, mems, head_mask, inputs_embeds, output_attentions, output_hidden_states, training=training
)
last_hidden = transformer_outputs[0]
pred_hid = last_hidden[:, -tgt_len:]
outputs = transformer_outputs[1:]
- softmax_output = self.crit([pred_hid, labels], training=training)
+ softmax_output = self.crit(pred_hid, labels, training=training)
outputs = [softmax_output] + outputs
return outputs # logits, new_mems, (all hidden states), (all attentions)
diff --git a/src/transformers/modeling_tf_transfo_xl_utilities.py b/src/transformers/modeling_tf_transfo_xl_utilities.py
index 1f6edf3a9b98..7b7757eeb5aa 100644
--- a/src/transformers/modeling_tf_transfo_xl_utilities.py
+++ b/src/transformers/modeling_tf_transfo_xl_utilities.py
@@ -114,8 +114,7 @@ def _gather_logprob(logprob, target):
idx = tf.stack([r, target], 1)
return tf.gather_nd(logprob, idx)
- def call(self, inputs, return_mean=True, training=False):
- hidden, target = inputs
+ def call(self, hidden, target, return_mean=True, training=False):
head_logprob = 0
if self.n_clusters == 0:
output = self._logit(hidden, self.out_layers[0][0], self.out_layers[0][1], self.out_projs[0])
diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py
index f241fc8dcad2..4f1de0b3e4fc 100644
--- a/src/transformers/modeling_tf_utils.py
+++ b/src/transformers/modeling_tf_utils.py
@@ -17,6 +17,7 @@
import functools
import logging
import os
+import warnings
from typing import Dict, List, Optional, Union
import h5py
@@ -173,7 +174,11 @@ def compute_loss(self, labels, logits):
)
# make sure only labels that are not equal to -100
# are taken into account as loss
- active_loss = tf.reshape(labels, (-1,)) != -100
+ if tf.math.reduce_any(labels == -1).numpy() is True:
+ warnings.warn("Using `-1` to mask the loss for the token is deprecated. Please use `-100` instead.")
+ active_loss = tf.reshape(labels, (-1,)) != -1
+ else:
+ active_loss = tf.reshape(labels, (-1,)) != -100
reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, shape_list(logits)[2])), active_loss)
labels = tf.boolean_mask(tf.reshape(labels, (-1,)), active_loss)
@@ -233,7 +238,10 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin):
@property
def dummy_inputs(self) -> Dict[str, tf.Tensor]:
"""
- :obj:`Dict[str, tf.Tensor]`: Dummy inputs to build the network.
+ Dummy inputs to build the network.
+
+ Returns:
+ :obj:`Dict[str, tf.Tensor]`: The dummy inputs.
"""
return {"input_ids": tf.constant(DUMMY_INPUTS)}
@@ -774,14 +782,16 @@ def _embedding(self, input_ids):
return tf.gather(self.weight, input_ids)
def _linear(self, inputs):
- """Computes logits by running inputs through a linear layer.
- Args:
- inputs: A float32 tensor with shape [..., hidden_size]
- Returns:
- float32 tensor with shape [..., vocab_size].
"""
- first_dims = shape_list(inputs)[:-1]
+ Computes logits by running inputs through a linear layer.
+ Args:
+ inputs: A float32 tensor with shape [..., hidden_size]
+
+ Returns:
+ float32 tensor with shape [..., vocab_size].
+ """
+ first_dims = shape_list(inputs)[:-1]
x = tf.reshape(inputs, [-1, self.hidden_size])
logits = tf.matmul(x, self.weight, transpose_b=True)
@@ -789,7 +799,7 @@ def _linear(self, inputs):
class TFSequenceSummary(tf.keras.layers.Layer):
- r"""
+ """
Compute a single vector summary of a sequence hidden states.
Args:
@@ -852,26 +862,9 @@ def __init__(self, config: PretrainedConfig, initializer_range: float = 0.02, **
if self.has_last_dropout:
self.last_dropout = tf.keras.layers.Dropout(config.summary_last_dropout)
- def call(self, inputs, training=False) -> tf.Tensor:
- """
- Compute a single vector summary of a sequence hidden states.
-
- Args:
- inputs (:obj:`Union[tf.Tensor, Tuple[tf.Tensor], List[tf.Tensor], Dict[str, tf.Tensor]]`):
- One or two tensors representing:
-
- - **hidden_states** (:obj:`tf.Tensor` of shape :obj:`[batch_size, seq_len, hidden_size]`) -- The hidden
- states of the last layer.
- - **cls_index** :obj:`tf.Tensor` of shape :obj:`[batch_size]` or :obj:`[batch_size, ...]` where ... are
- optional leading dimensions of :obj:`hidden_states`. Used if :obj:`summary_type == "cls_index"` and
- takes the last token of the sequence as classification token.
-
- Returns:
- :obj:`tf.Tensor`: The summary of the sequence hidden states.
- """
+ def call(self, inputs, cls_index=None, training=False):
if not isinstance(inputs, (dict, tuple, list)):
hidden_states = inputs
- cls_index = None
elif isinstance(inputs, (tuple, list)):
hidden_states = inputs[0]
cls_index = inputs[1] if len(inputs) > 1 else None
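Among the `modeling_tf_utils.py` changes, `compute_loss` keeps masking padded label positions with `-100` and only falls back to `-1` (with a deprecation warning) when such labels are detected. A self-contained sketch of the masking idea, assuming a generic token-classification setup rather than the library's exact loss class:

```python
import tensorflow as tf

def masked_token_loss(labels, logits, ignore_index=-100):
    # Positions whose label equals the ignore index are dropped before the
    # cross-entropy is computed, so padding never contributes to the loss.
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE
    )
    active = tf.reshape(labels, (-1,)) != ignore_index
    flat_logits = tf.reshape(logits, [-1, tf.shape(logits)[-1]])
    reduced_logits = tf.boolean_mask(flat_logits, active)
    reduced_labels = tf.boolean_mask(tf.reshape(labels, (-1,)), active)
    return loss_fn(reduced_labels, reduced_logits)

labels = tf.constant([[1, -100, 2]])
logits = tf.random.normal((1, 3, 5))
print(masked_token_loss(labels, logits).shape)  # (2,): one value per active token
```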
diff --git a/src/transformers/modeling_tf_xlm.py b/src/transformers/modeling_tf_xlm.py
index 7a5f029e56cf..e1eddcc57cf5 100644
--- a/src/transformers/modeling_tf_xlm.py
+++ b/src/transformers/modeling_tf_xlm.py
@@ -39,7 +39,6 @@
TFSequenceSummary,
TFSharedEmbeddings,
TFTokenClassificationLoss,
- cast_bool_to_primitive,
get_initializer,
keras_serializable,
shape_list,
@@ -123,6 +122,7 @@ def __init__(self, n_heads, dim, config, **kwargs):
self.layer_id = next(TFMultiHeadAttention.NEW_ID)
self.dim = dim
self.n_heads = n_heads
+ self.output_attentions = config.output_attentions
assert self.dim % self.n_heads == 0
self.q_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="q_lin")
@@ -135,11 +135,10 @@ def __init__(self, n_heads, dim, config, **kwargs):
def prune_heads(self, heads):
raise NotImplementedError
- def call(self, inputs, training=False):
+ def call(self, input, mask, kv, cache, head_mask, output_attentions, training=False):
"""
Self-attention (if kv is None) or attention over source sentence (provided by kv).
"""
- input, mask, kv, cache, head_mask, output_attentions = inputs
# Input is (bs, qlen, dim)
# Mask is (bs, klen) (non-causal) or (bs, klen, klen)
bs, qlen, dim = shape_list(input)
@@ -196,7 +195,7 @@ def unshape(x):
context = unshape(context) # (bs, qlen, dim)
outputs = (self.out_lin(context),)
- if cast_bool_to_primitive(output_attentions) is True:
+ if output_attentions:
outputs = outputs + (weights,)
return outputs
@@ -445,6 +444,7 @@ def call(
inputs_embeds = self.embeddings(input_ids)
tensor = inputs_embeds + self.position_embeddings(position_ids)
+
if langs is not None and self.use_lang_emb and self.n_langs > 1:
tensor = tensor + self.lang_embeddings(langs)
if token_type_ids is not None:
@@ -457,15 +457,15 @@ def call(
hidden_states = ()
attentions = ()
for i in range(self.n_layers):
- if cast_bool_to_primitive(output_hidden_states) is True:
+ if output_hidden_states:
hidden_states = hidden_states + (tensor,)
# self attention
attn_outputs = self.attentions[i](
- [tensor, attn_mask, None, cache, head_mask[i], output_attentions], training=training
+ tensor, attn_mask, None, cache, head_mask[i], output_attentions, training=training
)
attn = attn_outputs[0]
- if cast_bool_to_primitive(output_attentions) is True:
+ if output_attentions:
attentions = attentions + (attn_outputs[1],)
attn = self.dropout(attn, training=training)
tensor = tensor + attn
@@ -484,7 +484,7 @@ def call(
tensor = tensor * mask[..., tf.newaxis]
# Add last hidden state
- if cast_bool_to_primitive(output_hidden_states) is True:
+ if output_hidden_states:
hidden_states = hidden_states + (tensor,)
# update cache length
@@ -495,9 +495,9 @@ def call(
# tensor = tensor.transpose(0, 1)
outputs = (tensor,)
- if cast_bool_to_primitive(output_hidden_states) is True:
+ if output_hidden_states:
outputs = outputs + (hidden_states,)
- if cast_bool_to_primitive(output_attentions) is True:
+ if output_attentions:
outputs = outputs + (attentions,)
return outputs # outputs, (hidden_states), (attentions)
@@ -930,7 +930,7 @@ def call(
flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
flat_langs = tf.reshape(langs, (-1, seq_length)) if langs is not None else None
flat_inputs_embeds = (
- tf.reshape(inputs_embeds, (-1, inputs_embeds.shape[-2], inputs_embeds.shape[-1]))
+ tf.reshape(inputs_embeds, (-1, seq_length, shape_list(inputs_embeds)[3]))
if inputs_embeds is not None
else None
)
@@ -943,7 +943,7 @@ def call(
)
lengths = None
- flat_inputs = [
+ transformer_outputs = self.transformer(
flat_input_ids,
flat_attention_mask,
flat_langs,
@@ -955,14 +955,12 @@ def call(
flat_inputs_embeds,
output_attentions,
output_hidden_states,
- ]
-
- transformer_outputs = self.transformer(flat_inputs, training=training)
+ training=training,
+ )
output = transformer_outputs[0]
logits = self.sequence_summary(output)
logits = self.logits_proj(logits)
reshaped_logits = tf.reshape(logits, (-1, num_choices))
-
outputs = (reshaped_logits,) + transformer_outputs[1:] # add hidden states and attention if they are here
if labels is not None:
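The XLM multiple-choice hunk also swaps `inputs_embeds.shape[...]` for `shape_list(...)` when flattening the embeddings, so the reshape works even when some dimensions are only known at graph-construction time. A rough, unofficial sketch of what a `shape_list`-style helper does (the real helper lives in `modeling_tf_utils.py`; this is only an approximation):

```python
import tensorflow as tf

def shape_list_sketch(x):
    # Prefer the static dimension when it is known and fall back to the
    # dynamic one otherwise, so the result is usable both eagerly and
    # inside a tf.function graph with partially unknown shapes.
    static = x.shape.as_list()
    dynamic = tf.shape(x)
    return [dynamic[i] if dim is None else dim for i, dim in enumerate(static)]

x = tf.zeros((2, 7, 16))
print(shape_list_sketch(x))  # [2, 7, 16]
```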
diff --git a/src/transformers/modeling_tf_xlnet.py b/src/transformers/modeling_tf_xlnet.py
index 1c8cba4e9801..e255e5adfd18 100644
--- a/src/transformers/modeling_tf_xlnet.py
+++ b/src/transformers/modeling_tf_xlnet.py
@@ -38,7 +38,6 @@
TFSequenceSummary,
TFSharedEmbeddings,
TFTokenClassificationLoss,
- cast_bool_to_primitive,
get_initializer,
keras_serializable,
shape_list,
@@ -92,6 +91,7 @@ def __init__(self, config, **kwargs):
self.d_model = config.d_model
self.scale = 1 / (config.d_head ** 0.5)
self.initializer_range = config.initializer_range
+ self.output_attentions = config.output_attentions
self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
self.dropout = tf.keras.layers.Dropout(config.dropout)
@@ -142,11 +142,10 @@ def rel_shift(self, x, klen=-1):
return x
- def rel_attn_core(self, inputs, training=False):
+ def rel_attn_core(
+ self, q_head, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask, head_mask, output_attentions, training=False
+ ):
"""Core relative positional attention operations."""
-
- q_head, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask, head_mask, output_attentions = inputs
-
# content based attention score
ac = tf.einsum("ibnd,jbnd->ijbn", q_head + self.r_w_bias, k_head_h)
@@ -182,16 +181,14 @@ def rel_attn_core(self, inputs, training=False):
# attention output
attn_vec = tf.einsum("ijbn,jbnd->ibnd", attn_prob, v_head_h)
- if cast_bool_to_primitive(output_attentions) is True:
+ if output_attentions:
return attn_vec, attn_prob
return attn_vec
- def post_attention(self, inputs, residual=True, training=False):
+ def post_attention(self, h, attn_vec, residual=True, training=False):
"""Post-attention processing."""
# post-attention projection (back to `d_model`)
- h, attn_vec = inputs
-
attn_out = tf.einsum("ibnd,hnd->ibh", attn_vec, self.o)
attn_out = self.dropout(attn_out, training=training)
@@ -202,9 +199,20 @@ def post_attention(self, inputs, residual=True, training=False):
return output
- def call(self, inputs, training=False):
- (h, g, attn_mask_h, attn_mask_g, r, seg_mat, mems, target_mapping, head_mask, output_attentions) = inputs
-
+ def call(
+ self,
+ h,
+ g,
+ attn_mask_h,
+ attn_mask_g,
+ r,
+ seg_mat,
+ mems,
+ target_mapping,
+ head_mask,
+ output_attentions,
+ training=False,
+ ):
if g is not None:
# Two-stream attention with relative positional encoding.
# content based attention score
@@ -228,15 +236,22 @@ def call(self, inputs, training=False):
# core attention ops
attn_vec_h = self.rel_attn_core(
- [q_head_h, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_h, head_mask, output_attentions],
+ q_head_h,
+ k_head_h,
+ v_head_h,
+ k_head_r,
+ seg_mat,
+ attn_mask_h,
+ head_mask,
+ output_attentions,
training=training,
)
- if cast_bool_to_primitive(output_attentions) is True:
+ if output_attentions:
attn_vec_h, attn_prob_h = attn_vec_h
# post processing
- output_h = self.post_attention([h, attn_vec_h], training=training)
+ output_h = self.post_attention(h, attn_vec_h, training=training)
# g-stream
# query-stream query head
@@ -246,27 +261,41 @@ def call(self, inputs, training=False):
if target_mapping is not None:
q_head_g = tf.einsum("mbnd,mlb->lbnd", q_head_g, target_mapping)
attn_vec_g = self.rel_attn_core(
- [q_head_g, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_g, head_mask, output_attentions],
+ q_head_g,
+ k_head_h,
+ v_head_h,
+ k_head_r,
+ seg_mat,
+ attn_mask_g,
+ head_mask,
+ output_attentions,
training=training,
)
- if cast_bool_to_primitive(output_attentions) is True:
+ if output_attentions:
attn_vec_g, attn_prob_g = attn_vec_g
attn_vec_g = tf.einsum("lbnd,mlb->mbnd", attn_vec_g, target_mapping)
else:
attn_vec_g = self.rel_attn_core(
- [q_head_g, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_g, head_mask, output_attentions],
+ q_head_g,
+ k_head_h,
+ v_head_h,
+ k_head_r,
+ seg_mat,
+ attn_mask_g,
+ head_mask,
+ output_attentions,
training=training,
)
- if cast_bool_to_primitive(output_attentions) is True:
+ if output_attentions:
attn_vec_g, attn_prob_g = attn_vec_g
# post processing
- output_g = self.post_attention([g, attn_vec_g], training=training)
+ output_g = self.post_attention(g, attn_vec_g, training=training)
- if cast_bool_to_primitive(output_attentions) is True:
+ if output_attentions:
attn_prob = attn_prob_h, attn_prob_g
else:
@@ -286,19 +315,26 @@ def call(self, inputs, training=False):
# core attention ops
attn_vec = self.rel_attn_core(
- [q_head_h, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_h, head_mask, output_attentions],
+ q_head_h,
+ k_head_h,
+ v_head_h,
+ k_head_r,
+ seg_mat,
+ attn_mask_h,
+ head_mask,
+ output_attentions,
training=training,
)
- if cast_bool_to_primitive(output_attentions) is True:
+ if output_attentions:
attn_vec, attn_prob = attn_vec
# post processing
- output_h = self.post_attention([h, attn_vec], training=training)
+ output_h = self.post_attention(h, attn_vec, training=training)
output_g = None
outputs = (output_h, output_g)
- if cast_bool_to_primitive(output_attentions) is True:
+ if output_attentions:
outputs = outputs + (attn_prob,)
return outputs
@@ -337,8 +373,33 @@ def __init__(self, config, **kwargs):
self.ff = TFXLNetFeedForward(config, name="ff")
self.dropout = tf.keras.layers.Dropout(config.dropout)
- def call(self, inputs, training=False):
- outputs = self.rel_attn(inputs, training=training)
+ def call(
+ self,
+ output_h,
+ output_g,
+ non_tgt_mask,
+ attn_mask,
+ pos_emb,
+ seg_mat,
+ mems,
+ target_mapping,
+ head_mask,
+ output_attentions,
+ training=False,
+ ):
+ outputs = self.rel_attn(
+ output_h,
+ output_g,
+ non_tgt_mask,
+ attn_mask,
+ pos_emb,
+ seg_mat,
+ mems,
+ target_mapping,
+ head_mask,
+ output_attentions,
+ training=training,
+ )
output_h, output_g = outputs[:2]
if output_g is not None:
@@ -686,32 +747,30 @@ def call(
hidden_states = []
for i, layer_module in enumerate(self.layer):
# cache new mems
- if self.mem_len is not None and self.mem_len > 0 and use_cache is True:
+ if self.mem_len is not None and self.mem_len > 0 and use_cache:
new_mems = new_mems + (self.cache_mem(output_h, mems[i]),)
- if cast_bool_to_primitive(output_hidden_states) is True:
+ if output_hidden_states:
hidden_states.append((output_h, output_g) if output_g is not None else output_h)
outputs = layer_module(
- [
- output_h,
- output_g,
- non_tgt_mask,
- attn_mask,
- pos_emb,
- seg_mat,
- mems[i],
- target_mapping,
- head_mask[i],
- output_attentions,
- ],
+ output_h,
+ output_g,
+ non_tgt_mask,
+ attn_mask,
+ pos_emb,
+ seg_mat,
+ mems[i],
+ target_mapping,
+ head_mask[i],
+ output_attentions,
training=training,
)
output_h, output_g = outputs[:2]
- if cast_bool_to_primitive(output_attentions) is True:
+ if output_attentions:
attentions.append(outputs[2])
# Add last hidden state
- if cast_bool_to_primitive(output_hidden_states) is True:
+ if output_hidden_states:
hidden_states.append((output_h, output_g) if output_g is not None else output_h)
output = self.dropout(output_g if output_g is not None else output_h, training=training)
@@ -719,16 +778,16 @@ def call(
# Prepare outputs, we transpose back here to shape [bsz, len, hidden_dim] (cf. beginning of forward() method)
outputs = (tf.transpose(output, perm=(1, 0, 2)),)
- if self.mem_len is not None and self.mem_len > 0 and use_cache is True:
+ if self.mem_len is not None and self.mem_len > 0 and use_cache:
outputs = outputs + (new_mems,)
- if cast_bool_to_primitive(output_hidden_states) is True:
+ if output_hidden_states:
if output_g is not None:
hidden_states = tuple(tf.transpose(h, perm=(1, 0, 2)) for hs in hidden_states for h in hs)
else:
hidden_states = tuple(tf.transpose(hs, perm=(1, 0, 2)) for hs in hidden_states)
outputs = outputs + (hidden_states,)
- if cast_bool_to_primitive(output_attentions) is True:
+ if output_attentions:
attentions = tuple(tf.transpose(t, perm=(2, 3, 0, 1)) for t in attentions)
outputs = outputs + (attentions,)
@@ -1240,8 +1299,7 @@ def call(
if inputs_embeds is not None
else None
)
-
- flat_inputs = [
+ transformer_outputs = self.transformer(
flat_input_ids,
flat_attention_mask,
mems,
@@ -1254,14 +1312,12 @@ def call(
use_cache,
output_attentions,
output_hidden_states,
- ]
-
- transformer_outputs = self.transformer(flat_inputs, training=training)
+ training=training,
+ )
output = transformer_outputs[0]
logits = self.sequence_summary(output)
logits = self.logits_proj(logits)
reshaped_logits = tf.reshape(logits, (-1, num_choices))
-
outputs = (reshaped_logits,) + transformer_outputs[1:] # add hidden states and attention if they are here
if labels is not None:
diff --git a/src/transformers/trainer_tf.py b/src/transformers/trainer_tf.py
index 03d028994a03..aaca022e815e 100644
--- a/src/transformers/trainer_tf.py
+++ b/src/transformers/trainer_tf.py
@@ -4,7 +4,6 @@
import logging
import math
import os
-import sys
import warnings
from typing import Callable, Dict, Optional, Tuple
@@ -25,15 +24,6 @@
logger = logging.getLogger(__name__)
-if parse(tf.__version__).release < (2, 2, 0):
- logger.info(
- "You need to run the TensorFlow trainer with at least the version 2.2.0, your version is {}".format(
- tf.__version__
- )
- )
- sys.exit(1)
-
-
class TFTrainer:
"""
TFTrainer is a simple but feature-complete training and eval loop for TensorFlow,
@@ -77,6 +67,11 @@ def __init__(
None,
),
):
+ assert parse(tf.__version__).release >= (2, 2, 0), (
+ "You need to run the TensorFlow trainer with at least the version 2.2.0, your version is %r "
+ % tf.__version__
+ )
+
self.model = model
self.args = args
self.train_dataset = train_dataset
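The trainer change moves the TensorFlow version check from import time (where it called `sys.exit`) into the constructor, so importing the module no longer terminates the process. A minimal sketch of the same check in isolation, assuming `packaging` is available:

```python
import tensorflow as tf
from packaging.version import parse

class MinimalTFTrainer:
    """Illustrative only: shows where the version check now happens."""

    def __init__(self):
        # Raises AssertionError at instantiation time instead of killing the
        # interpreter when the module is merely imported.
        assert parse(tf.__version__).release >= (2, 2, 0), (
            "You need to run the TensorFlow trainer with at least the version 2.2.0, "
            "your version is %r" % tf.__version__
        )

MinimalTFTrainer()  # only fails on TF < 2.2
```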
diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py
index 88bfaa63cdc1..9aafb5d60075 100644
--- a/tests/test_modeling_tf_common.py
+++ b/tests/test_modeling_tf_common.py
@@ -23,7 +23,7 @@
from importlib import import_module
from transformers import is_tf_available, is_torch_available
-from transformers.testing_utils import _tf_gpu_memory_limit, require_tf
+from transformers.testing_utils import _tf_gpu_memory_limit, require_tf, slow
if is_tf_available():
@@ -130,6 +130,61 @@ def test_save_load(self):
self.assert_outputs_same(after_outputs, outputs)
+ @slow
+ def test_saved_model_with_hidden_states_output(self):
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+ config.output_hidden_states = True
+
+ for model_class in self.all_model_classes:
+ inputs_dict = self._prepare_for_class(inputs_dict, model_class)
+ model = model_class(config)
+ num_out = len(model(inputs_dict))
+ model._saved_model_inputs_spec = None
+ model._set_save_spec(inputs_dict)
+
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ tf.saved_model.save(model, tmpdirname)
+ model = tf.keras.models.load_model(tmpdirname)
+ outputs = model(inputs_dict)
+ hidden_states = [t.numpy() for t in outputs[-1]]
+ self.assertEqual(len(outputs), num_out)
+ self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1)
+ self.assertListEqual(
+ list(hidden_states[0].shape[-2:]), [self.model_tester.seq_length, self.model_tester.hidden_size],
+ )
+
+ @slow
+ def test_saved_model_with_attentions_output(self):
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+ config.output_attentions = True
+ encoder_seq_length = (
+ self.model_tester.encoder_seq_length
+ if hasattr(self.model_tester, "encoder_seq_length")
+ else self.model_tester.seq_length
+ )
+ encoder_key_length = (
+ self.model_tester.key_length if hasattr(self.model_tester, "key_length") else encoder_seq_length
+ )
+
+ for model_class in self.all_model_classes:
+ inputs_dict = self._prepare_for_class(inputs_dict, model_class)
+ model = model_class(config)
+ num_out = len(model(inputs_dict))
+ model._saved_model_inputs_spec = None
+ model._set_save_spec(inputs_dict)
+
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ tf.saved_model.save(model, tmpdirname)
+ model = tf.keras.models.load_model(tmpdirname)
+ outputs = model(inputs_dict)
+ attentions = [t.numpy() for t in outputs[-1]]
+ self.assertEqual(len(outputs), num_out)
+ self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
+ self.assertListEqual(
+ list(attentions[0].shape[-3:]),
+ [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
+ )
+
def test_keras_save_load(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
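Both new `@slow` tests exercise the same SavedModel round trip: export the model, reload it as a generic Keras model, and check that the extra outputs (hidden states or attentions) survive serialization. A stripped-down sketch of that round trip with a plain Keras model, just to show the save/load pattern being tested:

```python
import tempfile
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(4, input_shape=(8,))])

with tempfile.TemporaryDirectory() as tmpdirname:
    # Export in the SavedModel format, then reload it without the original
    # Python class, which is what the new tests do with each model class.
    tf.saved_model.save(model, tmpdirname)
    restored = tf.keras.models.load_model(tmpdirname)

print(restored(tf.zeros((1, 8))).shape)  # (1, 4)
```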
diff --git a/tests/test_modeling_tf_xlnet.py b/tests/test_modeling_tf_xlnet.py
index ecc498c1d9ad..0299cb2fb466 100644
--- a/tests/test_modeling_tf_xlnet.py
+++ b/tests/test_modeling_tf_xlnet.py
@@ -342,11 +342,17 @@ def create_and_check_xlnet_for_multiple_choice(
"attention_mask": multiple_choice_input_mask,
"token_type_ids": multiple_choice_token_type_ids,
}
- (logits,) = model(inputs)
+ (logits, mems_1) = model(inputs)
result = {
+ "mems_1": [mem.numpy() for mem in mems_1],
"logits": logits.numpy(),
}
+
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices])
+ self.parent.assertListEqual(
+ list(list(mem.shape) for mem in result["mems_1"]),
+ [[self.seq_length, self.batch_size * self.num_choices, self.hidden_size]] * self.num_hidden_layers,
+ )
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
From cedc547e7e009e4745db350505848fd5c4f8f6f3 Mon Sep 17 00:00:00 2001
From: Jay Mody
Date: Mon, 3 Aug 2020 09:00:39 -0400
Subject: [PATCH 083/127] Adds train_batch_size, eval_batch_size, and n_gpu to
to_sanitized_dict output for logging. (#5331)
* Adds train_batch_size, eval_batch_size, and n_gpu to to_sanitized_dict() output
* Update wandb config logging to use to_sanitized_dict
* removed n_gpu from sanitized dict
* fix quality check errors
---
src/transformers/trainer.py | 2 +-
src/transformers/training_args.py | 3 +++
2 files changed, 4 insertions(+), 1 deletion(-)
diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 8a3209355ae9..e1429713fb9b 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -383,7 +383,7 @@ def setup_wandb(self):
logger.info(
'Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"'
)
- wandb.init(project=os.getenv("WANDB_PROJECT", "huggingface"), config=vars(self.args))
+ wandb.init(project=os.getenv("WANDB_PROJECT", "huggingface"), config=self.args.to_sanitized_dict())
# keep track of model topology and gradients, unsupported on TPU
if not is_torch_tpu_available() and os.getenv("WANDB_WATCH") != "false":
wandb.watch(
diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index e6506d97632f..ad33266a8181 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -310,7 +310,10 @@ def to_sanitized_dict(self) -> Dict[str, Any]:
Sanitized serialization to use with TensorBoard’s hparams
"""
d = dataclasses.asdict(self)
+ d = {**d, **{"train_batch_size": self.train_batch_size, "eval_batch_size": self.eval_batch_size}}
+
valid_types = [bool, int, float, str]
if is_torch_available():
valid_types.append(torch.Tensor)
+
return {k: v if type(v) in valid_types else str(v) for k, v in d.items()}
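For reference, a toy version of the sanitizing step this patch touches: dataclass fields plus a derived batch size, with anything that is not a TensorBoard-friendly primitive converted to a string. All names below are illustrative, not the library's actual fields:

```python
import dataclasses
from typing import Any, Dict

@dataclasses.dataclass
class ToyArgs:
    learning_rate: float = 5e-5
    per_device_train_batch_size: int = 8
    n_gpu: int = 2
    output_dir: Any = None  # stands in for a non-primitive field

    @property
    def train_batch_size(self) -> int:
        # Derived value, so it is not part of dataclasses.asdict() and has to
        # be merged in explicitly, as the patch does for the real arguments.
        return self.per_device_train_batch_size * max(1, self.n_gpu)

    def to_sanitized_dict(self) -> Dict[str, Any]:
        d = dataclasses.asdict(self)
        d = {**d, "train_batch_size": self.train_batch_size}
        valid_types = (bool, int, float, str)
        return {k: v if isinstance(v, valid_types) else str(v) for k, v in d.items()}

print(ToyArgs().to_sanitized_dict())
```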
From 0b418673575fb5aa2f6f657fb33d64eafb1d700f Mon Sep 17 00:00:00 2001
From: Suraj Patil
Date: Mon, 3 Aug 2020 19:49:35 +0530
Subject: [PATCH 084/127] fix labels (#6213)
---
src/transformers/data/data_collator.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/src/transformers/data/data_collator.py b/src/transformers/data/data_collator.py
index 085f7a68a8aa..cf8eb996f8e5 100644
--- a/src/transformers/data/data_collator.py
+++ b/src/transformers/data/data_collator.py
@@ -87,7 +87,8 @@ def __call__(self, examples: List[Union[torch.Tensor, Dict[str, torch.Tensor]]])
return {"input_ids": inputs, "labels": labels}
else:
labels = batch.clone().detach()
- labels[labels == self.tokenizer.pad_token_id] = -100
+ if self.tokenizer.pad_token_id is not None:
+ labels[labels == self.tokenizer.pad_token_id] = -100
return {"input_ids": batch, "labels": labels}
def _tensorize_batch(self, examples: List[torch.Tensor]) -> torch.Tensor:
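The one-line guard above matters for tokenizers without a pad token (GPT-2, for example), where `pad_token_id` is `None` and the masking step must simply be skipped. A hedged sketch of the guarded masking in isolation:

```python
import torch

def make_lm_labels(batch: torch.Tensor, pad_token_id=None) -> torch.Tensor:
    # Copy the inputs as labels; only mask padding when a pad token exists.
    labels = batch.clone().detach()
    if pad_token_id is not None:
        labels[labels == pad_token_id] = -100
    return labels

batch = torch.tensor([[5, 6, 0, 0]])
print(make_lm_labels(batch, pad_token_id=0))     # pad positions become -100
print(make_lm_labels(batch, pad_token_id=None))  # unchanged copy
```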
From 06f1692b023a701ab2bb443fa4f0bdd58c6bd234 Mon Sep 17 00:00:00 2001
From: Maurice Gonzenbach
Date: Mon, 3 Aug 2020 16:21:23 +0200
Subject: [PATCH 085/127] Fix _shift_right function in TFT5PreTrainedModel
(#6214)
---
src/transformers/modeling_tf_t5.py | 12 +++++++-----
1 file changed, 7 insertions(+), 5 deletions(-)
diff --git a/src/transformers/modeling_tf_t5.py b/src/transformers/modeling_tf_t5.py
index 9b885d496fbd..9858b8ae7665 100644
--- a/src/transformers/modeling_tf_t5.py
+++ b/src/transformers/modeling_tf_t5.py
@@ -783,8 +783,7 @@ def _shift_right(self, input_ids):
decoder_start_token_id is not None
), "self.model.config.decoder_start_token_id has to be defined. In TF T5 it is usually set to the pad_token_id. See T5 docs for more information"
- # shift inputs to the right
- shifted_input_ids = tf.zeros_like(input_ids, dtype=tf.int32)
+ shifted_input_ids = tf.cast(input_ids, tf.int32)
shifted_input_ids = tf.roll(shifted_input_ids, 1, axis=-1)
start_tokens = tf.fill((shape_list(shifted_input_ids)[0], 1), decoder_start_token_id)
shifted_input_ids = tf.concat([start_tokens, shifted_input_ids[:, 1:]], -1)
@@ -795,9 +794,12 @@ def _shift_right(self, input_ids):
shifted_input_ids == -100, tf.fill(shape_list(shifted_input_ids), pad_token_id), shifted_input_ids
)
- assert tf.math.reduce_any(
- shifted_input_ids >= 0
- ).numpy(), "Verify that `labels` has only positive values and -100"
+ # "Verify that `labels` has only positive values and -100"
+ assert_gte0 = tf.debugging.assert_greater_equal(shifted_input_ids, tf.cast(0, tf.int32))
+
+ # Make sure the assertion op is called by wrapping the result in an identity no-op
+ with tf.control_dependencies([assert_gte0]):
+ shifted_input_ids = tf.identity(shifted_input_ids)
return shifted_input_ids
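The fix keeps the original token values (instead of starting from a zero tensor), rolls them one position to the right, inserts the decoder start token, and maps `-100` label positions back to the pad token; the positivity check becomes a graph-friendly `tf.debugging` assertion. A rough standalone sketch of the shift itself, with made-up token ids and default ids chosen only for illustration:

```python
import tensorflow as tf

def shift_right(input_ids, decoder_start_token_id=0, pad_token_id=0):
    # Keep the values, shift everything one step to the right, and put the
    # decoder start token in the freed first slot.
    shifted = tf.cast(input_ids, tf.int32)
    shifted = tf.roll(shifted, 1, axis=-1)
    start_tokens = tf.fill([tf.shape(shifted)[0], 1], decoder_start_token_id)
    shifted = tf.concat([start_tokens, shifted[:, 1:]], axis=-1)
    # -100 is only a loss-masking value, so it is mapped back to the pad token.
    return tf.where(shifted == -100, tf.fill(tf.shape(shifted), pad_token_id), shifted)

print(shift_right(tf.constant([[10, 11, 12, -100]])))
# tf.Tensor([[ 0 10 11 12]], shape=(1, 4), dtype=int32)
```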
From b6b2f2270fe6c32852fc1b887afe354b7b79d18c Mon Sep 17 00:00:00 2001
From: Sam Shleifer
Date: Mon, 3 Aug 2020 10:36:26 -0400
Subject: [PATCH 086/127] s2s: fix LR logging, remove some dead code. (#6205)
---
examples/lightning_base.py | 6 +-----
examples/seq2seq/callbacks.py | 4 ++++
examples/seq2seq/train_mbart_cc25_enro.sh | 2 --
3 files changed, 5 insertions(+), 7 deletions(-)
diff --git a/examples/lightning_base.py b/examples/lightning_base.py
index 754538e79279..ae03e295614a 100644
--- a/examples/lightning_base.py
+++ b/examples/lightning_base.py
@@ -58,7 +58,6 @@ def __init__(
self.hparams = hparams
self.step_count = 0
- self.tfmr_ckpts = {}
self.output_dir = Path(self.hparams.output_dir)
cache_dir = self.hparams.cache_dir if self.hparams.cache_dir else None
if config is None:
@@ -99,7 +98,7 @@ def load_hf_checkpoint(self, *args, **kwargs):
self.model = self.model_type.from_pretrained(*args, **kwargs)
def configure_optimizers(self):
- "Prepare optimizer and schedule (linear warmup and decay)"
+ """Prepare optimizer and schedule (linear warmup and decay)"""
model = self.model
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
@@ -159,11 +158,9 @@ def _feature_file(self, mode):
@pl.utilities.rank_zero_only
def on_save_checkpoint(self, checkpoint: Dict[str, Any]) -> None:
save_path = self.output_dir.joinpath("best_tfmr")
- save_path.mkdir(exist_ok=True)
self.model.config.save_step = self.step_count
self.model.save_pretrained(save_path)
self.tokenizer.save_pretrained(save_path)
- self.tfmr_ckpts[self.step_count] = save_path
@staticmethod
def add_model_specific_args(parser, root_dir):
@@ -274,7 +271,6 @@ def add_generic_args(parser, root_dir) -> None:
default=1,
help="Number of updates steps to accumulate before performing a backward/update pass.",
)
-
parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
diff --git a/examples/seq2seq/callbacks.py b/examples/seq2seq/callbacks.py
index 1de3aa5d4674..68e06a5f48a7 100644
--- a/examples/seq2seq/callbacks.py
+++ b/examples/seq2seq/callbacks.py
@@ -19,6 +19,10 @@ def count_trainable_parameters(model):
class Seq2SeqLoggingCallback(pl.Callback):
+ def on_batch_end(self, trainer, pl_module):
+ lrs = {f"lr_group_{i}": param["lr"] for i, param in enumerate(pl_module.trainer.optimizers[0].param_groups)}
+ pl_module.logger.log_metrics(lrs)
+
@rank_zero_only
def _write_logs(
self, trainer: pl.Trainer, pl_module: pl.LightningModule, type_path: str, save_generations=True
diff --git a/examples/seq2seq/train_mbart_cc25_enro.sh b/examples/seq2seq/train_mbart_cc25_enro.sh
index 4ec18de36953..b8122aee3f41 100755
--- a/examples/seq2seq/train_mbart_cc25_enro.sh
+++ b/examples/seq2seq/train_mbart_cc25_enro.sh
@@ -5,7 +5,6 @@ python finetune.py \
--learning_rate=3e-5 \
--fp16 \
--do_train \
- --do_predict \
--val_check_interval=0.25 \
--adam_eps 1e-06 \
--num_train_epochs 6 --src_lang en_XX --tgt_lang ro_RO \
@@ -15,6 +14,5 @@ python finetune.py \
--task translation \
--warmup_steps 500 \
--freeze_embeds \
- --early_stopping_patience 4 \
--model_name_or_path=facebook/mbart-large-cc25 \
$@
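The new `on_batch_end` hook just reads the current learning rate of every optimizer parameter group and forwards it to the logger. Outside the Lightning plumbing, the core of that lookup is a dictionary comprehension over `param_groups`; a small sketch (the optimizer setup below is made up for illustration):

```python
import torch

def current_learning_rates(optimizer):
    # One entry per parameter group, matching the "lr_group_{i}" metric names.
    return {f"lr_group_{i}": group["lr"] for i, group in enumerate(optimizer.param_groups)}

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(
    [{"params": [model.weight], "lr": 0.1}, {"params": [model.bias], "lr": 0.01}],
    lr=0.1,
)
print(current_learning_rates(optimizer))  # {'lr_group_0': 0.1, 'lr_group_1': 0.01}
```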
From e4920c92d65f5efded4cc4c8c754d0d553ef4bbc Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Date: Mon, 3 Aug 2020 11:44:46 -0400
Subject: [PATCH 087/127] Doc pipelines (#6175)
* Init work on pipelines doc
* Work in progress
* Work in progress
* Doc pipelines
* Rm unwanted default
* Apply suggestions from code review
Lysandre comments
Co-authored-by: Lysandre Debut
Co-authored-by: Lysandre Debut
---
docs/source/index.rst | 1 +
docs/source/internal/pipelines_utils.rst | 40 +
docs/source/main_classes/model.rst | 6 +
docs/source/main_classes/pipelines.rst | 80 +-
src/transformers/pipelines.py | 1001 ++++++++++++----------
5 files changed, 655 insertions(+), 473 deletions(-)
create mode 100644 docs/source/internal/pipelines_utils.rst
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 26e950875ef9..30ad430f4f3b 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -207,3 +207,4 @@ conversion utilities for the following models:
model_doc/dpr
internal/modeling_utils
internal/tokenization_utils
+ internal/pipelines_utils
\ No newline at end of file
diff --git a/docs/source/internal/pipelines_utils.rst b/docs/source/internal/pipelines_utils.rst
new file mode 100644
index 000000000000..c6fda75803c2
--- /dev/null
+++ b/docs/source/internal/pipelines_utils.rst
@@ -0,0 +1,40 @@
+Utilities for pipelines
+-----------------------
+
+This page lists all the utility functions the library provides for pipelines.
+
+Most of those are only useful if you are studying the code of the models in the library.
+
+
+Argument handling
+~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.pipelines.ArgumentHandler
+
+.. autoclass:: transformers.pipelines.ZeroShotClassificationArgumentHandler
+
+.. autoclass:: transformers.pipelines.QuestionAnsweringArgumentHandler
+
+
+Data format
+~~~~~~~~~~~
+
+.. autoclass:: transformers.pipelines.PipelineDataFormat
+ :members:
+
+.. autoclass:: transformers.pipelines.CsvPipelineDataFormat
+ :members:
+
+.. autoclass:: transformers.pipelines.JsonPipelineDataFormat
+ :members:
+
+.. autoclass:: transformers.pipelines.PipedPipelineDataFormat
+ :members:
+
+
+Utilities
+~~~~~~~~~
+
+.. autofunction:: transformers.pipelines.get_framework
+
+.. autoclass:: transformers.pipelines.PipelineException
diff --git a/docs/source/main_classes/model.rst b/docs/source/main_classes/model.rst
index bea43e94f65a..d89e788f191b 100644
--- a/docs/source/main_classes/model.rst
+++ b/docs/source/main_classes/model.rst
@@ -41,3 +41,9 @@ The other methods that are common to each model are defined in :class:`~transfor
.. autoclass:: transformers.modeling_tf_utils.TFModelUtilsMixin
:members:
+
+
+Generative models
+~~~~~~~~~~~~~~~~~
+
+Coming soon
diff --git a/docs/source/main_classes/pipelines.rst b/docs/source/main_classes/pipelines.rst
index 214858fb5abe..067b7eca9308 100644
--- a/docs/source/main_classes/pipelines.rst
+++ b/docs/source/main_classes/pipelines.rst
@@ -3,13 +3,23 @@ Pipelines
The pipelines are a great and easy way to use models for inference. These pipelines are objects that abstract most
of the complex code from the library, offering a simple API dedicated to several tasks, including Named Entity
-Recognition, Masked Language Modeling, Sentiment Analysis, Feature Extraction and Question Answering.
+Recognition, Masked Language Modeling, Sentiment Analysis, Feature Extraction and Question Answering. See the
+:doc:`task summary <../task_summary>` for examples of use.
There are two categories of pipeline abstractions to be aware about:
-- The :func:`~transformers.pipeline` which is the most powerful object encapsulating all other pipelines
-- The other task-specific pipelines, such as :class:`~transformers.TokenClassificationPipeline`
- or :class:`~transformers.QuestionAnsweringPipeline`
+- The :func:`~transformers.pipeline` which is the most powerful object encapsulating all other pipelines.
+- The other task-specific pipelines:
+
+ - :class:`~transformers.ConversationalPipeline`
+ - :class:`~transformers.FeatureExtractionPipeline`
+ - :class:`~transformers.FillMaskPipeline`
+ - :class:`~transformers.QuestionAnsweringPipeline`
+ - :class:`~transformers.SummarizationPipeline`
+ - :class:`~transformers.TextClassificationPipeline`
+ - :class:`~transformers.TextGenerationPipeline`
+ - :class:`~transformers.TokenClassificationPipeline`
+ - :class:`~transformers.TranslationPipeline`
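As a minimal sketch (assuming the default checkpoint for the task can be downloaded), the factory simply selects and instantiates the matching task-specific class:

```python
from transformers import pipeline, TokenClassificationPipeline

# The task string picks the task-specific class; "ner" maps to TokenClassificationPipeline.
ner_tagger = pipeline("ner")
assert isinstance(ner_tagger, TokenClassificationPipeline)
```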
The pipeline abstraction
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -21,61 +31,75 @@ other pipeline but requires an additional argument which is the `task`.
The task specific pipelines
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Parent class: Pipeline
-=========================================
-
-.. autoclass:: transformers.Pipeline
- :members: predict, transform, save_pretrained
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-TokenClassificationPipeline
+ConversationalPipeline
==========================================
-.. autoclass:: transformers.TokenClassificationPipeline
+.. autoclass:: transformers.Conversation
-NerPipeline
+.. autoclass:: transformers.ConversationalPipeline
+ :special-members: __call__
+ :members:
+
+FeatureExtractionPipeline
==========================================
-This class is an alias of the :class:`~transformers.TokenClassificationPipeline` defined above. Please refer to that pipeline for
-documentation and usage examples.
+.. autoclass:: transformers.FeatureExtractionPipeline
+ :special-members: __call__
+ :members:
FillMaskPipeline
==========================================
.. autoclass:: transformers.FillMaskPipeline
+ :special-members: __call__
+ :members:
-FeatureExtractionPipeline
-==========================================
-
-.. autoclass:: transformers.FeatureExtractionPipeline
-
-TextClassificationPipeline
+NerPipeline
==========================================
-.. autoclass:: transformers.TextClassificationPipeline
+This class is an alias of the :class:`~transformers.TokenClassificationPipeline` defined below. Please refer to that
+pipeline for documentation and usage examples.
QuestionAnsweringPipeline
==========================================
.. autoclass:: transformers.QuestionAnsweringPipeline
-
+ :special-members: __call__
+ :members:
SummarizationPipeline
==========================================
.. autoclass:: transformers.SummarizationPipeline
+ :special-members: __call__
+ :members:
+TextClassificationPipeline
+==========================================
+
+.. autoclass:: transformers.TextClassificationPipeline
+ :special-members: __call__
+ :members:
TextGenerationPipeline
==========================================
.. autoclass:: transformers.TextGenerationPipeline
+ :special-members: __call__
+ :members:
-
-ConversationalPipeline
+TokenClassificationPipeline
==========================================
-.. autoclass:: transformers.Conversation
+.. autoclass:: transformers.TokenClassificationPipeline
+ :special-members: __call__
+ :members:
+
-.. autoclass:: transformers.ConversationalPipeline
\ No newline at end of file
+Parent class: :obj:`Pipeline`
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.Pipeline
+ :members:
diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py
index b40f734ef2b6..3cd252fd8f4a 100755
--- a/src/transformers/pipelines.py
+++ b/src/transformers/pipelines.py
@@ -33,7 +33,7 @@
from .configuration_auto import AutoConfig
from .configuration_utils import PretrainedConfig
from .data import SquadExample, squad_convert_examples_to_features
-from .file_utils import is_tf_available, is_torch_available
+from .file_utils import add_end_docstrings, is_tf_available, is_torch_available
from .modelcard import ModelCard
from .tokenization_auto import AutoTokenizer
from .tokenization_bert import BasicTokenizer
@@ -82,8 +82,13 @@
def get_framework(model=None):
- """ Select framework (TensorFlow/PyTorch) to use.
- If both frameworks are installed and no specific model is provided, defaults to using PyTorch.
+ """
+ Select framework (TensorFlow or PyTorch) to use.
+
+ Args:
+ model (:obj:`str`, :class:`~transformers.PreTrainedModel` or :class:`~transformers.TFPreTrainedModel`, `optional`):
+ If both frameworks are installed, picks the one corresponding to the model passed (either a model class or
+ the model name). If no specific model is provided, defaults to using PyTorch.
"""
if is_tf_available() and is_torch_available() and model is not None and not isinstance(model, str):
# Both framework are available but the user supplied a model class instance.
@@ -103,7 +108,12 @@ def get_framework(model=None):
class PipelineException(Exception):
"""
- Raised by pipelines when handling __call__
+ Raised by a :class:`~transformers.Pipeline` when handling __call__.
+
+ Args:
+ task (:obj:`str`): The task of the pipeline.
+ model (:obj:`str`): The model used by the pipeline.
+ reason (:obj:`str`): The error message to display.
"""
def __init__(self, task: str, model: str, reason: str):
@@ -115,7 +125,7 @@ def __init__(self, task: str, model: str, reason: str):
class ArgumentHandler(ABC):
"""
- Base interface for handling varargs for each Pipeline
+ Base interface for handling arguments for each :class:`~transformers.pipelines.Pipeline`.
"""
@abstractmethod
@@ -125,7 +135,7 @@ def __call__(self, *args, **kwargs):
class DefaultArgumentHandler(ArgumentHandler):
"""
- Default varargs argument parser handling parameters for each Pipeline
+ Default argument parser handling parameters for each :class:`~transformers.pipelines.Pipeline`.
"""
@staticmethod
@@ -178,18 +188,25 @@ class PipelineDataFormat:
"""
Base class for all the pipeline supported data format both for reading and writing.
Supported data formats currently includes:
- - JSON
- - CSV
- - stdin/stdout (pipe)
+ - JSON
+ - CSV
+ - stdin/stdout (pipe)
- PipelineDataFormat also includes some utilities to work with multi-columns like mapping from datasets columns
- to pipelines keyword arguments through the `dataset_kwarg_1=dataset_column_1` format.
+ :obj:`PipelineDataFormat` also includes some utilities to work with multi-columns like mapping from datasets
+ columns to pipelines keyword arguments through the :obj:`dataset_kwarg_1=dataset_column_1` format.
+
+ Args:
+ output_path (:obj:`str`, `optional`): Where to save the outgoing data.
+ input_path (:obj:`str`, `optional`): Where to look for the input data.
+ column (:obj:`str`, `optional`): The column to read.
+ overwrite (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ Whether or not to overwrite the :obj:`output_path`.
"""
SUPPORTED_FORMATS = ["json", "csv", "pipe"]
def __init__(
- self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False,
+ self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite: bool = False,
):
self.output_path = output_path
self.input_path = input_path
@@ -212,19 +229,25 @@ def __iter__(self):
raise NotImplementedError()
@abstractmethod
- def save(self, data: dict):
+ def save(self, data: Union[dict, List[dict]]):
"""
- Save the provided data object with the representation for the current `DataFormat`.
- :param data: data to store
- :return:
+ Save the provided data object with the representation for the current
+ :class:`~transformers.pipelines.PipelineDataFormat`.
+
+ Args:
+ data (:obj:`dict` or list of :obj:`dict`): The data to store.
"""
raise NotImplementedError()
def save_binary(self, data: Union[dict, List[dict]]) -> str:
"""
Save the provided data object as a pickle-formatted binary data on the disk.
- :param data: data to store
- :return: (str) Path where the data has been saved
+
+ Args:
+ data (:obj:`dict` or list of :obj:`dict`): The data to store.
+
+ Returns:
+ :obj:`str`: Path where the data has been saved.
"""
path, _ = os.path.splitext(self.output_path)
binary_path = os.path.extsep.join((path, "pickle"))
@@ -237,7 +260,26 @@ def save_binary(self, data: Union[dict, List[dict]]) -> str:
@staticmethod
def from_str(
format: str, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False,
- ):
+ ) -> "PipelineDataFormat":
+ """
+ Creates an instance of the right subclass of :class:`~transformers.pipelines.PipelineDataFormat` depending
+ on :obj:`format`.
+
+ Args:
+ format (:obj:`str`):
+ The format of the desired pipeline. Acceptable values are :obj:`"json"`, :obj:`"csv"` or :obj:`"pipe"`.
+ output_path (:obj:`str`, `optional`):
+ Where to save the outgoing data.
+ input_path (:obj:`str`, `optional`):
+ Where to look for the input data.
+ column (:obj:`str`, `optional`):
+ The column to read.
+ overwrite (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ Whether or not to overwrite the :obj:`output_path`.
+
+ Returns:
+ :class:`~transformers.pipelines.PipelineDataFormat`: The proper data format.
+ """
if format == "json":
return JsonPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
elif format == "csv":
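A hedged sketch of how :obj:`from_str` is typically used; the file names below are purely illustrative:

```python
from transformers.pipelines import PipelineDataFormat

# Illustrative file names only; "text" is the column to read from inputs.csv.
data_format = PipelineDataFormat.from_str(
    format="csv",
    output_path="predictions.csv",
    input_path="inputs.csv",
    column="text",
    overwrite=True,
)
# Iterating data_format yields the "text" column; data_format.save(...) writes dicts back out.
```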
@@ -249,6 +291,17 @@ def from_str(
class CsvPipelineDataFormat(PipelineDataFormat):
+ """
+ Support for pipelines using CSV data format.
+
+ Args:
+ output_path (:obj:`str`, `optional`): Where to save the outgoing data.
+ input_path (:obj:`str`, `optional`): Where to look for the input data.
+ column (:obj:`str`, `optional`): The column to read.
+ overwrite (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ Whether or not to overwrite the :obj:`output_path`.
+ """
+
def __init__(
self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False,
):
@@ -264,6 +317,13 @@ def __iter__(self):
yield row[self.column[0]]
def save(self, data: List[dict]):
+ """
+ Save the provided data object with the representation for the current
+ :class:`~transformers.pipelines.PipelineDataFormat`.
+
+ Args:
+ data (:obj:`List[dict]`): The data to store.
+ """
with open(self.output_path, "w") as f:
if len(data) > 0:
writer = csv.DictWriter(f, list(data[0].keys()))
@@ -272,6 +332,17 @@ def save(self, data: List[dict]):
class JsonPipelineDataFormat(PipelineDataFormat):
+ """
+ Support for pipelines using JSON file format.
+
+ Args:
+ output_path (:obj:`str`, `optional`): Where to save the outgoing data.
+ input_path (:obj:`str`, `optional`): Where to look for the input data.
+ column (:obj:`str`, `optional`): The column to read.
+ overwrite (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ Whether or not to overwrite the :obj:`output_path`.
+ """
+
def __init__(
self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False,
):
@@ -288,6 +359,12 @@ def __iter__(self):
yield entry[self.column[0]]
def save(self, data: dict):
+ """
+ Save the provided data object in a json file.
+
+ Args:
+ data (:obj:`dict`): The data to store.
+ """
with open(self.output_path, "w") as f:
json.dump(data, f)
@@ -298,6 +375,13 @@ class PipedPipelineDataFormat(PipelineDataFormat):
For multi columns data, columns should separated by \t
If columns are provided, then the output will be a dictionary with {column_x: value_x}
+
+ Args:
+ output_path (:obj:`str`, `optional`): Where to save the outgoing data.
+ input_path (:obj:`str`, `optional`): Where to look for the input data.
+ column (:obj:`str`, `optional`): The column to read.
+ overwrite (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ Whether or not to overwrite the :obj:`output_path`.
"""
def __iter__(self):
@@ -317,6 +401,12 @@ def __iter__(self):
yield line
def save(self, data: dict):
+ """
+ Print the data.
+
+ Args:
+ data (:obj:`dict`): The data to store.
+ """
print(data)
def save_binary(self, data: Union[dict, List[dict]]) -> str:
@@ -343,24 +433,7 @@ def predict(self, X):
raise NotImplementedError()
-class Pipeline(_ScikitCompat):
- """
- The Pipeline class is the class from which all pipelines inherit. Refer to this class for methods shared across
- different pipelines.
-
- Base class implementing pipelined operations.
- Pipeline workflow is defined as a sequence of the following operations:
-
- Input -> Tokenization -> Model Inference -> Post-Processing (Task dependent) -> Output
-
- Pipeline supports running on CPU or GPU through the device argument. Users can specify
- device argument as an integer, -1 meaning "CPU", >= 0 referring the CUDA device ordinal.
-
- Some pipeline, like for instance FeatureExtractionPipeline ('feature-extraction') outputs large
- tensor object as nested-lists. In order to avoid dumping such large structure as textual data we
- provide the binary_output constructor argument. If set to True, the output will be stored in the
- pickle format.
-
+PIPELINE_INIT_ARGS = r"""
Arguments:
model (:obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`):
The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
@@ -369,28 +442,44 @@ class Pipeline(_ScikitCompat):
tokenizer (:obj:`~transformers.PreTrainedTokenizer`):
The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
:class:`~transformers.PreTrainedTokenizer`.
- modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
+ modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`):
Model card attributed to the model for this pipeline.
- framework (:obj:`str`, `optional`, defaults to :obj:`None`):
- The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
- installed.
+ framework (:obj:`str`, `optional`):
+ The framework to use, either :obj:`"pt"` for PyTorch or :obj:`"tf"` for TensorFlow. The specified framework
+ must be installed.
If no framework is specified, will default to the one currently installed. If no framework is specified
- and both frameworks are installed, will default to PyTorch.
- args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
+ and both frameworks are installed, will default to the framework of the :obj:`model`, or to PyTorch if no
+ model is provided.
+ task (:obj:`str`, defaults to :obj:`""`):
+ A task-identifier for the pipeline.
+ args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`):
Reference to the object in charge of parsing supplied pipeline parameters.
- device (:obj:`int`, `optional`, defaults to :obj:`-1`):
- Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
+ device (:obj:`int`, `optional`, defaults to -1):
+ Device ordinal for CPU/GPU support. Setting this to -1 will leverage the CPU, a positive value will run the model
on the associated CUDA device id.
binary_output (:obj:`bool`, `optional`, defaults to :obj:`False`):
- Flag indicating if the output the pipeline should happen in a binary format (i.e. pickle) or as raw text.
+ Flag indicating if the output of the pipeline should happen in a binary format (i.e., pickle) or as raw text.
+"""
- Return:
- :obj:`List` or :obj:`Dict`:
- Pipeline returns list or dictionary depending on:
- - Whether the user supplied multiple samples
- - Whether the pipeline exposes multiple fields in the output object
+@add_end_docstrings(PIPELINE_INIT_ARGS)
+class Pipeline(_ScikitCompat):
+ """
+ The Pipeline class is the class from which all pipelines inherit. Refer to this class for methods shared across
+ different pipelines.
+
+ Base class implementing pipelined operations.
+ Pipeline workflow is defined as a sequence of the following operations:
+
+ Input -> Tokenization -> Model Inference -> Post-Processing (task dependent) -> Output
+
+ Pipeline supports running on CPU or GPU through the device argument (see below).
+
+ Some pipelines, like for instance :class:`~transformers.FeatureExtractionPipeline` (:obj:`'feature-extraction'`),
+ output large tensor objects as nested lists. In order to avoid dumping such large structures as textual data we
+ provide the :obj:`binary_output` constructor argument. If set to :obj:`True`, the output will be stored in the
+ pickle format.
"""
default_input_names = None
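As a rough illustration of these constructor arguments, a task-specific subclass can be built directly from a model and tokenizer; the checkpoint name below is only an example of a sequence classification model:

```python
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TextClassificationPipeline

model_name = "distilbert-base-uncased-finetuned-sst-2-english"  # example checkpoint
classifier = TextClassificationPipeline(
    model=AutoModelForSequenceClassification.from_pretrained(model_name),
    tokenizer=AutoTokenizer.from_pretrained(model_name),
    device=-1,  # -1 keeps the model on CPU, 0 would use the first CUDA device
)
```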
@@ -408,7 +497,7 @@ def __init__(
):
if framework is None:
- framework = get_framework()
+ framework = get_framework(model)
self.task = task
self.model = model
@@ -428,9 +517,13 @@ def __init__(
if task_specific_params is not None and task in task_specific_params:
self.model.config.update(task_specific_params.get(task))
- def save_pretrained(self, save_directory):
+ def save_pretrained(self, save_directory: str):
"""
- Save the pipeline's model and tokenizer to the specified save_directory
+ Save the pipeline's model and tokenizer.
+
+ Args:
+ save_directory (:obj:`str`):
+ A path to the directory where the model and tokenizer will be saved. It will be created if it doesn't exist.
"""
if os.path.isfile(save_directory):
logger.error("Provided path ({}) should be a directory, not a file".format(save_directory))
@@ -458,14 +551,17 @@ def predict(self, X):
def device_placement(self):
"""
Context Manager allowing tensor allocation on the user-specified device in framework agnostic way.
- example:
- # Explicitly ask for tensor allocation on CUDA device :0
- nlp = pipeline(..., device=0)
- with nlp.device_placement():
- # Every framework specific tensor allocation will be done on the request device
- output = nlp(...)
+
Returns:
Context manager
+
+ Examples::
+
+ # Explicitly ask for tensor allocation on CUDA device :0
+ pipe = pipeline(..., device=0)
+ with pipe.device_placement():
+ # Every framework specific tensor allocation will be done on the request device
+ output = pipe(...)
"""
if self.framework == "tf":
with tf.device("/CPU:0" if self.device == -1 else "/device:GPU:{}".format(self.device)):
@@ -479,14 +575,22 @@ def device_placement(self):
def ensure_tensor_on_device(self, **inputs):
"""
Ensure PyTorch tensors are on the specified device.
- :param inputs:
- :return:
+
+ Args:
+ inputs (keyword arguments that should be :obj:`torch.Tensor`): The tensors to place on :obj:`self.device`.
+
+ Return:
+ :obj:`Dict[str, torch.Tensor]`: The same as :obj:`inputs` but on the proper device.
"""
return {name: tensor.to(self.device) for name, tensor in inputs.items()}
- def check_model_type(self, supported_models):
+ def check_model_type(self, supported_models: Union[List[str], dict]):
"""
- Check if the model class is in the supported class list of the pipeline.
+ Check if the model class is supported by the pipeline.
+
+ Args:
+ supported_models (:obj:`List[str]` or :obj:`dict`):
+ The list of models supported by the pipeline, or a dictionary with model class values.
"""
if not isinstance(supported_models, list): # Create from a model mapping
supported_models = [item[1].__name__ for item in supported_models.items()]
@@ -538,15 +642,14 @@ def _forward(self, inputs, return_tensors=False):
return predictions.numpy()
+# Can't use @add_end_docstrings(PIPELINE_INIT_ARGS) here because this one does not accept `binary_output`
class FeatureExtractionPipeline(Pipeline):
"""
- Feature extraction pipeline using Model head. This pipeline extracts the hidden states from the base transformer,
- which can be used as features in downstream tasks.
+ Feature extraction pipeline using no model head. This pipeline extracts the hidden states from the base
+ transformer, which can be used as features in downstream tasks.
- This feature extraction pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
- the following task identifier(s):
-
- - "feature-extraction", for extracting features of a sequence.
+ This feature extraction pipeline can currently be loaded from :func:`~transformers.pipeline` using the task
+ identifier: :obj:`"feature-extraction"`.
All models may be used for this pipeline. See a list of all models, including community-contributed models on
`huggingface.co/models `__.
@@ -559,18 +662,21 @@ class FeatureExtractionPipeline(Pipeline):
tokenizer (:obj:`~transformers.PreTrainedTokenizer`):
The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
:class:`~transformers.PreTrainedTokenizer`.
- modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
+ modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`):
Model card attributed to the model for this pipeline.
- framework (:obj:`str`, `optional`, defaults to :obj:`None`):
- The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
- installed.
+ framework (:obj:`str`, `optional`):
+ The framework to use, either :obj:`"pt"` for PyTorch or :obj:`"tf"` for TensorFlow. The specified framework
+ must be installed.
If no framework is specified, will default to the one currently installed. If no framework is specified
- and both frameworks are installed, will default to PyTorch.
- args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
+ and both frameworks are installed, will default to the framework of the :obj:`model`, or to PyTorch if no
+ model is provided.
+ task (:obj:`str`, defaults to :obj:`""`):
+ A task-identifier for the pipeline.
+ args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`):
Reference to the object in charge of parsing supplied pipeline parameters.
- device (:obj:`int`, `optional`, defaults to :obj:`-1`):
- Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
+ device (:obj:`int`, `optional`, defaults to -1):
+ Device ordinal for CPU/GPU support. Setting this to -1 will leverage the CPU, a positive value will run the model
on the associated CUDA device id.
"""
@@ -596,20 +702,29 @@ def __init__(
)
def __call__(self, *args, **kwargs):
+ """
+ Extract the features of the input(s).
+
+ Args:
+ args (:obj:`str` or :obj:`List[str]`): One or several texts (or one list of texts) to get the features of.
+
+ Return:
+ A nested list of :obj:`float`: The features computed by the model.
+ """
return super().__call__(*args, **kwargs).tolist()
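A hedged usage sketch (assuming the default feature-extraction checkpoint downloads fine); the output is plain nested lists rather than tensors:

```python
from transformers import pipeline

extractor = pipeline("feature-extraction")
features = extractor("Pipelines are easy to use.")
# Roughly [n_inputs][n_tokens][hidden_size]; here there is a single input sentence.
print(len(features), len(features[0]), len(features[0][0]))
```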
+@add_end_docstrings(PIPELINE_INIT_ARGS)
class TextGenerationPipeline(Pipeline):
"""
- Language generation pipeline using any ModelWithLMHead head. This pipeline predicts the words that will follow a specified text prompt.
+ Language generation pipeline using any :obj:`ModelWithLMHead`. This pipeline predicts the words that will follow a
+ specified text prompt.
- This language generation pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
- the following task identifier(s):
+ This language generation pipeline can currently be loaded from :func:`~transformers.pipeline` using the following
+ task identifier: :obj:`"text-generation"`.
- - "text-generation", for generating text from a specified prompt.
-
- The models that this pipeline can use are models that have been trained with an autoregressive language modeling objective,
- which includes the uni-directional models in the library (e.g. gpt2).
+ The models that this pipeline can use are models that have been trained with an autoregressive language modeling
+ objective, which includes the uni-directional models in the library (e.g. gpt2).
See the list of available community models on
`huggingface.co/models `__.
"""
@@ -673,7 +788,30 @@ def _parse_and_tokenize(self, *args, padding=True, add_special_tokens=True, **kw
def __call__(
self, *args, return_tensors=False, return_text=True, clean_up_tokenization_spaces=False, **generate_kwargs
):
+ """
+ Complete the prompt(s) given as inputs.
+
+ Args:
+ args (:obj:`str` or :obj:`List[str]`):
+ One or several prompts (or one list of prompts) to complete.
+ return_tensors (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ Whether or not to include the tensors of predictions (as token indices) in the outputs.
+ return_text (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ Whether or not to include the decoded texts in the outputs.
+ clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ Whether or not to clean up the potential extra spaces in the text output.
+ generate_kwargs:
+ Additional keyword arguments to pass along to the generate method of the model (see the generate
+ method corresponding to your framework `here <./model.html#generative-models>`__).
+
+ Return:
+ A list or a list of list of :obj:`dict`: Each result comes as a dictionary with the
+ following keys:
+ - **generated_text** (:obj:`str`, present when ``return_text=True``) -- The generated text.
+ - **generated_token_ids** (:obj:`torch.Tensor` or :obj:`tf.Tensor`, present when ``return_tensors=True``)
+ -- The token ids of the generated text.
+ """
text_inputs = self._args_parser(*args)
results = []
@@ -758,41 +896,25 @@ def __call__(
return results
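A hedged usage sketch (assuming the default text-generation checkpoint, a GPT-2 variant, is available); ``max_length`` is forwarded to the model's generate method:

```python
from transformers import pipeline

generator = pipeline("text-generation")
outputs = generator("In a distant future,", max_length=30)
# Each generated sequence comes back as a dict containing a "generated_text" entry.
print(outputs)
```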
+@add_end_docstrings(
+ PIPELINE_INIT_ARGS,
+ r"""
+ return_all_scores (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ Whether to return all prediction scores or just the one of the predicted class.
+ """,
+)
class TextClassificationPipeline(Pipeline):
"""
- Text classification pipeline using ModelForSequenceClassification head. See the
- `sequence classification usage <../usage.html#sequence-classification>`__ examples for more information.
-
- This text classification pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
- the following task identifier(s):
+ Text classification pipeline using any :obj:`ModelForSequenceClassification`. See the
+ `sequence classification examples <../task_summary.html#sequence-classification>`__ for more information.
- - "sentiment-analysis", for classifying sequences according to positive or negative sentiments.
+ This text classification pipeline can currently be loaded from :func:`~transformers.pipeline` using the following
+ task identifier: :obj:`"sentiment-analysis"` (for classifying sequences according to positive or negative
+ sentiments).
The models that this pipeline can use are models that have been fine-tuned on a sequence classification task.
See the up-to-date list of available models on
`huggingface.co/models `__.
-
- Arguments:
- model (:obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`):
- The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
- :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
- TensorFlow.
- tokenizer (:obj:`~transformers.PreTrainedTokenizer`):
- The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
- :class:`~transformers.PreTrainedTokenizer`.
- modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
- Model card attributed to the model for this pipeline.
- framework (:obj:`str`, `optional`, defaults to :obj:`None`):
- The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
- installed.
-
- If no framework is specified, will default to the one currently installed. If no framework is specified
- and both frameworks are installed, will default to PyTorch.
- args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
- Reference to the object in charge of parsing supplied pipeline parameters.
- device (:obj:`int`, `optional`, defaults to :obj:`-1`):
- Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
- on the associated CUDA device id.
"""
def __init__(self, return_all_scores: bool = False, **kwargs):
@@ -807,6 +929,22 @@ def __init__(self, return_all_scores: bool = False, **kwargs):
self.return_all_scores = return_all_scores
def __call__(self, *args, **kwargs):
+ """
+ Classify the text(s) given as inputs.
+
+ Args:
+ args (:obj:`str` or :obj:`List[str]`):
+ One or several texts (or one list of texts) to classify.
+
+ Return:
+ A list or a list of list of :obj:`dict`: Each result comes as list of dictionaries with the
+ following keys:
+
+ - **label** (:obj:`str`) -- The label predicted.
+ - **score** (:obj:`float`) -- The corresponding probability.
+
+ If ``self.return_all_scores=True``, one such dictionary is returned per label.
+ """
outputs = super().__call__(*args, **kwargs)
scores = np.exp(outputs) / np.exp(outputs).sum(-1, keepdims=True)
if self.return_all_scores:
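A hedged sketch of the :obj:`return_all_scores` behaviour described above (assuming the default sentiment-analysis checkpoint is available):

```python
from transformers import pipeline

classifier = pipeline("sentiment-analysis", return_all_scores=True)
# With return_all_scores=True, every input yields one {"label", "score"} dict per label.
print(classifier("This movie was surprisingly good."))
```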
@@ -853,46 +991,23 @@ def __call__(self, sequences, labels, hypothesis_template):
return sequence_pairs
+@add_end_docstrings(PIPELINE_INIT_ARGS)
class ZeroShotClassificationPipeline(Pipeline):
"""
- NLI-based zero-shot classification pipeline using a ModelForSequenceClassification head with models trained on
- NLI tasks.
+ NLI-based zero-shot classification pipeline using a :obj:`ModelForSequenceClassification` trained on NLI (natural
+ language inference) tasks.
Any combination of sequences and labels can be passed and each combination will be posed as a premise/hypothesis
- pair and passed to the pre-trained model. Then logit for `entailment` is then taken as the logit for the
+ pair and passed to the pretrained model. Then, the logit for `entailment` is taken as the logit for the
candidate label being valid. Any NLI model can be used as long as the first output logit corresponds to
`contradiction` and the last to `entailment`.
- This pipeline can currently be loaded from the :func:`~transformers.pipeline` method using the following task
- identifier(s):
+ This NLI pipeline can currently be loaded from :func:`~transformers.pipeline` using the following
+ task identifier: :obj:`"zero-shot-classification"`.
- - "zero-shot-classification"
-
- The models that this pipeline can use are models that have been fine-tuned on a Natural Language Inference task.
+ The models that this pipeline can use are models that have been fine-tuned on an NLI task.
See the up-to-date list of available models on
`huggingface.co/models `__.
-
- Arguments:
- model (:obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`):
- The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
- :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
- TensorFlow.
- tokenizer (:obj:`~transformers.PreTrainedTokenizer`):
- The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
- :class:`~transformers.PreTrainedTokenizer`.
- modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
- Model card attributed to the model for this pipeline.
- framework (:obj:`str`, `optional`, defaults to :obj:`None`):
- The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
- installed.
-
- If no framework is specified, will default to the one currently installed. If no framework is specified
- and both frameworks are installed, will default to PyTorch.
- args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
- Reference to the object in charge of parsing supplied pipeline parameters.
- device (:obj:`int`, `optional`, defaults to :obj:`-1`):
- Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
- on the associated CUDA device id.
"""
def __init__(self, args_parser=ZeroShotClassificationArgumentHandler(), *args, **kwargs):
@@ -915,29 +1030,33 @@ def _parse_and_tokenize(self, *args, padding=True, add_special_tokens=True, **kw
def __call__(self, sequences, candidate_labels, hypothesis_template="This example is {}.", multi_class=False):
"""
- NLI-based zero-shot classification. Any combination of sequences and labels can be passed and each
- combination will be posed as a premise/hypothesis pair and passed to the pre-trained model. Then logit for
- `entailment` is then taken as the logit for the candidate label being valid. Any NLI model can be used as
- long as the first output logit corresponds to `contradiction` and the last to `entailment`.
+ Classify the sequence(s) given as inputs.
Args:
- sequences (:obj:`str` or obj:`List`):
- The sequence or sequences to classify. Truncated if model input is too large.
- candidate_labels (:obj:`str` or obj:`List`):
+ sequences (:obj:`str` or obj:`List[str]`):
+ The sequence(s) to classify, which will be truncated if the model input is too large.
+ candidate_labels (:obj:`str` or obj:`List[str]`):
The set of possible class labels to classify each sequence into. Can be a single label, a string of
comma-separated labels, or a list of labels.
- hypothesis_template (obj:`str`, defaults to "This example is {}."):
+ hypothesis_template (obj:`str`, `optional`, defaults to :obj:`"This example is {}."`):
The template used to turn each label into an NLI-style hypothesis. This template must include a {}
or similar syntax for the candidate label to be inserted into the template. For example, the default
- template is "This example is {}." With the candidate label "sports", this would be fed into the model
- like ` sequence to classify This example is sports . `. The default template works
- well in many cases, but it may be worthwhile to experiment with different templates depending on the
- task setting.
- multi_class (obj:`bool`, defaults to False):
- When False, it is assumed that only one candidate label can be true, and the scores are normalized
- such that the sum of the label likelihoods for each sequence is 1. When True, the labels are
- considered independent and probabilities are normalized for each candidate by doing a of softmax of
+ template is :obj:`"This example is {}."` With the candidate label :obj:`"sports"`, this would be fed
+ into the model like :obj:`"<cls> sequence to classify <sep> This example is sports . <sep>"`. The
+ default template works well in many cases, but it may be worthwhile to experiment with different
+ templates depending on the task setting.
+ multi_class (obj:`bool`, `optional`, defaults to :obj:`False`):
+ Whether or not multiple candidate labels can be true. If :obj:`False`, the scores are normalized
+ such that the sum of the label likelihoods for each sequence is 1. If :obj:`True`, the labels are
+ considered independent and probabilities are normalized for each candidate by doing a softmax of
the entailment score vs. the contradiction score.
+ Return:
+ A :obj:`dict` or a list of :obj:`dict`: Each result comes as a dictionary with the
+ following keys:
+
+ - **sequence** (:obj:`str`) -- The sequence for which this is the output.
+ - **labels** (:obj:`List[str]`) -- The labels sorted by order of likelihood.
+ - **scores** (:obj:`List[float]`) -- The probabilities for each of the labels.
"""
outputs = super().__call__(sequences, candidate_labels, hypothesis_template)
num_sequences = 1 if isinstance(sequences, str) else len(sequences)
@@ -973,42 +1092,28 @@ def __call__(self, sequences, candidate_labels, hypothesis_template="This exampl
return result
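A hedged usage sketch (assuming the default NLI checkpoint downloads fine):

```python
from transformers import pipeline

classifier = pipeline("zero-shot-classification")
result = classifier(
    "Who are you voting for in 2020?",
    candidate_labels=["politics", "public health", "economics"],
)
# For a single sequence, result is a dict with "sequence", "labels" and "scores" keys.
print(result["labels"][0], result["scores"][0])
```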
+@add_end_docstrings(
+ PIPELINE_INIT_ARGS,
+ r"""
+ topk (:obj:`int`, defaults to 5): The number of predictions to return.
+ """,
+)
class FillMaskPipeline(Pipeline):
"""
- Masked language modeling prediction pipeline using ModelWithLMHead head. See the
- `masked language modeling usage <../usage.html#masked-language-modeling>`__ examples for more information.
+ Masked language modeling prediction pipeline using any :obj:`ModelWithLMHead`. See the
+ `masked language modeling examples <../task_summary.html#masked-language-modeling>`__ for more information.
- This mask filling pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
- the following task identifier(s):
-
- - "fill-mask", for predicting masked tokens in a sequence.
+ This mask filling pipeline can currently be loaded from :func:`~transformers.pipeline` using the following
+ task identifier: :obj:`"fill-mask"`.
The models that this pipeline can use are models that have been trained with a masked language modeling objective,
which includes the bi-directional models in the library.
See the up-to-date list of available models on
`huggingface.co/models `__.
- Arguments:
- model (:obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`):
- The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
- :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
- TensorFlow.
- tokenizer (:obj:`~transformers.PreTrainedTokenizer`):
- The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
- :class:`~transformers.PreTrainedTokenizer`.
- modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
- Model card attributed to the model for this pipeline.
- framework (:obj:`str`, `optional`, defaults to :obj:`None`):
- The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
- installed.
+ .. note::
- If no framework is specified, will default to the one currently installed. If no framework is specified
- and both frameworks are installed, will default to PyTorch.
- args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
- Reference to the object in charge of parsing supplied pipeline parameters.
- device (:obj:`int`, `optional`, defaults to :obj:`-1`):
- Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
- on the associated CUDA device id.
+ This pipeline only works for inputs with exactly one token masked.
"""
def __init__(
@@ -1053,6 +1158,21 @@ def ensure_exactly_one_mask_token(self, masked_index: np.ndarray):
)
def __call__(self, *args, **kwargs):
+ """
+ Fill the masked token in the text(s) given as inputs.
+
+ Args:
+ args (:obj:`str` or :obj:`List[str]`): One or several texts (or one list of texts) with masked tokens.
+
+ Return:
+ A list or a list of list of :obj:`dict`: Each result comes as list of dictionaries with the
+ following keys:
+
+ - **sequence** (:obj:`str`) -- The corresponding input with the mask token prediction.
+ - **score** (:obj:`float`) -- The corresponding probability.
+ - **token** (:obj:`int`) -- The predicted token id (to replace the masked one).
+ - **token_str** (:obj:`str`) -- The predicted token (to replace the masked one).
+ """
inputs = self._parse_and_tokenize(*args, **kwargs)
outputs = self._forward(inputs, return_tensors=True)
@@ -1105,41 +1225,27 @@ def __call__(self, *args, **kwargs):
return results
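A hedged usage sketch; the mask token is read from the pipeline's own tokenizer so the example matches whatever checkpoint is loaded by default:

```python
from transformers import pipeline

fill_mask = pipeline("fill-mask")
predictions = fill_mask(f"Paris is the {fill_mask.tokenizer.mask_token} of France.")
# For one input, predictions is a list of top-k dicts sorted by score.
print(predictions[0]["sequence"], predictions[0]["score"])
```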
+@add_end_docstrings(
+ PIPELINE_INIT_ARGS,
+ r"""
+ ignore_labels (:obj:`List[str]`, defaults to :obj:`["O"]`):
+ A list of labels to ignore.
+ grouped_entities (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ Whether or not to group the tokens corresponding to the same entity together in the predictions.
+ """,
+)
class TokenClassificationPipeline(Pipeline):
"""
- Named Entity Recognition pipeline using ModelForTokenClassification head. See the
- `named entity recognition usage <../usage.html#named-entity-recognition>`__ examples for more information.
+ Named Entity Recognition pipeline using any :obj:`ModelForTokenClassification`. See the
+ `named entity recognition examples <../task_summary.html#named-entity-recognition>`__ for more information.
- This token recognition pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
- the following task identifier(s):
-
- - "ner", for predicting the classes of tokens in a sequence: person, organisation, location or miscellaneous.
+ This token recognition pipeline can currently be loaded from :func:`~transformers.pipeline` using the following
+ task identifier: :obj:`"ner"` (for predicting the classes of tokens in a sequence: person, organisation, location
+ or miscellaneous).
The models that this pipeline can use are models that have been fine-tuned on a token classification task.
See the up-to-date list of available models on
`huggingface.co/models `__.
-
- Arguments:
- model (:obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`):
- The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
- :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
- TensorFlow.
- tokenizer (:obj:`~transformers.PreTrainedTokenizer`):
- The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
- :class:`~transformers.PreTrainedTokenizer`.
- modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
- Model card attributed to the model for this pipeline.
- framework (:obj:`str`, `optional`, defaults to :obj:`None`):
- The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
- installed.
-
- If no framework is specified, will default to the one currently installed. If no framework is specified
- and both frameworks are installed, will default to PyTorch.
- args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
- Reference to the object in charge of parsing supplied pipeline parameters.
- device (:obj:`int`, `optional`, defaults to :obj:`-1`):
- Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
- on the associated CUDA device id.
"""
default_input_names = "sequences"
@@ -1179,6 +1285,24 @@ def __init__(
self.grouped_entities = grouped_entities
def __call__(self, *args, **kwargs):
+ """
+ Classify each token of the text(s) given as inputs.
+
+ Args:
+ args (:obj:`str` or :obj:`List[str]`):
+ One or several texts (or one list of texts) for token classification.
+
+ Return:
+ A list or a list of list of :obj:`dict`: Each result comes as a list of dictionaries (one for each token in
+ the corresponding input, or each entity if this pipeline was instantiated with
+ :obj:`grouped_entities=True`) with the following keys:
+
+ - **word** (:obj:`str`) -- The token/word classified.
+ - **score** (:obj:`float`) -- The corresponding probability for :obj:`entity`.
+ - **entity** (:obj:`str`) -- The entity predicted for that token/word.
+ - **index** (:obj:`int`, only present when ``self.grouped_entities=False``) -- The index of the
+ corresponding token in the sentence.
+ """
inputs = self._args_parser(*args, **kwargs)
answers = []
for sentence in inputs:
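A hedged usage sketch (assuming the default NER checkpoint is available), printing the keys documented above:

```python
from transformers import pipeline

ner = pipeline("ner")
for token in ner("Hugging Face Inc. is based in New York City."):
    print(token["word"], token["entity"], round(token["score"], 3))
```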
@@ -1235,7 +1359,10 @@ def __call__(self, *args, **kwargs):
def group_sub_entities(self, entities: List[dict]) -> dict:
"""
- Returns grouped sub entities
+ Group together the adjacent tokens with the same entity predicted.
+
+ Args:
+ entities (:obj:`List[dict]`): The entities predicted by the pipeline.
"""
# Get the first entity in the entity group
entity = entities[0]["entity"]
@@ -1251,7 +1378,10 @@ def group_sub_entities(self, entities: List[dict]) -> dict:
def group_entities(self, entities: List[dict]) -> List[dict]:
"""
- Returns grouped entities
+ Find and group together the adjacent tokens with the same entity predicted.
+
+ Args:
+ entities (:obj:`List[dict]`): The entities predicted by the pipeline.
"""
entity_groups = []
@@ -1295,10 +1425,10 @@ def group_entities(self, entities: List[dict]) -> List[dict]:
class QuestionAnsweringArgumentHandler(ArgumentHandler):
"""
QuestionAnsweringPipeline requires the user to provide multiple arguments (i.e. question & context) to be mapped
- to internal SquadExample / SquadFeature structures.
+ to internal :class:`~transformers.SquadExample`.
- QuestionAnsweringArgumentHandler manages all the possible to create SquadExample from the command-line supplied
- arguments.
+ QuestionAnsweringArgumentHandler manages all the possible ways to create a :class:`~transformers.SquadExample` from
+ the command-line supplied arguments.
"""
def __call__(self, *args, **kwargs):
@@ -1354,41 +1484,18 @@ def __call__(self, *args, **kwargs):
return inputs
+@add_end_docstrings(PIPELINE_INIT_ARGS)
class QuestionAnsweringPipeline(Pipeline):
"""
- Question Answering pipeline using ModelForQuestionAnswering head. See the
- `question answering usage <../usage.html#question-answering>`__ examples for more information.
-
- This question answering can currently be loaded from the :func:`~transformers.pipeline` method using
- the following task identifier(s):
+ Question Answering pipeline using any :obj:`ModelForQuestionAnswering`. See the
+ `question answering examples <../task_summary.html#question-answering>`__ for more information.
- - "question-answering", for answering questions given a context.
+ This question answering pipeline can currently be loaded from :func:`~transformers.pipeline` using the following
+ task identifier: :obj:`"question-answering"`.
The models that this pipeline can use are models that have been fine-tuned on a question answering task.
See the up-to-date list of available models on
`huggingface.co/models `__.
-
- Arguments:
- model (:obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`):
- The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
- :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
- TensorFlow.
- tokenizer (:obj:`~transformers.PreTrainedTokenizer`):
- The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
- :class:`~transformers.PreTrainedTokenizer`.
- modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
- Model card attributed to the model for this pipeline.
- framework (:obj:`str`, `optional`, defaults to :obj:`None`):
- The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
- installed.
-
- If no framework is specified, will default to the one currently installed. If no framework is specified
- and both frameworks are installed, will default to PyTorch.
- args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
- Reference to the object in charge of parsing supplied pipeline parameters.
- device (:obj:`int`, `optional`, defaults to :obj:`-1`):
- Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
- on the associated CUDA device id.
"""
default_input_names = "question,context"
@@ -1423,15 +1530,19 @@ def create_sample(
question: Union[str, List[str]], context: Union[str, List[str]]
) -> Union[SquadExample, List[SquadExample]]:
"""
- QuestionAnsweringPipeline leverages the SquadExample/SquadFeatures internally.
- This helper method encapsulate all the logic for converting question(s) and context(s) to SquadExample(s).
+ QuestionAnsweringPipeline leverages the :class:`~transformers.SquadExample` internally.
+ This helper method encapsulates all the logic for converting question(s) and context(s) to
+ :class:`~transformers.SquadExample`.
+
We currently support extractive question answering.
+
Arguments:
- question: (str, List[str]) The question to be ask for the associated context
- context: (str, List[str]) The context in which we will look for the answer.
+ question (:obj:`str` or :obj:`List[str]`): The question(s) asked.
+ context (:obj:`str` or :obj:`List[str]`): The context(s) in which we will look for the answer.
Returns:
- SquadExample initialized with the corresponding question and context.
+ One or a list of :class:`~transformers.SquadExample`: The corresponding
+ :class:`~transformers.SquadExample` grouping question and context.
"""
if isinstance(question, list):
return [SquadExample(None, q, c, None, None, None) for q, c in zip(question, context)]
@@ -1440,18 +1551,45 @@ def create_sample(
def __call__(self, *args, **kwargs):
"""
+ Answer the question(s) given as inputs by using the context(s).
+
Args:
- We support multiple use-cases, the following are exclusive:
- X: sequence of SquadExample
- data: sequence of SquadExample
- question: (str, List[str]), batch of question(s) to map along with context
- context: (str, List[str]), batch of context(s) associated with the provided question keyword argument
- Returns:
- dict: {'answer': str, 'score": float, 'start": int, "end": int}
- answer: the textual answer in the intial context
- score: the score the current answer scored for the model
- start: the character index in the original string corresponding to the beginning of the answer' span
- end: the character index in the original string corresponding to the ending of the answer' span
+ args (:class:`~transformers.SquadExample` or a list of :class:`~transformers.SquadExample`):
+ One or several :class:`~transformers.SquadExample` containing the question and context.
+ X (:class:`~transformers.SquadExample` or a list of :class:`~transformers.SquadExample`, `optional`):
+ One or several :class:`~transformers.SquadExample` containing the question and context
+ (will be treated the same way as if passed as the first positional argument).
+ data (:class:`~transformers.SquadExample` or a list of :class:`~transformers.SquadExample`, `optional`):
+ One or several :class:`~transformers.SquadExample` containing the question and context
+ (will be treated the same way as if passed as the first positional argument).
+ question (:obj:`str` or :obj:`List[str]`):
+ One or several question(s) (must be used in conjunction with the :obj:`context` argument).
+ context (:obj:`str` or :obj:`List[str]`):
+ One or several context(s) associated with the question(s) (must be used in conjunction with the
+ :obj:`question` argument).
+ topk (:obj:`int`, `optional`, defaults to 1):
+ The number of answers to return (will be chosen by order of likelihood).
+ doc_stride (:obj:`int`, `optional`, defaults to 128):
+ If the context is too long to fit with the question for the model, it will be split in several chunks
+ with some overlap. This argument controls the size of that overlap.
+ max_answer_len (:obj:`int`, `optional`, defaults to 15):
+ The maximum length of predicted answers (e.g., only answers with a shorter length are considered).
+ max_seq_len (:obj:`int`, `optional`, defaults to 384):
+ The maximum length of the total sentence (context + question) after tokenization. The context will be
+ split in several chunks (using :obj:`doc_stride`) if needed.
+ max_question_len (:obj:`int`, `optional`, defaults to 64):
+ The maximum length of the question after tokenization. It will be truncated if needed.
+ handle_impossible_answer (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ Whether or not we accept impossible as an answer.
+
+ Return:
+ A :obj:`dict` or a list of :obj:`dict`: Each result comes as a dictionary with the
+ following keys:
+
+ - **score** (:obj:`float`) -- The probability associated to the answer.
+ - **start** (:obj:`int`) -- The start index of the answer (in the tokenized version of the input).
+ - **end** (:obj:`int`) -- The end index of the answer (in the tokenized version of the input).
+ - **answer** (:obj:`str`) -- The answer to the question.
"""
# Set defaults values
kwargs.setdefault("topk", 1)
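A hedged usage sketch (assuming the default question-answering checkpoint downloads fine), using the ``question``/``context`` keyword form described above:

```python
from transformers import pipeline

qa = pipeline("question-answering")
answer = qa(
    question="Where is the Eiffel Tower located?",
    context="The Eiffel Tower is a wrought-iron lattice tower located in Paris, France.",
)
# With the default topk=1 and a single question, a single dict is returned.
print(answer["answer"], answer["score"])
```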
@@ -1551,17 +1689,18 @@ def __call__(self, *args, **kwargs):
def decode(self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int) -> Tuple:
"""
- Take the output of any QuestionAnswering head and will generate probalities for each span to be
+ Take the output of any :obj:`ModelForQuestionAnswering` and generate probabilities for each span to be
the actual answer.
+
In addition, it filters out some unwanted/impossible cases like answer len being greater than
max_answer_len or answer end position being before the starting position.
The method supports output the k-best answer through the topk argument.
Args:
- start: numpy array, holding individual start probabilities for each token
- end: numpy array, holding individual end probabilities for each token
- topk: int, indicates how many possible answer span(s) to extract from the model's output
- max_answer_len: int, maximum size of the answer to extract from the model's output
+ start (:obj:`np.ndarray`): Individual start probabilities for each token.
+ end (:obj:`np.ndarray`): Individual end probabilities for each token.
+ topk (:obj:`int`): Indicates how many possible answer span(s) to extract from the model output.
+ max_answer_len (:obj:`int`): Maximum size of the answer to extract from the model's output.
"""
# Ensure we have batch axis
if start.ndim == 1:
@@ -1589,18 +1728,18 @@ def decode(self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len:
start, end = np.unravel_index(idx_sort, candidates.shape)[1:]
return start, end, candidates[0, start, end]
- def span_to_answer(self, text: str, start: int, end: int):
+ def span_to_answer(self, text: str, start: int, end: int) -> Dict[str, Union[str, int]]:
"""
When decoding from token probalities, this method maps token indexes to actual word in
the initial context.
Args:
- text: str, the actual context to extract the answer from
- start: int, starting answer token index
- end: int, ending answer token index
+ text (:obj:`str`): The actual context to extract the answer from.
+ start (:obj:`int`): The answer starting token index.
+ end (:obj:`int`): The answer end token index.
Returns:
- dict: {'answer': str, 'start': int, 'end': int}
+ Dictionary like :obj:`{'answer': str, 'start': int, 'end': int}`
"""
words = []
token_idx = char_start_idx = char_end_idx = chars_idx = 0
@@ -1634,9 +1773,18 @@ def span_to_answer(self, text: str, start: int, end: int):
}
+@add_end_docstrings(PIPELINE_INIT_ARGS)
class SummarizationPipeline(Pipeline):
"""
- Summarize news articles and other documents
+ Summarize news articles and other documents.
+
+ This summarizing pipeline can currently be loaded from :func:`~transformers.pipeline` using the following
+ task identifier: :obj:`"summarization"`.
+
+ The models that this pipeline can use are models that have been fine-tuned on a summarization task,
+ which currently include '`bart-large-cnn`', '`t5-small`', '`t5-base`', '`t5-large`', '`t5-3b`', '`t5-11b`'.
+ See the up-to-date list of available models on
+ `huggingface.co/models `__.
Usage::
@@ -1647,39 +1795,6 @@ class SummarizationPipeline(Pipeline):
# use t5 in tf
summarizer = pipeline("summarization", model="t5-base", tokenizer="t5-base", framework="tf")
summarizer("Sam Shleifer writes the best docstring examples in the whole world.", min_length=5, max_length=20)
-
- The models that this pipeline can use are models that have been fine-tuned on a summarization task,
- which is currently, '`bart-large-cnn`', '`t5-small`', '`t5-base`', '`t5-large`', '`t5-3b`', '`t5-11b`'.
- See the up-to-date list of available models on
- `huggingface.co/models `__.
-
- Arguments:
- model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
- The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string
- checkpoint identifier or an actual pre-trained model inheriting from
- :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
- TensorFlow.
-
- If :obj:`None`, the default of the pipeline will be loaded.
- tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
- The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
- a string checkpoint identifier or an actual pre-trained tokenizer inheriting from
- :class:`~transformers.PreTrainedTokenizer`.
-
- If :obj:`None`, the default of the pipeline will be loaded.
- modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
- Model card attributed to the model for this pipeline.
- framework (:obj:`str`, `optional`, defaults to :obj:`None`):
- The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
- installed.
-
- If no framework is specified, will default to the one currently installed. If no framework is specified
- and both frameworks are installed, will default to PyTorch.
- args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
- Reference to the object in charge of parsing supplied pipeline parameters.
- device (:obj:`int`, `optional`, defaults to :obj:`-1`):
- Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
- on the associated CUDA device id.
"""
def __init__(self, *args, **kwargs):
@@ -1694,20 +1809,29 @@ def __call__(
self, *documents, return_tensors=False, return_text=True, clean_up_tokenization_spaces=False, **generate_kwargs
):
r"""
- Args:
- *documents: (list of strings) articles to be summarized
- return_text: (bool, default=True) whether to add a decoded "summary_text" to each result
- return_tensors: (bool, default=False) whether to return the raw "summary_token_ids" to each result
-
- clean_up_tokenization_spaces: (`optional`) bool whether to include extra spaces in the output
- **generate_kwargs: extra kwargs passed to `self.model.generate`_
+ Summarize the text(s) given as inputs.
- Returns:
- list of dicts with 'summary_text' and/or 'summary_token_ids' for each document_to_summarize
+ Args:
+        documents (:obj:`str` or :obj:`List[str]`):
+ One or several articles (or one list of articles) to summarize.
+ return_text (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether or not to include the decoded texts in the outputs.
+ return_tensors (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether or not to include the tensors of predictions (as token indices) in the outputs.
+ clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ Whether or not to clean up the potential extra spaces in the text output.
+ generate_kwargs:
+ Additional keyword arguments to pass along to the generate method of the model (see the generate
+ method corresponding to your framework `here <./model.html#generative-models>`__).
- .. _`self.model.generate`:
- https://huggingface.co/transformers/model_doc/bart.html#transformers.BartForConditionalGeneration.generate
+ Return:
+        A list or a list of lists of :obj:`dict`: Each result comes as a dictionary with the
+ following keys:
+ - **summary_text** (:obj:`str`, present when ``return_text=True``) -- The summary of the corresponding
+ input.
+ - **summary_token_ids** (:obj:`torch.Tensor` or :obj:`tf.Tensor`, present when ``return_tensors=True``)
+ -- The token ids of the summary.
"""
assert return_tensors or return_text, "You must specify return_tensors=True or return_text=True"
assert len(documents) > 0, "Please provide a document to summarize"
@@ -1779,43 +1903,21 @@ def __call__(
return results
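A short sketch of how the documented output keys surface in practice, assuming the default summarization checkpoint can be downloaded:

```python
from transformers import pipeline

summarizer = pipeline("summarization")

outputs = summarizer(
    "Sam Shleifer writes the best docstring examples in the whole world.",
    min_length=5,
    max_length=20,
    return_text=True,
    return_tensors=False,
)
# With return_text=True each result carries the decoded summary.
print(outputs[0]["summary_text"])
```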
+@add_end_docstrings(PIPELINE_INIT_ARGS)
class TranslationPipeline(Pipeline):
"""
Translates from one language to another.
- Usage::
- en_fr_translator = pipeline("translation_en_to_fr")
- en_fr_translator("How old are you?")
+ This translation pipeline can currently be loaded from :func:`~transformers.pipeline` using the following
+ task identifier: :obj:`"translation_xx_to_yy"`.
- The models that this pipeline can use are models that have been fine-tuned on a translation task,
- currently: "t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"
+ The models that this pipeline can use are models that have been fine-tuned on a translation task.
See the up-to-date list of available models on
`huggingface.co/models `__.
- Arguments:
- model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
- The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string
- checkpoint identifier or an actual pre-trained model inheriting from
- :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
- TensorFlow.
- If :obj:`None`, the default of the pipeline will be loaded.
- tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
- The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
- a string checkpoint identifier or an actual pre-trained tokenizer inheriting from
- :class:`~transformers.PreTrainedTokenizer`.
- If :obj:`None`, the default of the pipeline will be loaded.
- modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
- Model card attributed to the model for this pipeline.
- framework (:obj:`str`, `optional`, defaults to :obj:`None`):
- The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
- installed.
- If no framework is specified, will default to the one currently installed. If no framework is specified
- and both frameworks are installed, will default to PyTorch.
- args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
- Reference to the object in charge of parsing supplied pipeline parameters.
- device (:obj:`int`, `optional`, defaults to :obj:`-1`):
- Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
- on the associated CUDA device id.
+ Usage::
+ en_fr_translator = pipeline("translation_en_to_fr")
+ en_fr_translator("How old are you?")
"""
def __init__(self, *args, **kwargs):
@@ -1829,17 +1931,28 @@ def __call__(
self, *args, return_tensors=False, return_text=True, clean_up_tokenization_spaces=False, **generate_kwargs
):
r"""
+ Translate the text(s) given as inputs.
+
Args:
- *args: (list of strings) texts to be translated
- return_text: (bool, default=True) whether to add a decoded "translation_text" to each result
- return_tensors: (bool, default=False) whether to return the raw "translation_token_ids" to each result
+ args (:obj:`str` or :obj:`List[str]`):
+ Texts to be translated.
+ return_tensors (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether or not to include the tensors of predictions (as token indices) in the outputs.
+ return_text (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ Whether or not to include the decoded texts in the outputs.
+ clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ Whether or not to clean up the potential extra spaces in the text output.
+ generate_kwargs:
+ Additional keyword arguments to pass along to the generate method of the model (see the generate
+ method corresponding to your framework `here <./model.html#generative-models>`__).
- **generate_kwargs: extra kwargs passed to `self.model.generate`_
+ Return:
+        A list or a list of lists of :obj:`dict`: Each result comes as a dictionary with the
+ following keys:
- Returns:
- list of dicts with 'translation_text' and/or 'translation_token_ids' for each text_to_translate
- .. _`self.model.generate`:
- https://huggingface.co/transformers/model_doc/bart.html#transformers.BartForConditionalGeneration.generate
+ - **translation_text** (:obj:`str`, present when ``return_text=True``) -- The translation.
+ - **translation_token_ids** (:obj:`torch.Tensor` or :obj:`tf.Tensor`, present when ``return_tensors=True``)
+ -- The token ids of the translation.
"""
assert return_tensors or return_text, "You must specify return_tensors=True or return_text=True"
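A minimal sketch of the documented `translation_text` key, reusing the task identifier shown in the class docstring (assumes the default checkpoint for that task is available):

```python
from transformers import pipeline

en_fr_translator = pipeline("translation_en_to_fr")

outputs = en_fr_translator("How old are you?", return_text=True)
# With return_text=True each result carries the decoded translation.
print(outputs[0]["translation_text"])
```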
@@ -1901,10 +2014,20 @@ def __call__(
class Conversation:
"""
Utility class containing a conversation and its history. This class is meant to be used as an input to the
- :obj:`~transformers.ConversationalPipeline`. The conversation contains a number of utility function to manage the addition of new
- user input and generated model responses. A conversation needs to contain an unprocessed user input before being
- passed to the :obj:`~transformers.ConversationalPipeline`. This user input is either created when the class is instantiated, or by calling
- `append_response("input")` after a conversation turn.
+    :class:`~transformers.ConversationalPipeline`. The conversation contains a number of utility functions to manage the
+ addition of new user input and generated model responses. A conversation needs to contain an unprocessed user input
+ before being passed to the :class:`~transformers.ConversationalPipeline`. This user input is either created when
+    the class is instantiated, or by calling :obj:`conversational_pipeline.append_response("input")` after a conversation
+ turn.
+
+ Arguments:
+ text (:obj:`str`, `optional`):
+ The initial user input to start the conversation. If not provided, a user input needs to be provided
+ manually using the :meth:`~transformers.Conversation.add_user_input` method before the conversation can
+ begin.
+ conversation_id (:obj:`uuid.UUID`, `optional`):
+ Unique identifier for the conversation. If not provided, a random UUID4 id will be assigned to the
+ conversation.
Usage::
@@ -1917,14 +2040,6 @@ class Conversation:
conversation.append_response("The Big lebowski.")
conversation.add_user_input("Is it good?")
-
- Arguments:
- text (:obj:`str`, `optional`, defaults to :obj:`None`):
- The initial user input to start the conversation.
- If :obj:`None`, a user input needs to be provided manually using `add_user_input` before the conversation can begin.
- conversation_id (:obj:`uuid.UUID`, `optional`, defaults to :obj:`None`):
- Unique identifier for the conversation
- If :obj:`None`, the random UUID4 id will be assigned to the conversation.
"""
def __init__(self, text: str = None, conversation_id: UUID = None):
@@ -1938,12 +2053,13 @@ def __init__(self, text: str = None, conversation_id: UUID = None):
def add_user_input(self, text: str, overwrite: bool = False):
"""
- Add a user input to the conversation for the next round. This populates the internal `new_user_input` field.
+ Add a user input to the conversation for the next round. This populates the internal :obj:`new_user_input`
+ field.
Args:
- text: str, the user input for the next conversation round
- overwrite: bool, flag indicating if existing and unprocessed user input should be overwritten when this function is called
-
+ text (:obj:`str`): The user input for the next conversation round.
+ overwrite (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ Whether or not existing and unprocessed user input should be overwritten when this function is called.
"""
if self.new_user_input:
if overwrite:
@@ -1963,8 +2079,8 @@ def add_user_input(self, text: str, overwrite: bool = False):
def mark_processed(self):
"""
- Mark the conversation as processed (moves the content of `new_user_input` to `past_user_inputs`) and empties the
- `new_user_input` field.
+ Mark the conversation as processed (moves the content of :obj:`new_user_input` to :obj:`past_user_inputs`) and
+ empties the :obj:`new_user_input` field.
"""
if self.new_user_input:
self.past_user_inputs.append(self.new_user_input)
@@ -1975,17 +2091,17 @@ def append_response(self, response: str):
Append a response to the list of generated responses.
Args:
- response: str, the model generated response
+ response (:obj:`str`): The model generated response.
"""
self.generated_responses.append(response)
def set_history(self, history: List[int]):
"""
- Updates the value of the history of the conversation. The history is represented by a list of `token_ids`. The
- history is used by the model to generate responses based on the previous conversation turns.
+ Updates the value of the history of the conversation. The history is represented by a list of :obj:`token_ids`.
+ The history is used by the model to generate responses based on the previous conversation turns.
Args:
- history: (list of int), history of tokens provided and generated for this conversation
+ history (:obj:`List[int]`): History of tokens provided and generated for this conversation.
"""
self.history = history
@@ -1994,7 +2110,7 @@ def __repr__(self):
Generates a string representation of the conversation.
Return:
- :obj:`str` or :obj:`Dict`:
+ :obj:`str`:
Example:
Conversation id: 7d15686b-dc94-49f2-9c4b-c9eac6a1f114
@@ -2010,10 +2126,25 @@ def __repr__(self):
return output
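To make the turn-management contract above concrete, here is a small sketch that drives a `Conversation` through one round by hand, using only the methods documented in these hunks (the `ConversationalPipeline` normally performs the `mark_processed`/`append_response` steps itself; this assumes `Conversation` is importable from the top-level package):

```python
from transformers import Conversation

conversation = Conversation("Going to the movies tonight - any suggestions?")

# What the pipeline does after generating a reply:
conversation.mark_processed()                      # move new_user_input to past_user_inputs
conversation.append_response("The Big Lebowski.")  # record the model generated response

# Prepare the next round before calling the pipeline again.
conversation.add_user_input("Is it good?")
print(conversation)
```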
+@add_end_docstrings(
+ PIPELINE_INIT_ARGS,
+ r"""
+ min_length_for_response (:obj:`int`, `optional`, defaults to 32):
+ The minimum length (in number of tokens) for a response.
+ """,
+)
class ConversationalPipeline(Pipeline):
"""
Multi-turn conversational pipeline.
+ This conversational pipeline can currently be loaded from :func:`~transformers.pipeline` using the following
+ task identifier: :obj:`"conversational"`.
+
+ The models that this pipeline can use are models that have been fine-tuned on a multi-turn conversational task,
+ currently: `'microsoft/DialoGPT-small'`, `'microsoft/DialoGPT-medium'`, `'microsoft/DialoGPT-large'`.
+ See the up-to-date list of available models on
+ `huggingface.co/models `__.
+
Usage::
conversational_pipeline = pipeline("conversational")
@@ -2027,36 +2158,6 @@ class ConversationalPipeline(Pipeline):
conversation_2.add_user_input("What is the genre of this book?")
conversational_pipeline([conversation_1, conversation_2])
-
- The models that this pipeline can use are models that have been fine-tuned on a multi-turn conversational task,
- currently: "microsoft/DialoGPT-small", "microsoft/DialoGPT-medium", "microsoft/DialoGPT-large"
- See the up-to-date list of available models on
- `huggingface.co/models `__.
-
- Arguments:
- model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
- The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string
- checkpoint identifier or an actual pre-trained model inheriting from
- :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
- TensorFlow.
- If :obj:`None`, the default of the pipeline will be loaded.
- tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
- The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
- a string checkpoint identifier or an actual pre-trained tokenizer inheriting from
- :class:`~transformers.PreTrainedTokenizer`.
- If :obj:`None`, the default of the pipeline will be loaded.
- modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
- Model card attributed to the model for this pipeline.
- framework (:obj:`str`, `optional`, defaults to :obj:`None`):
- The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
- installed.
- If no framework is specified, will default to the one currently installed. If no framework is specified
- and both frameworks are installed, will default to PyTorch.
- args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
- Reference to the object in charge of parsing supplied pipeline parameters.
- device (:obj:`int`, `optional`, defaults to :obj:`-1`):
- Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
- on the associated CUDA device id.
"""
def __init__(self, min_length_for_response=32, *args, **kwargs):
@@ -2075,12 +2176,20 @@ def __call__(
**generate_kwargs
):
r"""
+ Generate responses for the conversation(s) given as inputs.
+
Args:
- conversations: (list of :class:`~transformers.pipelines.Conversation`) Conversations to generate responses for
- **generate_kwargs: extra kwargs passed to `self.model.generate`_
+ conversations (a :class:`~transformers.Conversation` or a list of :class:`~transformers.Conversation`):
+ Conversations to generate responses for.
+ clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ Whether or not to clean up the potential extra spaces in the text output.
+ generate_kwargs:
+ Additional keyword arguments to pass along to the generate method of the model (see the generate
+ method corresponding to your framework `here <./model.html#generative-models>`__).
Returns:
- list of conversations with updated generated responses for those containing a new user input
+ :class:`~transformers.Conversation` or a list of :class:`~transformers.Conversation`: Conversation(s) with
+ updated generated responses for those containing a new user input.
"""
# Input validation
@@ -2315,56 +2424,58 @@ def pipeline(
**kwargs
) -> Pipeline:
"""
- Utility factory method to build a pipeline.
+ Utility factory method to build a :class:`~transformers.Pipeline`.
- Pipeline are made of:
-
- - A Tokenizer instance in charge of mapping raw textual input to token
- - A Model instance
- - Some (optional) post processing for enhancing model's output
+ Pipelines are made of:
+        - A :doc:`tokenizer ` in charge of mapping raw textual input to tokens.
+ - A :doc:`model ` to make predictions from the inputs.
+ - Some (optional) post processing for enhancing model's output.
Args:
task (:obj:`str`):
The task defining which pipeline will be returned. Currently accepted tasks are:
- - "feature-extraction": will return a :class:`~transformers.FeatureExtractionPipeline`
- - "sentiment-analysis": will return a :class:`~transformers.TextClassificationPipeline`
- - "ner": will return a :class:`~transformers.TokenClassificationPipeline`
- - "question-answering": will return a :class:`~transformers.QuestionAnsweringPipeline`
- - "fill-mask": will return a :class:`~transformers.FillMaskPipeline`
- - "summarization": will return a :class:`~transformers.SummarizationPipeline`
- - "translation_xx_to_yy": will return a :class:`~transformers.TranslationPipeline`
- - "text-generation": will return a :class:`~transformers.TextGenerationPipeline`
- model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
- The model that will be used by the pipeline to make predictions. This can be :obj:`None`,
- a model identifier or an actual pre-trained model inheriting from
- :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
- TensorFlow.
-
- If :obj:`None`, the default for this pipeline will be loaded.
- config (:obj:`str` or :obj:`~transformers.PretrainedConfig`, `optional`, defaults to :obj:`None`):
- The configuration that will be used by the pipeline to instantiate the model. This can be :obj:`None`,
- a model identifier or an actual pre-trained model configuration inheriting from
+ - :obj:`"feature-extraction"`: will return a :class:`~transformers.FeatureExtractionPipeline`.
+ - :obj:`"sentiment-analysis"`: will return a :class:`~transformers.TextClassificationPipeline`.
+ - :obj:`"ner"`: will return a :class:`~transformers.TokenClassificationPipeline`.
+ - :obj:`"question-answering"`: will return a :class:`~transformers.QuestionAnsweringPipeline`.
+ - :obj:`"fill-mask"`: will return a :class:`~transformers.FillMaskPipeline`.
+ - :obj:`"summarization"`: will return a :class:`~transformers.SummarizationPipeline`.
+ - :obj:`"translation_xx_to_yy"`: will return a :class:`~transformers.TranslationPipeline`.
+ - :obj:`"text-generation"`: will return a :class:`~transformers.TextGenerationPipeline`.
+ - :obj:`"conversation"`: will return a :class:`~transformers.ConversationalPipeline`.
+ model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`):
+ The model that will be used by the pipeline to make predictions. This can be a model identifier or an
+ actual instance of a pretrained model inheriting from :class:`~transformers.PreTrainedModel` (for PyTorch)
+ or :class:`~transformers.TFPreTrainedModel` (for TensorFlow).
+
+ If not provided, the default for the :obj:`task` will be loaded.
+ config (:obj:`str` or :obj:`~transformers.PretrainedConfig`, `optional`):
+ The configuration that will be used by the pipeline to instantiate the model. This can be a model
+ identifier or an actual pretrained model configuration inheriting from
:class:`~transformers.PretrainedConfig`.
- If :obj:`None`, the default for this pipeline will be loaded.
- tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
- The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
- a model identifier or an actual pre-trained tokenizer inheriting from
+ If not provided, the default for the :obj:`task` will be loaded.
+ tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`):
+ The tokenizer that will be used by the pipeline to encode data for the model. This can be a model
+ identifier or an actual pretrained tokenizer inheriting from
:class:`~transformers.PreTrainedTokenizer`.
- If :obj:`None`, the default for this pipeline will be loaded.
- framework (:obj:`str`, `optional`, defaults to :obj:`None`):
- The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
- installed.
+ If not provided, the default for the :obj:`task` will be loaded.
+ framework (:obj:`str`, `optional`):
+ The framework to use, either :obj:`"pt"` for PyTorch or :obj:`"tf"` for TensorFlow. The specified framework
+ must be installed.
If no framework is specified, will default to the one currently installed. If no framework is specified
- and both frameworks are installed, will default to PyTorch.
+ and both frameworks are installed, will default to the framework of the :obj:`model`, or to PyTorch if no
+ model is provided.
+ kwargs:
+ Additional keyword arguments passed along to the specific pipeline init (see the documentation for the
+ corresponding pipeline class for possible values).
Returns:
- :class:`~transformers.Pipeline`: Class inheriting from :class:`~transformers.Pipeline`, according to
- the task.
+ :class:`~transformers.Pipeline`: A suitable pipeline for the task.
Examples::
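An illustrative sketch only (not the library's own examples block), assuming the default checkpoints for the listed tasks can be downloaded:

```python
from transformers import pipeline

# Default model and tokenizer for the task identifier.
sentiment = pipeline("sentiment-analysis")
print(sentiment("Transformers pipelines are easy to use."))

# Question answering with the task's default checkpoint.
qa = pipeline("question-answering")
print(qa(question="What do pipelines wrap?", context="Pipelines wrap a tokenizer and a model."))
```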
From 3c289fb38c61db0efc11a6b32451734fb385fccc Mon Sep 17 00:00:00 2001
From: Kevin Canwen Xu
Date: Tue, 4 Aug 2020 01:17:56 +0800
Subject: [PATCH 088/127] Remove outdated BERT tips (#6217)
* Remove out-dated BERT tips
* Update modeling_outputs.py
* Update bert.rst
* Update bert.rst
---
docs/source/model_doc/bert.rst | 9 ++-------
src/transformers/modeling_outputs.py | 4 ----
2 files changed, 2 insertions(+), 11 deletions(-)
diff --git a/docs/source/model_doc/bert.rst b/docs/source/model_doc/bert.rst
index cbc1c8aa7741..1666260f96e5 100644
--- a/docs/source/model_doc/bert.rst
+++ b/docs/source/model_doc/bert.rst
@@ -27,13 +27,8 @@ Tips:
- BERT is a model with absolute position embeddings so it's usually advised to pad the inputs on
the right rather than the left.
-- BERT was trained with a masked language modeling (MLM) objective. It is therefore efficient at predicting masked
- tokens and at NLU in general, but is not optimal for text generation. Models trained with a causal language
- modeling (CLM) objective are better in that regard.
-- Alongside MLM, BERT was trained using a next sentence prediction (NSP) objective using the [CLS] token as a sequence
- approximate. The user may use this token (the first token in a sequence built with special tokens) to get a sequence
- prediction rather than a token prediction. However, averaging over the sequence may yield better results than using
- the [CLS] token.
+- BERT was trained with the masked language modeling (MLM) and next sentence prediction (NSP) objectives. It is efficient at predicting masked
+ tokens and at NLU in general, but is not optimal for text generation.
The original code can be found `here `_.
diff --git a/src/transformers/modeling_outputs.py b/src/transformers/modeling_outputs.py
index 3a91d17904d5..1c36dc2d81ac 100644
--- a/src/transformers/modeling_outputs.py
+++ b/src/transformers/modeling_outputs.py
@@ -45,10 +45,6 @@ class BaseModelOutputWithPooling(ModelOutput):
further processed by a Linear layer and a Tanh activation function. The Linear
layer weights are trained from the next sentence prediction (classification)
objective during pretraining.
-
- This output is usually *not* a good summary
- of the semantic content of the input, you're often better with averaging or pooling
- the sequence of hidden-states for the whole input sequence.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
From 0513f8d275022d4055b710a33cd520b2000982bf Mon Sep 17 00:00:00 2001
From: Victor SANH
Date: Mon, 3 Aug 2020 15:02:51 -0400
Subject: [PATCH 089/127] correct label extraction + add note on discrepancies
on trained MNLI model and HANS (#6221)
---
examples/adversarial/utils_hans.py | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/examples/adversarial/utils_hans.py b/examples/adversarial/utils_hans.py
index 8f230fad9819..1860c98dccf7 100644
--- a/examples/adversarial/utils_hans.py
+++ b/examples/adversarial/utils_hans.py
@@ -255,7 +255,11 @@ def get_dev_examples(self, data_dir):
return self._create_examples(self._read_tsv(os.path.join(data_dir, "heuristics_evaluation_set.txt")), "dev")
def get_labels(self):
- """See base class."""
+ """See base class.
+ Note that we follow the standard three labels for MNLI
+ (see :class:`~transformers.data.processors.utils.MnliProcessor`)
+ but the HANS evaluation groups `contradiction` and `neutral` into `non-entailment` (label 0) while
+ `entailment` is label 1."""
return ["contradiction", "entailment", "neutral"]
def _create_examples(self, lines, set_type):
@@ -268,7 +272,7 @@ def _create_examples(self, lines, set_type):
text_a = line[5]
text_b = line[6]
pairID = line[7][2:] if line[7].startswith("ex") else line[7]
- label = line[-1]
+ label = line[0]
examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label, pairID=pairID))
return examples
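A tiny sketch of the evaluation-time grouping the docstring note describes (an illustration of the mapping, not part of the script):

```python
def mnli_to_hans(label: str) -> int:
    # HANS groups "contradiction" and "neutral" into non-entailment (0); "entailment" is 1.
    return 1 if label == "entailment" else 0

assert [mnli_to_hans(label) for label in ["contradiction", "entailment", "neutral"]] == [0, 1, 0]
```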
From 57eb1cb68d1c567b25ac256444e5c1a77b8817a7 Mon Sep 17 00:00:00 2001
From: Sam Shleifer
Date: Mon, 3 Aug 2020 18:22:31 -0400
Subject: [PATCH 090/127] [s2s] Document better mbart finetuning command
(#6229)
* Document better MT command
* improve multigpu command
---
examples/seq2seq/README.md | 10 ++++------
examples/seq2seq/train_mbart_cc25_enro.sh | 2 +-
2 files changed, 5 insertions(+), 7 deletions(-)
diff --git a/examples/seq2seq/README.md b/examples/seq2seq/README.md
index 9d12dc33481f..dd026784169a 100644
--- a/examples/seq2seq/README.md
+++ b/examples/seq2seq/README.md
@@ -113,22 +113,20 @@ Best performing command:
# optionally
export ENRO_DIR='wmt_en_ro' # Download instructions above
# export WANDB_PROJECT="MT" # optional
-export MAX_LEN=200
+export MAX_LEN=128
export BS=4
-export GAS=8 # gradient accumulation steps
./train_mbart_cc25_enro.sh --output_dir enro_finetune_baseline --label_smoothing 0.1 --fp16_opt_level=O1 --logger_name wandb --sortish_sampler
```
-This should take < 6h/epoch on a 16GB v100 and achieve val_avg_ BLEU score above 25. (you can see metrics in wandb or metrics.json).
-To get results in line with fairseq, you need to do some postprocessing.
+This should take < 6h/epoch on a 16GB v100 and achieve a test BLEU score above 26.
+To get results in line with fairseq, you need to do some postprocessing (see `romanian_postprocessing.md`).
MultiGPU command
(using 8 GPUS as an example)
```bash
export ENRO_DIR='wmt_en_ro' # Download instructions above
# export WANDB_PROJECT="MT" # optional
-export MAX_LEN=200
+export MAX_LEN=128
export BS=4
-export GAS=1 # gradient accumulation steps
./train_mbart_cc25_enro.sh --output_dir enro_finetune_baseline --gpus 8 --logger_name wandb
```
### Finetuning Outputs
diff --git a/examples/seq2seq/train_mbart_cc25_enro.sh b/examples/seq2seq/train_mbart_cc25_enro.sh
index b8122aee3f41..90bfce3e94dd 100755
--- a/examples/seq2seq/train_mbart_cc25_enro.sh
+++ b/examples/seq2seq/train_mbart_cc25_enro.sh
@@ -10,7 +10,7 @@ python finetune.py \
--num_train_epochs 6 --src_lang en_XX --tgt_lang ro_RO \
--data_dir $ENRO_DIR \
--max_source_length $MAX_LEN --max_target_length $MAX_LEN --val_max_target_length $MAX_LEN --test_max_target_length $MAX_LEN \
- --train_batch_size=$BS --eval_batch_size=$BS --gradient_accumulation_steps=$GAS \
+ --train_batch_size=$BS --eval_batch_size=$BS \
--task translation \
--warmup_steps 500 \
--freeze_embeds \
From d740351f7dfd6176e40efaeca694aca5622a55cd Mon Sep 17 00:00:00 2001
From: Lysandre Debut
Date: Tue, 4 Aug 2020 02:37:12 -0400
Subject: [PATCH 091/127] Upgrade pip when doing CI (#6234)
* Upgrade pip when doing CI
* Don't forget Github CI
---
.circleci/config.yml | 7 +++++++
.github/workflows/github-torch-hub.yml | 1 +
.github/workflows/self-push.yml | 1 +
.github/workflows/self-scheduled.yml | 1 +
4 files changed, 10 insertions(+)
diff --git a/.circleci/config.yml b/.circleci/config.yml
index 100109539b69..7d6e3018f87d 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -10,6 +10,7 @@ jobs:
parallelism: 1
steps:
- checkout
+ - run: sudo pip install --upgrade pip
- run: sudo pip install .[sklearn,tf-cpu,torch,testing]
- run: sudo pip install codecov pytest-cov
- run: python -m pytest -n 8 --dist=loadfile -s ./tests/ --cov | tee output.txt
@@ -27,6 +28,7 @@ jobs:
parallelism: 1
steps:
- checkout
+ - run: sudo pip install --upgrade pip
- run: sudo pip install .[sklearn,torch,testing]
- run: python -m pytest -n 8 --dist=loadfile -s ./tests/ | tee output.txt
- store_artifacts:
@@ -43,6 +45,7 @@ jobs:
parallelism: 1
steps:
- checkout
+ - run: sudo pip install --upgrade pip
- run: sudo pip install .[sklearn,tf-cpu,testing]
- run: python -m pytest -n 8 --dist=loadfile -s ./tests/ | tee output.txt
- store_artifacts:
@@ -56,6 +59,7 @@ jobs:
RUN_CUSTOM_TOKENIZERS: yes
steps:
- checkout
+ - run: sudo pip install --upgrade pip
- run: sudo pip install .[ja,testing]
- run: python -m pytest -s ./tests/test_tokenization_bert_japanese.py | tee output.txt
- store_artifacts:
@@ -71,6 +75,7 @@ jobs:
parallelism: 1
steps:
- checkout
+ - run: sudo pip install --upgrade pip
- run: sudo pip install .[sklearn,torch,testing]
- run: sudo pip install -r examples/requirements.txt
- run: python -m pytest -n 8 --dist=loadfile -rA -s ./examples/ | tee output.txt
@@ -83,6 +88,7 @@ jobs:
- image: circleci/python:3.6
steps:
- checkout
+ - run: sudo pip install --upgrade pip
- run: sudo pip install .[tf,torch,docs]
- run: cd docs && make html SPHINXOPTS="-W"
- store_artifacts:
@@ -106,6 +112,7 @@ jobs:
parallelism: 1
steps:
- checkout
+ - run: sudo pip install --upgrade pip
# we need a version of isort with https://github.com/timothycrosley/isort/pull/1000
- run: sudo pip install git+git://github.com/timothycrosley/isort.git@e63ae06ec7d70b06df9e528357650281a3d3ec22#egg=isort
- run: sudo pip install .[tf,torch,quality]
diff --git a/.github/workflows/github-torch-hub.yml b/.github/workflows/github-torch-hub.yml
index ace9e029635b..923b0983b4c3 100644
--- a/.github/workflows/github-torch-hub.yml
+++ b/.github/workflows/github-torch-hub.yml
@@ -20,6 +20,7 @@ jobs:
python-version: 3.7
- name: Install dependencies
run: |
+ pip install --upgrade pip
pip install torch
pip install numpy tokenizers filelock requests tqdm regex sentencepiece sacremoses packaging
diff --git a/.github/workflows/self-push.yml b/.github/workflows/self-push.yml
index 027ccebf1c35..8784c9b08964 100644
--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@@ -35,6 +35,7 @@ jobs:
- name: Install dependencies
run: |
source .env/bin/activate
+ pip install --upgrade pip
pip install torch
pip install .[sklearn,testing]
diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml
index f0bd06000eec..a61a9b58079f 100644
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -31,6 +31,7 @@ jobs:
- name: Install dependencies
run: |
source .env/bin/activate
+ pip install --upgrade pip
pip install .[sklearn,torch,testing]
- name: Are GPUs recognized by our DL frameworks
From b390a5672aea995e65d031b3759274d92188e553 Mon Sep 17 00:00:00 2001
From: Gong Linyuan
Date: Tue, 4 Aug 2020 14:38:30 +0800
Subject: [PATCH 092/127] Make the order of additional special tokens
deterministic (#5704)
* Make the order of additional special tokens deterministic regardless of hash seeds
* Fix
---
src/transformers/tokenization_utils_base.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index f4cab85c71ff..267d72485fff 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -23,7 +23,7 @@
import logging
import os
import warnings
-from collections import UserDict
+from collections import OrderedDict, UserDict
from enum import Enum
from typing import Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union
@@ -1071,7 +1071,7 @@ def all_special_tokens_extended(self) -> List[Union[str, AddedToken]]:
set_attr = self.special_tokens_map_extended
for attr_value in set_attr.values():
all_toks = all_toks + (list(attr_value) if isinstance(attr_value, (list, tuple)) else [attr_value])
- all_toks = list(set(all_toks))
+ all_toks = list(OrderedDict.fromkeys(all_toks))
return all_toks
@property
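The behavioural difference this patch relies on, in isolation: `set` deduplicates but its iteration order depends on hashing, while `OrderedDict.fromkeys` deduplicates and keeps first-seen order, which makes the token list reproducible across runs.

```python
from collections import OrderedDict

all_toks = ["<mask>", "<s>", "</s>", "<s>", "<pad>"]

# The order of list(set(...)) can vary with hash seeds; this is stable:
print(list(OrderedDict.fromkeys(all_toks)))  # ['<mask>', '<s>', '</s>', '<pad>']
```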
From 5deed37f9f1a0f5794a2a7cd02164ff265c59524 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Mon, 3 Aug 2020 23:42:56 -0700
Subject: [PATCH 093/127] cleanup torch unittests (#6196)
* improve unit tests
this is a sample of one test according to the request in https://github.com/huggingface/transformers/issues/5973
before I apply it to the rest
* batch 1
* batch 2
* batch 3
* batch 4
* batch 5
* style
* non-tf template
* last deletion of check_loss_output
---
.../tests/test_modeling_xxx.py | 23 +++------
tests/test_modeling_albert.py | 32 ++++--------
tests/test_modeling_bert.py | 48 ++++++------------
tests/test_modeling_ctrl.py | 11 ++---
tests/test_modeling_distilbert.py | 28 +++--------
tests/test_modeling_dpr.py | 15 +++---
tests/test_modeling_electra.py | 27 +++-------
tests/test_modeling_encoder_decoder.py | 4 --
tests/test_modeling_flaubert.py | 47 +++++++-----------
tests/test_modeling_gpt2.py | 21 +++-----
tests/test_modeling_longformer.py | 32 ++++--------
tests/test_modeling_mobilebert.py | 42 +++++-----------
tests/test_modeling_openai.py | 19 ++-----
tests/test_modeling_reformer.py | 25 +++-------
tests/test_modeling_roberta.py | 23 +++------
tests/test_modeling_t5.py | 3 --
tests/test_modeling_xlm.py | 47 +++++++-----------
tests/test_modeling_xlnet.py | 49 +++++++------------
18 files changed, 157 insertions(+), 339 deletions(-)
diff --git a/templates/adding_a_new_model/tests/test_modeling_xxx.py b/templates/adding_a_new_model/tests/test_modeling_xxx.py
index d81c9a5009a8..3adaeee43009 100644
--- a/templates/adding_a_new_model/tests/test_modeling_xxx.py
+++ b/templates/adding_a_new_model/tests/test_modeling_xxx.py
@@ -126,9 +126,6 @@ def prepare_config_and_inputs(self):
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- def check_loss_output(self, result):
- self.parent.assertListEqual(list(result["loss"].size()), [])
-
def create_and_check_xxx_model(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
@@ -138,10 +135,8 @@ def create_and_check_xxx_model(
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
result = model(input_ids, token_type_ids=token_type_ids)
result = model(input_ids)
- self.parent.assertListEqual(
- list(result["last_hidden_state"].size()), [self.batch_size, self.seq_length, self.hidden_size]
- )
- self.parent.assertListEqual(list(result["pooler_output"].size()), [self.batch_size, self.hidden_size])
+ self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
+ self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
def create_and_check_xxx_for_masked_lm(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -152,8 +147,7 @@ def create_and_check_xxx_for_masked_lm(
result = model(
input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels
)
- self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
def create_and_check_xxx_for_question_answering(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -168,9 +162,8 @@ def create_and_check_xxx_for_question_answering(
start_positions=sequence_labels,
end_positions=sequence_labels,
)
- self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
- self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
+ self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
def create_and_check_xxx_for_sequence_classification(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -180,8 +173,7 @@ def create_and_check_xxx_for_sequence_classification(
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
- self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
def create_and_check_xxx_for_token_classification(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -191,8 +183,7 @@ def create_and_check_xxx_for_token_classification(
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
- self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
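The refactor leans on the fact that `torch.Size` is a tuple subclass, so a shape can be compared directly against a plain tuple; a quick check of the pattern (assuming torch is installed):

```python
import torch

logits = torch.zeros(2, 7, 99)

# torch.Size subclasses tuple, so the terse form used in the new tests is valid.
assert isinstance(logits.shape, tuple)
assert logits.shape == (2, 7, 99)
assert list(logits.size()) == [2, 7, 99]  # the older, more verbose pattern
```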
diff --git a/tests/test_modeling_albert.py b/tests/test_modeling_albert.py
index c7ad2d21922d..7abda856003a 100644
--- a/tests/test_modeling_albert.py
+++ b/tests/test_modeling_albert.py
@@ -103,9 +103,6 @@ def prepare_config_and_inputs(self):
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- def check_loss_output(self, result):
- self.parent.assertListEqual(list(result["loss"].size()), [])
-
def create_and_check_albert_model(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
@@ -115,10 +112,8 @@ def create_and_check_albert_model(
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
result = model(input_ids, token_type_ids=token_type_ids)
result = model(input_ids)
- self.parent.assertListEqual(
- list(result["last_hidden_state"].size()), [self.batch_size, self.seq_length, self.hidden_size]
- )
- self.parent.assertListEqual(list(result["pooler_output"].size()), [self.batch_size, self.hidden_size])
+ self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
+ self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
def create_and_check_albert_for_pretraining(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -133,11 +128,8 @@ def create_and_check_albert_for_pretraining(
labels=token_labels,
sentence_order_label=sequence_labels,
)
- self.parent.assertListEqual(
- list(result["prediction_logits"].size()), [self.batch_size, self.seq_length, self.vocab_size]
- )
- self.parent.assertListEqual(list(result["sop_logits"].size()), [self.batch_size, config.num_labels])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
+ self.parent.assertEqual(result.sop_logits.shape, (self.batch_size, config.num_labels))
def create_and_check_albert_for_masked_lm(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -146,8 +138,7 @@ def create_and_check_albert_for_masked_lm(
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
- self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
def create_and_check_albert_for_question_answering(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -162,9 +153,8 @@ def create_and_check_albert_for_question_answering(
start_positions=sequence_labels,
end_positions=sequence_labels,
)
- self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
- self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
+ self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
def create_and_check_albert_for_sequence_classification(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -174,8 +164,7 @@ def create_and_check_albert_for_sequence_classification(
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
- self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
def create_and_check_albert_for_token_classification(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -185,8 +174,7 @@ def create_and_check_albert_for_token_classification(
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
- self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
def create_and_check_albert_for_multiple_choice(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -204,7 +192,7 @@ def create_and_check_albert_for_multiple_choice(
token_type_ids=multiple_choice_token_type_ids,
labels=choice_labels,
)
- self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_choices])
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
diff --git a/tests/test_modeling_bert.py b/tests/test_modeling_bert.py
index a85d48983e55..60460aeb94b8 100644
--- a/tests/test_modeling_bert.py
+++ b/tests/test_modeling_bert.py
@@ -152,9 +152,6 @@ def prepare_config_and_inputs_for_decoder(self):
encoder_attention_mask,
)
- def check_loss_output(self, result):
- self.parent.assertListEqual(list(result["loss"].size()), [])
-
def create_and_check_bert_model(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
@@ -164,10 +161,8 @@ def create_and_check_bert_model(
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
result = model(input_ids, token_type_ids=token_type_ids)
result = model(input_ids)
- self.parent.assertListEqual(
- list(result["last_hidden_state"].size()), [self.batch_size, self.seq_length, self.hidden_size]
- )
- self.parent.assertListEqual(list(result["pooler_output"].size()), [self.batch_size, self.hidden_size])
+ self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
+ self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
def create_and_check_bert_model_as_decoder(
self,
@@ -198,10 +193,8 @@ def create_and_check_bert_model_as_decoder(
encoder_hidden_states=encoder_hidden_states,
)
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
- self.parent.assertListEqual(
- list(result["last_hidden_state"].size()), [self.batch_size, self.seq_length, self.hidden_size]
- )
- self.parent.assertListEqual(list(result["pooler_output"].size()), [self.batch_size, self.hidden_size])
+ self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
+ self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
def create_and_check_bert_for_causal_lm(
self,
@@ -219,8 +212,7 @@ def create_and_check_bert_for_causal_lm(
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
- self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
def create_and_check_bert_for_masked_lm(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -229,8 +221,7 @@ def create_and_check_bert_for_masked_lm(
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
- self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
def create_and_check_bert_model_for_causal_lm_as_decoder(
self,
@@ -262,8 +253,7 @@ def create_and_check_bert_model_for_causal_lm_as_decoder(
labels=token_labels,
encoder_hidden_states=encoder_hidden_states,
)
- self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
def create_and_check_bert_for_next_sequence_prediction(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -274,8 +264,7 @@ def create_and_check_bert_for_next_sequence_prediction(
result = model(
input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, next_sentence_label=sequence_labels,
)
- self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, 2])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, 2))
def create_and_check_bert_for_pretraining(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -290,11 +279,8 @@ def create_and_check_bert_for_pretraining(
labels=token_labels,
next_sentence_label=sequence_labels,
)
- self.parent.assertListEqual(
- list(result["prediction_logits"].size()), [self.batch_size, self.seq_length, self.vocab_size]
- )
- self.parent.assertListEqual(list(result["seq_relationship_logits"].size()), [self.batch_size, 2])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
+ self.parent.assertEqual(result.seq_relationship_logits.shape, (self.batch_size, 2))
def create_and_check_bert_for_question_answering(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -309,9 +295,8 @@ def create_and_check_bert_for_question_answering(
start_positions=sequence_labels,
end_positions=sequence_labels,
)
- self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
- self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
+ self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
def create_and_check_bert_for_sequence_classification(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -321,8 +306,7 @@ def create_and_check_bert_for_sequence_classification(
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
- self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
def create_and_check_bert_for_token_classification(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -332,8 +316,7 @@ def create_and_check_bert_for_token_classification(
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
- self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
def create_and_check_bert_for_multiple_choice(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -351,8 +334,7 @@ def create_and_check_bert_for_multiple_choice(
token_type_ids=multiple_choice_token_type_ids,
labels=choice_labels,
)
- self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_choices])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
diff --git a/tests/test_modeling_ctrl.py b/tests/test_modeling_ctrl.py
index 29e5554f4044..eaa0dd7c1ce1 100644
--- a/tests/test_modeling_ctrl.py
+++ b/tests/test_modeling_ctrl.py
@@ -108,9 +108,6 @@ def prepare_config_and_inputs(self):
choice_labels,
)
- def check_loss_output(self, result):
- self.parent.assertListEqual(list(result["loss"].size()), [])
-
def create_and_check_ctrl_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
model = CTRLModel(config=config)
model.to(torch_device)
@@ -119,9 +116,7 @@ def create_and_check_ctrl_model(self, config, input_ids, input_mask, head_mask,
model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
model(input_ids, token_type_ids=token_type_ids)
result = model(input_ids)
- self.parent.assertListEqual(
- list(result["last_hidden_state"].size()), [self.batch_size, self.seq_length, self.hidden_size]
- )
+ self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
self.parent.assertEqual(len(result["past_key_values"]), config.n_layer)
def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
@@ -130,8 +125,8 @@ def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mas
model.eval()
result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
- self.parent.assertListEqual(list(result["loss"].size()), [])
- self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size])
+ self.parent.assertEqual(result.loss.shape, ())
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
diff --git a/tests/test_modeling_distilbert.py b/tests/test_modeling_distilbert.py
index 37e380c1c770..8e76e23dd2af 100644
--- a/tests/test_modeling_distilbert.py
+++ b/tests/test_modeling_distilbert.py
@@ -115,9 +115,6 @@ def prepare_config_and_inputs(self):
return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
- def check_loss_output(self, result):
- self.parent.assertListEqual(list(result["loss"].size()), [])
-
def create_and_check_distilbert_model(
self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
):
@@ -126,8 +123,8 @@ def create_and_check_distilbert_model(
model.eval()
result = model(input_ids, input_mask)
result = model(input_ids)
- self.parent.assertListEqual(
- list(result["last_hidden_state"].size()), [self.batch_size, self.seq_length, self.hidden_size]
+ self.parent.assertEqual(
+ result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)
)
def create_and_check_distilbert_for_masked_lm(
@@ -137,10 +134,7 @@ def create_and_check_distilbert_for_masked_lm(
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, labels=token_labels)
- self.parent.assertListEqual(
- list(result["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size]
- )
- self.check_loss_output(result)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
def create_and_check_distilbert_for_question_answering(
self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -151,9 +145,8 @@ def create_and_check_distilbert_for_question_answering(
result = model(
input_ids, attention_mask=input_mask, start_positions=sequence_labels, end_positions=sequence_labels
)
- self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
- self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
+ self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
def create_and_check_distilbert_for_sequence_classification(
self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -163,8 +156,7 @@ def create_and_check_distilbert_for_sequence_classification(
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, labels=sequence_labels)
- self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
def create_and_check_distilbert_for_token_classification(
self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -175,10 +167,7 @@ def create_and_check_distilbert_for_token_classification(
model.eval()
result = model(input_ids, attention_mask=input_mask, labels=token_labels)
- self.parent.assertListEqual(
- list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels]
- )
- self.check_loss_output(result)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
def create_and_check_distilbert_for_multiple_choice(
self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -192,8 +181,7 @@ def create_and_check_distilbert_for_multiple_choice(
result = model(
multiple_choice_inputs_ids, attention_mask=multiple_choice_input_mask, labels=choice_labels,
)
- self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_choices])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
diff --git a/tests/test_modeling_dpr.py b/tests/test_modeling_dpr.py
index c3016dab3fb3..d6206f17172e 100644
--- a/tests/test_modeling_dpr.py
+++ b/tests/test_modeling_dpr.py
@@ -130,9 +130,7 @@ def create_and_check_dpr_context_encoder(
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
result = model(input_ids, token_type_ids=token_type_ids)
result = model(input_ids)
- self.parent.assertListEqual(
- list(result["pooler_output"].size()), [self.batch_size, self.projection_dim or self.hidden_size]
- )
+ self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.projection_dim or self.hidden_size))
def create_and_check_dpr_question_encoder(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -143,9 +141,7 @@ def create_and_check_dpr_question_encoder(
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
result = model(input_ids, token_type_ids=token_type_ids)
result = model(input_ids)
- self.parent.assertListEqual(
- list(result["pooler_output"].size()), [self.batch_size, self.projection_dim or self.hidden_size]
- )
+ self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.projection_dim or self.hidden_size))
def create_and_check_dpr_reader(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -154,9 +150,10 @@ def create_and_check_dpr_reader(
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask,)
- self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
- self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
- self.parent.assertListEqual(list(result["relevance_logits"].size()), [self.batch_size])
+
+ self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
+ self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
+ self.parent.assertEqual(result.relevance_logits.shape, (self.batch_size,))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
diff --git a/tests/test_modeling_electra.py b/tests/test_modeling_electra.py
index 9fb1a0f46aa9..935f4a272981 100644
--- a/tests/test_modeling_electra.py
+++ b/tests/test_modeling_electra.py
@@ -111,9 +111,6 @@ def prepare_config_and_inputs(self):
fake_token_labels,
)
- def check_loss_output(self, result):
- self.parent.assertListEqual(list(result["loss"].size()), [])
-
def create_and_check_electra_model(
self,
config,
@@ -131,9 +128,7 @@ def create_and_check_electra_model(
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
result = model(input_ids, token_type_ids=token_type_ids)
result = model(input_ids)
- self.parent.assertListEqual(
- list(result["last_hidden_state"].size()), [self.batch_size, self.seq_length, self.hidden_size]
- )
+ self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
def create_and_check_electra_for_masked_lm(
self,
@@ -150,8 +145,7 @@ def create_and_check_electra_for_masked_lm(
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
- self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
def create_and_check_electra_for_token_classification(
self,
@@ -169,8 +163,7 @@ def create_and_check_electra_for_token_classification(
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
- self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
def create_and_check_electra_for_pretraining(
self,
@@ -188,8 +181,7 @@ def create_and_check_electra_for_pretraining(
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=fake_token_labels)
- self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length))
def create_and_check_electra_for_sequence_classification(
self,
@@ -207,8 +199,7 @@ def create_and_check_electra_for_sequence_classification(
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
- self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
def create_and_check_electra_for_question_answering(
self,
@@ -231,9 +222,8 @@ def create_and_check_electra_for_question_answering(
start_positions=sequence_labels,
end_positions=sequence_labels,
)
- self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
- self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
+ self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
def create_and_check_electra_for_multiple_choice(
self,
@@ -259,8 +249,7 @@ def create_and_check_electra_for_multiple_choice(
token_type_ids=multiple_choice_token_type_ids,
labels=choice_labels,
)
- self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_choices])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
diff --git a/tests/test_modeling_encoder_decoder.py b/tests/test_modeling_encoder_decoder.py
index e61ef2cee4fc..f46fbeb82a0a 100644
--- a/tests/test_modeling_encoder_decoder.py
+++ b/tests/test_modeling_encoder_decoder.py
@@ -253,9 +253,6 @@ def create_and_check_save_and_load_encoder_decoder_model(
max_diff = np.amax(np.abs(out_1 - out_2))
self.assertLessEqual(max_diff, 1e-5)
- def check_loss_output(self, loss):
- self.assertEqual(loss.size(), ())
-
def create_and_check_bert_encoder_decoder_model_labels(
self,
config,
@@ -281,7 +278,6 @@ def create_and_check_bert_encoder_decoder_model_labels(
)
mlm_loss = outputs_encoder_decoder[0]
- self.check_loss_output(mlm_loss)
# check that backprop works
mlm_loss.backward()
diff --git a/tests/test_modeling_flaubert.py b/tests/test_modeling_flaubert.py
index bba631831d21..aaecafc435ef 100644
--- a/tests/test_modeling_flaubert.py
+++ b/tests/test_modeling_flaubert.py
@@ -125,9 +125,6 @@ def prepare_config_and_inputs(self):
input_mask,
)
- def check_loss_output(self, result):
- self.parent.assertListEqual(list(result["loss"].size()), [])
-
def create_and_check_flaubert_model(
self,
config,
@@ -146,9 +143,7 @@ def create_and_check_flaubert_model(
result = model(input_ids, lengths=input_lengths, langs=token_type_ids)
result = model(input_ids, langs=token_type_ids)
result = model(input_ids)
- self.parent.assertListEqual(
- list(result["last_hidden_state"].size()), [self.batch_size, self.seq_length, self.hidden_size]
- )
+ self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
def create_and_check_flaubert_lm_head(
self,
@@ -167,8 +162,8 @@ def create_and_check_flaubert_lm_head(
model.eval()
result = model(input_ids, token_type_ids=token_type_ids, labels=token_labels)
- self.parent.assertListEqual(list(result["loss"].size()), [])
- self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size])
+ self.parent.assertEqual(result.loss.shape, ())
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
def create_and_check_flaubert_simple_qa(
self,
@@ -189,9 +184,8 @@ def create_and_check_flaubert_simple_qa(
result = model(input_ids)
result = model(input_ids, start_positions=sequence_labels, end_positions=sequence_labels)
- self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
- self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
+ self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
def create_and_check_flaubert_qa(
self,
@@ -234,21 +228,16 @@ def create_and_check_flaubert_qa(
(total_loss,) = result_with_labels.to_tuple()
- self.parent.assertListEqual(list(result_with_labels["loss"].size()), [])
- self.parent.assertListEqual(
- list(result["start_top_log_probs"].size()), [self.batch_size, model.config.start_n_top]
- )
- self.parent.assertListEqual(
- list(result["start_top_index"].size()), [self.batch_size, model.config.start_n_top]
- )
- self.parent.assertListEqual(
- list(result["end_top_log_probs"].size()),
- [self.batch_size, model.config.start_n_top * model.config.end_n_top],
+ self.parent.assertEqual(result_with_labels.loss.shape, ())
+ self.parent.assertEqual(result.start_top_log_probs.shape, (self.batch_size, model.config.start_n_top))
+ self.parent.assertEqual(result.start_top_index.shape, (self.batch_size, model.config.start_n_top))
+ self.parent.assertEqual(
+ result.end_top_log_probs.shape, (self.batch_size, model.config.start_n_top * model.config.end_n_top)
)
- self.parent.assertListEqual(
- list(result["end_top_index"].size()), [self.batch_size, model.config.start_n_top * model.config.end_n_top],
+ self.parent.assertEqual(
+ result.end_top_index.shape, (self.batch_size, model.config.start_n_top * model.config.end_n_top)
)
- self.parent.assertListEqual(list(result["cls_logits"].size()), [self.batch_size])
+ self.parent.assertEqual(result.cls_logits.shape, (self.batch_size,))
def create_and_check_flaubert_sequence_classif(
self,
@@ -269,8 +258,8 @@ def create_and_check_flaubert_sequence_classif(
result = model(input_ids)
result = model(input_ids, labels=sequence_labels)
- self.parent.assertListEqual(list(result["loss"].size()), [])
- self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.type_sequence_label_size])
+ self.parent.assertEqual(result.loss.shape, ())
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
def create_and_check_flaubert_token_classif(
self,
@@ -290,8 +279,7 @@ def create_and_check_flaubert_token_classif(
model.eval()
result = model(input_ids, attention_mask=input_mask, labels=token_labels)
- self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
def create_and_check_flaubert_multiple_choice(
self,
@@ -318,8 +306,7 @@ def create_and_check_flaubert_multiple_choice(
token_type_ids=multiple_choice_token_type_ids,
labels=choice_labels,
)
- self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_choices])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
diff --git a/tests/test_modeling_gpt2.py b/tests/test_modeling_gpt2.py
index 14ef2257c439..66e07f6d4a70 100644
--- a/tests/test_modeling_gpt2.py
+++ b/tests/test_modeling_gpt2.py
@@ -142,9 +142,6 @@ def prepare_config_and_inputs(self):
choice_labels,
)
- def check_loss_output(self, result):
- self.parent.assertListEqual(list(result["loss"].size()), [])
-
def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
model = GPT2Model(config=config)
model.to(torch_device)
@@ -154,9 +151,7 @@ def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask,
result = model(input_ids, token_type_ids=token_type_ids)
result = model(input_ids)
- self.parent.assertListEqual(
- list(result["last_hidden_state"].size()), [self.batch_size, self.seq_length, self.hidden_size],
- )
+ self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
self.parent.assertEqual(len(result["past_key_values"]), config.n_layer)
def create_and_check_gpt2_model_past(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
@@ -240,10 +235,8 @@ def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mas
model.eval()
result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
- self.parent.assertListEqual(list(result["loss"].size()), [])
- self.parent.assertListEqual(
- list(result["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size],
- )
+ self.parent.assertEqual(result.loss.shape, ())
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
def create_and_check_double_lm_head_model(
self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args
@@ -265,11 +258,11 @@ def create_and_check_double_lm_head_model(
}
result = model(**inputs)
- self.parent.assertListEqual(list(result["lm_loss"].size()), [])
- self.parent.assertListEqual(
- list(result["lm_logits"].size()), [self.batch_size, self.num_choices, self.seq_length, self.vocab_size],
+ self.parent.assertEqual(result.lm_loss.shape, ())
+ self.parent.assertEqual(
+ result.lm_logits.shape, (self.batch_size, self.num_choices, self.seq_length, self.vocab_size)
)
- self.parent.assertListEqual(list(result["mc_logits"].size()), [self.batch_size, self.num_choices])
+ self.parent.assertEqual(result.mc_logits.shape, (self.batch_size, self.num_choices))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
diff --git a/tests/test_modeling_longformer.py b/tests/test_modeling_longformer.py
index a98b9a7e3565..8b97ef3fc71b 100644
--- a/tests/test_modeling_longformer.py
+++ b/tests/test_modeling_longformer.py
@@ -113,9 +113,6 @@ def prepare_config_and_inputs(self):
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- def check_loss_output(self, result):
- self.parent.assertListEqual(list(result["loss"].size()), [])
-
def create_and_check_attention_mask_determinism(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
@@ -137,10 +134,8 @@ def create_and_check_longformer_model(
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
result = model(input_ids, token_type_ids=token_type_ids)
result = model(input_ids)
- self.parent.assertListEqual(
- list(result["last_hidden_state"].size()), [self.batch_size, self.seq_length, self.hidden_size]
- )
- self.parent.assertListEqual(list(result["pooler_output"].size()), [self.batch_size, self.hidden_size])
+ self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
+ self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
def create_and_check_longformer_model_with_global_attention_mask(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -161,10 +156,8 @@ def create_and_check_longformer_model_with_global_attention_mask(
result = model(input_ids, token_type_ids=token_type_ids, global_attention_mask=global_attention_mask)
result = model(input_ids, global_attention_mask=global_attention_mask)
- self.parent.assertListEqual(
- list(result["last_hidden_state"].size()), [self.batch_size, self.seq_length, self.hidden_size]
- )
- self.parent.assertListEqual(list(result["pooler_output"].size()), [self.batch_size, self.hidden_size])
+ self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
+ self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
def create_and_check_longformer_for_masked_lm(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -173,8 +166,7 @@ def create_and_check_longformer_for_masked_lm(
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
- self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
def create_and_check_longformer_for_question_answering(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -190,9 +182,8 @@ def create_and_check_longformer_for_question_answering(
start_positions=sequence_labels,
end_positions=sequence_labels,
)
- self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
- self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
+ self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
def create_and_check_longformer_for_sequence_classification(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -202,8 +193,7 @@ def create_and_check_longformer_for_sequence_classification(
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
- self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
def create_and_check_longformer_for_token_classification(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -213,8 +203,7 @@ def create_and_check_longformer_for_token_classification(
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
- self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
def create_and_check_longformer_for_multiple_choice(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -234,8 +223,7 @@ def create_and_check_longformer_for_multiple_choice(
token_type_ids=multiple_choice_token_type_ids,
labels=choice_labels,
)
- self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_choices])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
diff --git a/tests/test_modeling_mobilebert.py b/tests/test_modeling_mobilebert.py
index 2d85d7faf351..cedc075b9fdb 100644
--- a/tests/test_modeling_mobilebert.py
+++ b/tests/test_modeling_mobilebert.py
@@ -154,9 +154,6 @@ def prepare_config_and_inputs_for_decoder(self):
encoder_attention_mask,
)
- def check_loss_output(self, result):
- self.parent.assertListEqual(list(result["loss"].size()), [])
-
def create_and_check_mobilebert_model(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
@@ -167,10 +164,8 @@ def create_and_check_mobilebert_model(
result = model(input_ids, token_type_ids=token_type_ids)
result = model(input_ids)
- self.parent.assertListEqual(
- list(result["last_hidden_state"].size()), [self.batch_size, self.seq_length, self.hidden_size]
- )
- self.parent.assertListEqual(list(result["pooler_output"].size()), [self.batch_size, self.hidden_size])
+ self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
+ self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
def create_and_check_mobilebert_model_as_decoder(
self,
@@ -202,10 +197,8 @@ def create_and_check_mobilebert_model_as_decoder(
)
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
- self.parent.assertListEqual(
- list(result["last_hidden_state"].size()), [self.batch_size, self.seq_length, self.hidden_size]
- )
- self.parent.assertListEqual(list(result["pooler_output"].size()), [self.batch_size, self.hidden_size])
+ self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
+ self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
def create_and_check_mobilebert_for_masked_lm(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -214,8 +207,7 @@ def create_and_check_mobilebert_for_masked_lm(
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
- self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
def create_and_check_mobilebert_for_next_sequence_prediction(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -226,8 +218,7 @@ def create_and_check_mobilebert_for_next_sequence_prediction(
result = model(
input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, next_sentence_label=sequence_labels,
)
- self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, 2])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, 2))
def create_and_check_mobilebert_for_pretraining(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -242,11 +233,8 @@ def create_and_check_mobilebert_for_pretraining(
labels=token_labels,
next_sentence_label=sequence_labels,
)
- self.parent.assertListEqual(
- list(result["prediction_logits"].size()), [self.batch_size, self.seq_length, self.vocab_size]
- )
- self.parent.assertListEqual(list(result["seq_relationship_logits"].size()), [self.batch_size, 2])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
+ self.parent.assertEqual(result.seq_relationship_logits.shape, (self.batch_size, 2))
def create_and_check_mobilebert_for_question_answering(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -261,9 +249,8 @@ def create_and_check_mobilebert_for_question_answering(
start_positions=sequence_labels,
end_positions=sequence_labels,
)
- self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
- self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
+ self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
def create_and_check_mobilebert_for_sequence_classification(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -273,8 +260,7 @@ def create_and_check_mobilebert_for_sequence_classification(
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
- self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
def create_and_check_mobilebert_for_token_classification(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -284,8 +270,7 @@ def create_and_check_mobilebert_for_token_classification(
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
- self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
def create_and_check_mobilebert_for_multiple_choice(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -303,8 +288,7 @@ def create_and_check_mobilebert_for_multiple_choice(
token_type_ids=multiple_choice_token_type_ids,
labels=choice_labels,
)
- self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_choices])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
diff --git a/tests/test_modeling_openai.py b/tests/test_modeling_openai.py
index 5d39313da957..0fefecfd613e 100644
--- a/tests/test_modeling_openai.py
+++ b/tests/test_modeling_openai.py
@@ -103,9 +103,6 @@ def prepare_config_and_inputs(self):
choice_labels,
)
- def check_loss_output(self, result):
- self.parent.assertListEqual(list(result["loss"].size()), [])
-
def create_and_check_openai_gpt_model(self, config, input_ids, head_mask, token_type_ids, *args):
model = OpenAIGPTModel(config=config)
model.to(torch_device)
@@ -115,9 +112,7 @@ def create_and_check_openai_gpt_model(self, config, input_ids, head_mask, token_
result = model(input_ids, token_type_ids=token_type_ids)
result = model(input_ids)
- self.parent.assertListEqual(
- list(result["last_hidden_state"].size()), [self.batch_size, self.seq_length, self.hidden_size],
- )
+ self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
def create_and_check_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
model = OpenAIGPTLMHeadModel(config)
@@ -125,10 +120,8 @@ def create_and_check_lm_head_model(self, config, input_ids, head_mask, token_typ
model.eval()
result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
- self.parent.assertListEqual(list(result["loss"].size()), [])
- self.parent.assertListEqual(
- list(result["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size],
- )
+ self.parent.assertEqual(result.loss.shape, ())
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
def create_and_check_double_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
model = OpenAIGPTDoubleHeadsModel(config)
@@ -136,10 +129,8 @@ def create_and_check_double_lm_head_model(self, config, input_ids, head_mask, to
model.eval()
result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
- self.parent.assertListEqual(list(result["lm_loss"].size()), [])
- self.parent.assertListEqual(
- list(result["lm_logits"].size()), [self.batch_size, self.seq_length, self.vocab_size],
- )
+ self.parent.assertEqual(result.lm_loss.shape, ())
+ self.parent.assertEqual(result.lm_logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
diff --git a/tests/test_modeling_reformer.py b/tests/test_modeling_reformer.py
index b15f1d435565..a56b99c1434e 100644
--- a/tests/test_modeling_reformer.py
+++ b/tests/test_modeling_reformer.py
@@ -175,9 +175,6 @@ def prepare_config_and_inputs(self):
choice_labels,
)
- def check_loss_output(self, result):
- self.parent.assertListEqual(list(result["loss"].size()), [])
-
def create_and_check_reformer_model(self, config, input_ids, input_mask, choice_labels):
model = ReformerModel(config=config)
model.to(torch_device)
@@ -186,8 +183,8 @@ def create_and_check_reformer_model(self, config, input_ids, input_mask, choice_
result = model(input_ids)
# 2 * hidden_size because we use reversible resnet layers
- self.parent.assertListEqual(
- list(result["last_hidden_state"].size()), [self.batch_size, self.seq_length, 2 * self.hidden_size],
+ self.parent.assertEqual(
+ result.last_hidden_state.shape, (self.batch_size, self.seq_length, 2 * self.hidden_size)
)
def create_and_check_reformer_model_with_lm_backward(self, config, input_ids, input_mask, choice_labels):
@@ -206,10 +203,7 @@ def create_and_check_reformer_with_lm(self, config, input_ids, input_mask, choic
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, labels=input_ids)
- self.parent.assertListEqual(
- list(result["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size],
- )
- self.check_loss_output(result)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
def create_and_check_reformer_with_mlm(self, config, input_ids, input_mask, choice_labels):
config.is_decoder = False
@@ -217,10 +211,7 @@ def create_and_check_reformer_with_mlm(self, config, input_ids, input_mask, choi
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, labels=input_ids)
- self.parent.assertListEqual(
- list(result["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size],
- )
- self.check_loss_output(result)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
def create_and_check_reformer_model_with_attn_mask(
self, config, input_ids, input_mask, choice_labels, is_decoder=False
@@ -444,9 +435,8 @@ def create_and_check_reformer_for_question_answering(self, config, input_ids, in
result = model(
input_ids, attention_mask=input_mask, start_positions=choice_labels, end_positions=choice_labels,
)
- self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
- self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
+ self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
def create_and_check_past_buckets_states(self, config, input_ids, input_mask, choice_labels):
config.is_decoder = True
@@ -490,8 +480,7 @@ def create_and_check_reformer_for_sequence_classification(
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, labels=sequence_labels)
- self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
class ReformerTesterMixin:
diff --git a/tests/test_modeling_roberta.py b/tests/test_modeling_roberta.py
index 82de9241919f..00b0b79e540e 100644
--- a/tests/test_modeling_roberta.py
+++ b/tests/test_modeling_roberta.py
@@ -101,9 +101,6 @@ def prepare_config_and_inputs(self):
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
- def check_loss_output(self, result):
- self.parent.assertListEqual(list(result["loss"].size()), [])
-
def create_and_check_roberta_model(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
@@ -114,10 +111,8 @@ def create_and_check_roberta_model(
result = model(input_ids, token_type_ids=token_type_ids)
result = model(input_ids)
- self.parent.assertListEqual(
- list(result["last_hidden_state"].size()), [self.batch_size, self.seq_length, self.hidden_size]
- )
- self.parent.assertListEqual(list(result["pooler_output"].size()), [self.batch_size, self.hidden_size])
+ self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
+ self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
def create_and_check_roberta_for_masked_lm(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -126,8 +121,7 @@ def create_and_check_roberta_for_masked_lm(
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
- self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
def create_and_check_roberta_for_token_classification(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -137,8 +131,7 @@ def create_and_check_roberta_for_token_classification(
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
- self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
def create_and_check_roberta_for_multiple_choice(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -156,8 +149,7 @@ def create_and_check_roberta_for_multiple_choice(
token_type_ids=multiple_choice_token_type_ids,
labels=choice_labels,
)
- self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_choices])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
def create_and_check_roberta_for_question_answering(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -172,9 +164,8 @@ def create_and_check_roberta_for_question_answering(
start_positions=sequence_labels,
end_positions=sequence_labels,
)
- self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
- self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
+ self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
diff --git a/tests/test_modeling_t5.py b/tests/test_modeling_t5.py
index 9177e2cd5438..a316eb826f3e 100644
--- a/tests/test_modeling_t5.py
+++ b/tests/test_modeling_t5.py
@@ -95,9 +95,6 @@ def prepare_config_and_inputs(self):
lm_labels,
)
- def check_loss_output(self, result):
- self.parent.assertListEqual(list(result["loss"].size()), [])
-
def check_prepare_lm_labels_via_shift_left(
self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
):
diff --git a/tests/test_modeling_xlm.py b/tests/test_modeling_xlm.py
index 30e98d8dd1da..8114cd6ad8db 100644
--- a/tests/test_modeling_xlm.py
+++ b/tests/test_modeling_xlm.py
@@ -128,9 +128,6 @@ def prepare_config_and_inputs(self):
input_mask,
)
- def check_loss_output(self, result):
- self.parent.assertListEqual(list(result["loss"].size()), [])
-
def create_and_check_xlm_model(
self,
config,
@@ -149,9 +146,7 @@ def create_and_check_xlm_model(
result = model(input_ids, lengths=input_lengths, langs=token_type_ids)
result = model(input_ids, langs=token_type_ids)
result = model(input_ids)
- self.parent.assertListEqual(
- list(result["last_hidden_state"].size()), [self.batch_size, self.seq_length, self.hidden_size]
- )
+ self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
def create_and_check_xlm_lm_head(
self,
@@ -170,8 +165,8 @@ def create_and_check_xlm_lm_head(
model.eval()
result = model(input_ids, token_type_ids=token_type_ids, labels=token_labels)
- self.parent.assertListEqual(list(result["loss"].size()), [])
- self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size])
+ self.parent.assertEqual(result.loss.shape, ())
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
def create_and_check_xlm_simple_qa(
self,
@@ -193,9 +188,8 @@ def create_and_check_xlm_simple_qa(
outputs = model(input_ids, start_positions=sequence_labels, end_positions=sequence_labels)
result = outputs
- self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length])
- self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
+ self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
def create_and_check_xlm_qa(
self,
@@ -238,21 +232,16 @@ def create_and_check_xlm_qa(
(total_loss,) = result_with_labels.to_tuple()
- self.parent.assertListEqual(list(result_with_labels["loss"].size()), [])
- self.parent.assertListEqual(
- list(result["start_top_log_probs"].size()), [self.batch_size, model.config.start_n_top]
- )
- self.parent.assertListEqual(
- list(result["start_top_index"].size()), [self.batch_size, model.config.start_n_top]
- )
- self.parent.assertListEqual(
- list(result["end_top_log_probs"].size()),
- [self.batch_size, model.config.start_n_top * model.config.end_n_top],
+ self.parent.assertEqual(result_with_labels.loss.shape, ())
+ self.parent.assertEqual(result.start_top_log_probs.shape, (self.batch_size, model.config.start_n_top))
+ self.parent.assertEqual(result.start_top_index.shape, (self.batch_size, model.config.start_n_top))
+ self.parent.assertEqual(
+ result.end_top_log_probs.shape, (self.batch_size, model.config.start_n_top * model.config.end_n_top)
)
- self.parent.assertListEqual(
- list(result["end_top_index"].size()), [self.batch_size, model.config.start_n_top * model.config.end_n_top],
+ self.parent.assertEqual(
+ result.end_top_index.shape, (self.batch_size, model.config.start_n_top * model.config.end_n_top)
)
- self.parent.assertListEqual(list(result["cls_logits"].size()), [self.batch_size])
+ self.parent.assertEqual(result.cls_logits.shape, (self.batch_size,))
def create_and_check_xlm_sequence_classif(
self,
@@ -272,8 +261,8 @@ def create_and_check_xlm_sequence_classif(
result = model(input_ids)
result = model(input_ids, labels=sequence_labels)
- self.parent.assertListEqual(list(result["loss"].size()), [])
- self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.type_sequence_label_size])
+ self.parent.assertEqual(result.loss.shape, ())
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
def create_and_check_xlm_token_classif(
self,
@@ -293,8 +282,7 @@ def create_and_check_xlm_token_classif(
model.eval()
result = model(input_ids, attention_mask=input_mask, labels=token_labels)
- self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
def create_and_check_xlm_for_multiple_choice(
self,
@@ -321,8 +309,7 @@ def create_and_check_xlm_for_multiple_choice(
token_type_ids=multiple_choice_token_type_ids,
labels=choice_labels,
)
- self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_choices])
- self.check_loss_output(result)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
diff --git a/tests/test_modeling_xlnet.py b/tests/test_modeling_xlnet.py
index e0d9479503c5..031ae47792e5 100644
--- a/tests/test_modeling_xlnet.py
+++ b/tests/test_modeling_xlnet.py
@@ -190,9 +190,7 @@ def create_and_check_xlnet_base_model(
base_model_output = model(input_ids_1)
self.parent.assertEqual(len(base_model_output), 2)
- self.parent.assertListEqual(
- list(result["last_hidden_state"].size()), [self.batch_size, self.seq_length, self.hidden_size],
- )
+ self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
self.parent.assertListEqual(
list(list(mem.size()) for mem in result["mems"]),
[[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
@@ -311,19 +309,15 @@ def create_and_check_xlnet_lm_head(
_ = model(input_ids_q, perm_mask=perm_mask, target_mapping=target_mapping)
- self.parent.assertListEqual(list(result1["loss"].size()), [])
- self.parent.assertListEqual(
- list(result1["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size],
- )
+ self.parent.assertEqual(result1.loss.shape, ())
+ self.parent.assertEqual(result1.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
self.parent.assertListEqual(
list(list(mem.size()) for mem in result1["mems"]),
[[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
)
- self.parent.assertListEqual(list(result2["loss"].size()), [])
- self.parent.assertListEqual(
- list(result2["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size],
- )
+ self.parent.assertEqual(result2.loss.shape, ())
+ self.parent.assertEqual(result2.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
self.parent.assertListEqual(
list(list(mem.size()) for mem in result2["mems"]),
[[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
@@ -373,21 +367,16 @@ def create_and_check_xlnet_qa(
total_loss, mems = result_with_labels.to_tuple()
- self.parent.assertListEqual(list(result_with_labels["loss"].size()), [])
- self.parent.assertListEqual(
- list(result["start_top_log_probs"].size()), [self.batch_size, model.config.start_n_top],
- )
- self.parent.assertListEqual(
- list(result["start_top_index"].size()), [self.batch_size, model.config.start_n_top],
- )
- self.parent.assertListEqual(
- list(result["end_top_log_probs"].size()),
- [self.batch_size, model.config.start_n_top * model.config.end_n_top],
+ self.parent.assertEqual(result_with_labels.loss.shape, ())
+ self.parent.assertEqual(result.start_top_log_probs.shape, (self.batch_size, model.config.start_n_top))
+ self.parent.assertEqual(result.start_top_index.shape, (self.batch_size, model.config.start_n_top))
+ self.parent.assertEqual(
+ result.end_top_log_probs.shape, (self.batch_size, model.config.start_n_top * model.config.end_n_top)
)
- self.parent.assertListEqual(
- list(result["end_top_index"].size()), [self.batch_size, model.config.start_n_top * model.config.end_n_top],
+ self.parent.assertEqual(
+ result.end_top_index.shape, (self.batch_size, model.config.start_n_top * model.config.end_n_top)
)
- self.parent.assertListEqual(list(result["cls_logits"].size()), [self.batch_size])
+ self.parent.assertEqual(result.cls_logits.shape, (self.batch_size,))
self.parent.assertListEqual(
list(list(mem.size()) for mem in result["mems"]),
[[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
@@ -415,10 +404,8 @@ def create_and_check_xlnet_token_classif(
result = model(input_ids_1)
result = model(input_ids_1, labels=token_labels)
- self.parent.assertListEqual(list(result["loss"].size()), [])
- self.parent.assertListEqual(
- list(result["logits"].size()), [self.batch_size, self.seq_length, self.type_sequence_label_size],
- )
+ self.parent.assertEqual(result.loss.shape, ())
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.type_sequence_label_size))
self.parent.assertListEqual(
list(list(mem.size()) for mem in result["mems"]),
[[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
@@ -446,10 +433,8 @@ def create_and_check_xlnet_sequence_classif(
result = model(input_ids_1)
result = model(input_ids_1, labels=sequence_labels)
- self.parent.assertListEqual(list(result["loss"].size()), [])
- self.parent.assertListEqual(
- list(result["logits"].size()), [self.batch_size, self.type_sequence_label_size],
- )
+ self.parent.assertEqual(result.loss.shape, ())
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
self.parent.assertListEqual(
list(list(mem.size()) for mem in result["mems"]),
[[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers,
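The refactor above is purely mechanical: `torch.Size` is a tuple subclass, so a shape can be compared directly against a plain tuple with `assertEqual` instead of being converted to a list first. A minimal standalone sketch of the two equivalent styles (not part of the patch; the tensor sizes are arbitrary):

```python
# Minimal sketch: both assertion styles check the same shape information.
import unittest

import torch


class ShapeAssertionStyles(unittest.TestCase):
    def test_old_and_new_styles_agree(self):
        logits = torch.zeros(4, 7, 99)  # (batch_size, seq_length, vocab_size)

        # old style: materialize torch.Size as a list and compare to a list
        self.assertListEqual(list(logits.size()), [4, 7, 99])

        # new style: torch.Size is a tuple subclass, so it compares to a tuple
        self.assertEqual(logits.shape, (4, 7, 99))


if __name__ == "__main__":
    unittest.main()
```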
From 6730ecdd3c92f068ddb598812c90baddd4ff22c7 Mon Sep 17 00:00:00 2001
From: Sam Shleifer
Date: Tue, 4 Aug 2020 02:59:21 -0400
Subject: [PATCH 094/127] Remove redundant coverage (#6224)
---
tests/test_tokenization_common.py | 4 ----
1 file changed, 4 deletions(-)
diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py
index f0456290b919..ba891f0cbb29 100644
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -1065,10 +1065,6 @@ def test_get_vocab(self):
self.assertIsInstance(vocab, dict)
self.assertEqual(len(vocab), len(tokenizer))
- for word, ind in vocab.items():
- self.assertEqual(tokenizer.convert_tokens_to_ids(word), ind)
- self.assertEqual(tokenizer.convert_ids_to_tokens(ind), word)
-
tokenizer.add_tokens(["asdfasdfasdfasdf"])
vocab = tokenizer.get_vocab()
self.assertIsInstance(vocab, dict)
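For reference, the removed loop round-tripped every vocabulary entry between token and id, coverage the patch subject calls redundant. A hedged sketch of that check in isolation (uses `bert-base-uncased` purely as an example tokenizer and only the first 100 entries, to keep it quick):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
vocab = tokenizer.get_vocab()

# token -> id -> token round-trip, the coverage removed by this patch
for word, ind in list(vocab.items())[:100]:
    assert tokenizer.convert_tokens_to_ids(word) == ind
    assert tokenizer.convert_ids_to_tokens(ind) == word
```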
From 1d5c3a3d966f2c6459d6a275296a5526af3e5563 Mon Sep 17 00:00:00 2001
From: Lysandre Debut
Date: Tue, 4 Aug 2020 03:20:19 -0400
Subject: [PATCH 095/127] Test with --no-cache-dir (#6235)
---
.circleci/config.yml | 14 +++++++-------
.github/workflows/github-torch-hub.yml | 2 +-
.github/workflows/self-push.yml | 2 +-
.github/workflows/self-scheduled.yml | 2 +-
4 files changed, 10 insertions(+), 10 deletions(-)
diff --git a/.circleci/config.yml b/.circleci/config.yml
index 7d6e3018f87d..3666e0936270 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -11,7 +11,7 @@ jobs:
steps:
- checkout
- run: sudo pip install --upgrade pip
- - run: sudo pip install .[sklearn,tf-cpu,torch,testing]
+ - run: sudo pip install .[sklearn,tf-cpu,torch,testing] --no-cache-dir
- run: sudo pip install codecov pytest-cov
- run: python -m pytest -n 8 --dist=loadfile -s ./tests/ --cov | tee output.txt
- run: codecov
@@ -29,7 +29,7 @@ jobs:
steps:
- checkout
- run: sudo pip install --upgrade pip
- - run: sudo pip install .[sklearn,torch,testing]
+ - run: sudo pip install .[sklearn,torch,testing] --no-cache-dir
- run: python -m pytest -n 8 --dist=loadfile -s ./tests/ | tee output.txt
- store_artifacts:
path: ~/transformers/output.txt
@@ -46,7 +46,7 @@ jobs:
steps:
- checkout
- run: sudo pip install --upgrade pip
- - run: sudo pip install .[sklearn,tf-cpu,testing]
+ - run: sudo pip install .[sklearn,tf-cpu,testing] --no-cache-dir
- run: python -m pytest -n 8 --dist=loadfile -s ./tests/ | tee output.txt
- store_artifacts:
path: ~/transformers/output.txt
@@ -76,7 +76,7 @@ jobs:
steps:
- checkout
- run: sudo pip install --upgrade pip
- - run: sudo pip install .[sklearn,torch,testing]
+ - run: sudo pip install .[sklearn,torch,testing] --no-cache-dir
- run: sudo pip install -r examples/requirements.txt
- run: python -m pytest -n 8 --dist=loadfile -rA -s ./examples/ | tee output.txt
- store_artifacts:
@@ -89,7 +89,7 @@ jobs:
steps:
- checkout
- run: sudo pip install --upgrade pip
- - run: sudo pip install .[tf,torch,docs]
+ - run: sudo pip install .[tf,torch,docs] --no-cache-dir
- run: cd docs && make html SPHINXOPTS="-W"
- store_artifacts:
path: ./docs/_build
@@ -102,7 +102,7 @@ jobs:
fingerprints:
- "5b:7a:95:18:07:8c:aa:76:4c:60:35:88:ad:60:56:71"
- checkout
- - run: sudo pip install .[tf,torch,docs]
+ - run: sudo pip install .[tf,torch,docs] --no-cache-dir
- run: ./.circleci/deploy.sh
check_code_quality:
working_directory: ~/transformers
@@ -115,7 +115,7 @@ jobs:
- run: sudo pip install --upgrade pip
# we need a version of isort with https://github.com/timothycrosley/isort/pull/1000
- run: sudo pip install git+git://github.com/timothycrosley/isort.git@e63ae06ec7d70b06df9e528357650281a3d3ec22#egg=isort
- - run: sudo pip install .[tf,torch,quality]
+ - run: sudo pip install .[tf,torch,quality] --no-cache-dir
- run: black --check --line-length 119 --target-version py35 examples templates tests src utils
- run: isort --check-only --recursive examples templates tests src utils
- run: flake8 examples templates tests src utils
diff --git a/.github/workflows/github-torch-hub.yml b/.github/workflows/github-torch-hub.yml
index 923b0983b4c3..f9eba31adcdf 100644
--- a/.github/workflows/github-torch-hub.yml
+++ b/.github/workflows/github-torch-hub.yml
@@ -21,7 +21,7 @@ jobs:
- name: Install dependencies
run: |
pip install --upgrade pip
- pip install torch
+ pip install torch --no-cache-dir
pip install numpy tokenizers filelock requests tqdm regex sentencepiece sacremoses packaging
- name: Torch hub list
diff --git a/.github/workflows/self-push.yml b/.github/workflows/self-push.yml
index 8784c9b08964..fceb9400d17d 100644
--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@@ -36,7 +36,7 @@ jobs:
run: |
source .env/bin/activate
pip install --upgrade pip
- pip install torch
+ pip install torch --no-cache-dir
pip install .[sklearn,testing]
- name: Are GPUs recognized by our DL frameworks
diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml
index a61a9b58079f..9380535ab249 100644
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -32,7 +32,7 @@ jobs:
run: |
source .env/bin/activate
pip install --upgrade pip
- pip install .[sklearn,torch,testing]
+ pip install .[sklearn,torch,testing] --no-cache-dir
- name: Are GPUs recognized by our DL frameworks
run: |
From 7ea9b2db3732904014b9121fb8a5c896ae00d4cf Mon Sep 17 00:00:00 2001
From: Andrés Felipe Cruz
Date: Tue, 4 Aug 2020 00:23:28 -0700
Subject: [PATCH 096/127] Encoder decoder config docs (#6195)
* Adding docs for how to load an encoder_decoder pretrained model with individual config objects
* Adding docs for loading an encoder_decoder config from a pretrained folder
* Fixing W293 blank line contains whitespace
* Update src/transformers/modeling_encoder_decoder.py
* Update src/transformers/modeling_encoder_decoder.py
* Update src/transformers/modeling_encoder_decoder.py
* Apply suggestions from code review
the model file should only show examples of how to load and save the model
* Update src/transformers/configuration_encoder_decoder.py
* Update src/transformers/configuration_encoder_decoder.py
* fix space
Co-authored-by: Patrick von Platen
---
src/transformers/configuration_encoder_decoder.py | 9 +++++++++
src/transformers/modeling_encoder_decoder.py | 8 +++++++-
2 files changed, 16 insertions(+), 1 deletion(-)
diff --git a/src/transformers/configuration_encoder_decoder.py b/src/transformers/configuration_encoder_decoder.py
index ae71dbecc175..785a4c654ec7 100644
--- a/src/transformers/configuration_encoder_decoder.py
+++ b/src/transformers/configuration_encoder_decoder.py
@@ -56,6 +56,15 @@ class EncoderDecoderConfig(PretrainedConfig):
>>> # Accessing the model configuration
>>> config_encoder = model.config.encoder
>>> config_decoder = model.config.decoder
+ >>> # set decoder config to causal lm
+ >>> config_decoder.is_decoder = True
+
+ >>> # Saving the model, including its configuration
+ >>> model.save_pretrained('my-model')
+
+ >>> # loading model and config from pretrained folder
+ >>> encoder_decoder_config = EncoderDecoderConfig.from_pretrained('my-model')
+ >>> model = EncoderDecoderModel.from_pretrained('my-model', config=encoder_decoder_config)
"""
model_type = "encoder_decoder"
diff --git a/src/transformers/modeling_encoder_decoder.py b/src/transformers/modeling_encoder_decoder.py
index ec98a250d9ed..772ae74e22a6 100644
--- a/src/transformers/modeling_encoder_decoder.py
+++ b/src/transformers/modeling_encoder_decoder.py
@@ -127,7 +127,13 @@ def from_encoder_decoder_pretrained(
Examples::
>>> from transformers import EncoderDecoderModel
- >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert
+ >>> # initialize a bert2bert from two pretrained BERT models. Note that the cross-attention layers will be randomly initialized
+ >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased')
+ >>> # saving model after fine-tuning
+ >>> model.save_pretrained("./bert2bert")
+ >>> # load fine-tuned model
+ >>> model = EncoderDecoderModel.from_pretrained("./bert2bert")
+
"""
kwargs_encoder = {
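Taken together, the two documentation additions describe a single round trip. A condensed sketch of that flow (downloads `bert-base-uncased`; `./bert2bert` is just an example directory):

```python
from transformers import EncoderDecoderConfig, EncoderDecoderModel

# initialize a bert2bert model; the cross-attention layers are randomly initialized
model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "bert-base-uncased")

# set the decoder config to causal LM, as in the configuration example
model.config.decoder.is_decoder = True

# save the model together with its configuration
model.save_pretrained("./bert2bert")

# load the configuration and model back from the saved folder
encoder_decoder_config = EncoderDecoderConfig.from_pretrained("./bert2bert")
model = EncoderDecoderModel.from_pretrained("./bert2bert", config=encoder_decoder_config)
```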
From 7f65daa2e155ecdd8594e19862dac8b322ed3b73 Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Tue, 4 Aug 2020 13:02:25 +0200
Subject: [PATCH 097/127] fix reformer fp16 (#6237)
---
tests/test_modeling_reformer.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tests/test_modeling_reformer.py b/tests/test_modeling_reformer.py
index a56b99c1434e..e5d07d8eb68a 100644
--- a/tests/test_modeling_reformer.py
+++ b/tests/test_modeling_reformer.py
@@ -389,7 +389,7 @@ def create_and_check_reformer_model_fp16_forward(self, config, input_ids, input_
model.to(torch_device)
model.half()
model.eval()
- output = model(input_ids, attention_mask=input_mask)["last_input_state"]
+ output = model(input_ids, attention_mask=input_mask)["last_hidden_state"]
self.parent.assertFalse(torch.isnan(output).any().item())
def create_and_check_reformer_model_generate(self, config, input_ids, input_mask, choice_labels):
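The fix corrects a key name: forward passes return output objects keyed by `last_hidden_state`, so indexing with the misspelled `last_input_state` would raise a `KeyError` rather than exercise the fp16 NaN check. A tiny sketch of the keyed access (the tensor values are placeholders, not a real Reformer forward pass):

```python
import torch

from transformers.modeling_outputs import BaseModelOutput

# model outputs behave like ordered dicts keyed by their field names
output = BaseModelOutput(last_hidden_state=torch.zeros(1, 4, 8))

assert "last_hidden_state" in output  # the key the corrected test uses
assert not torch.isnan(output["last_hidden_state"]).any().item()
```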
From 268bf34630aaae4036dbe3e45a0e8a0fa75e18f9 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Tue, 4 Aug 2020 06:31:49 -0700
Subject: [PATCH 098/127] typo (#6225)
---
src/transformers/benchmark/benchmark_args_utils.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/transformers/benchmark/benchmark_args_utils.py b/src/transformers/benchmark/benchmark_args_utils.py
index cb94b70ac897..4904225805c3 100644
--- a/src/transformers/benchmark/benchmark_args_utils.py
+++ b/src/transformers/benchmark/benchmark_args_utils.py
@@ -62,8 +62,8 @@ class BenchmarkArguments:
fp16: bool = field(default=False, metadata={"help": "Use FP16 to accelerate inference."})
training: bool = field(default=False, metadata={"help": "Benchmark training of model"})
verbose: bool = field(default=False, metadata={"help": "Verbose memory tracing"})
- no_speed: bool = field(default=False, metadata={"help": "Don't perform speed measurments"})
- no_memory: bool = field(default=False, metadata={"help": "Don't perform memory measurments"})
+ no_speed: bool = field(default=False, metadata={"help": "Don't perform speed measurements"})
+ no_memory: bool = field(default=False, metadata={"help": "Don't perform memory measurements"})
trace_memory_line_by_line: bool = field(default=False, metadata={"help": "Trace memory line by line"})
save_to_csv: bool = field(default=False, metadata={"help": "Save result to a CSV file"})
log_print: bool = field(default=False, metadata={"help": "Save all print statements in a log file"})
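These flags are plain dataclass fields, so they can also be set programmatically. A hedged usage sketch (not from the patch; the model name and sizes are arbitrary, and running it downloads `bert-base-uncased`):

```python
from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments

args = PyTorchBenchmarkArguments(
    models=["bert-base-uncased"],
    batch_sizes=[1],
    sequence_lengths=[8],
    no_speed=False,  # do perform speed measurements
    no_memory=True,  # skip memory measurements
)
benchmark = PyTorchBenchmark(args)
results = benchmark.run()
```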
From d5b0a0e235cc6fccba4f9013cdb54cee01e90a91 Mon Sep 17 00:00:00 2001
From: Sam Shleifer
Date: Tue, 4 Aug 2020 09:53:51 -0400
Subject: [PATCH 099/127] mBART Conversion script (#6230)
---
..._original_pytorch_checkpoint_to_pytorch.py | 13 -------
...rt_mbart_original_checkpoint_to_pytorch.py | 36 +++++++++++++++++++
2 files changed, 36 insertions(+), 13 deletions(-)
create mode 100644 src/transformers/convert_mbart_original_checkpoint_to_pytorch.py
diff --git a/src/transformers/convert_bart_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/convert_bart_original_pytorch_checkpoint_to_pytorch.py
index bba7b5a76bb6..52efc88f61ae 100644
--- a/src/transformers/convert_bart_original_pytorch_checkpoint_to_pytorch.py
+++ b/src/transformers/convert_bart_original_pytorch_checkpoint_to_pytorch.py
@@ -78,19 +78,6 @@ def load_xsum_checkpoint(checkpoint_path):
return hub_interface
-def convert_checkpoint_from_disk(checkpoint_path, **config_kwargs):
- state_dict = torch.load(checkpoint_path, map_location="cpu")["model"]
- remove_ignore_keys_(state_dict)
- vocab_size = state_dict["encoder.embed_tokens.weight"].shape[0]
- state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"]
- mbart_config = BartConfig(vocab_size=vocab_size, **config_kwargs)
- model = BartForConditionalGeneration(mbart_config)
- model.model.load_state_dict(state_dict)
- if hasattr(model, "lm_head"):
- model.lm_head = _make_linear_from_emb(model.model.shared)
- return model
-
-
@torch.no_grad()
def convert_bart_checkpoint(checkpoint_path, pytorch_dump_folder_path, hf_checkpoint_name=None):
"""
diff --git a/src/transformers/convert_mbart_original_checkpoint_to_pytorch.py b/src/transformers/convert_mbart_original_checkpoint_to_pytorch.py
new file mode 100644
index 000000000000..e61395d0d4aa
--- /dev/null
+++ b/src/transformers/convert_mbart_original_checkpoint_to_pytorch.py
@@ -0,0 +1,36 @@
+import argparse
+
+import torch
+
+from transformers import BartForConditionalGeneration, MBartConfig
+
+from .convert_bart_original_pytorch_checkpoint_to_pytorch import remove_ignore_keys_
+
+
+def convert_fairseq_mbart_checkpoint_from_disk(checkpoint_path, hf_config_path="facebook/mbart-large-en-ro"):
+ state_dict = torch.load(checkpoint_path, map_location="cpu")["model"]
+ remove_ignore_keys_(state_dict)
+ vocab_size = state_dict["encoder.embed_tokens.weight"].shape[0]
+ mbart_config = MBartConfig.from_pretrained(hf_config_path, vocab_size=vocab_size)
+ state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"]
+ model = BartForConditionalGeneration(mbart_config)
+ model.model.load_state_dict(state_dict)
+ return model
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ # Required parameters
+ parser.add_argument(
+ "fairseq_path", type=str, help="bart.large, bart.large.cnn or a path to a model.pt on local filesystem."
+ )
+ parser.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
+ parser.add_argument(
+ "--hf_config",
+ default="facebook/mbart-large-cc25",
+ type=str,
+ help="Which huggingface architecture to use: bart-large-xsum",
+ )
+ args = parser.parse_args()
+ model = convert_fairseq_mbart_checkpoint_from_disk(args.fairseq_path, hf_config_path=args.hf_config)
+ model.save_pretrained(args.pytorch_dump_folder_path)
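A hedged usage sketch for the new conversion script. The paths are placeholders, and the module path is the one introduced by this patch, so adjust it to your installed version if the script has since moved:

```python
# Placeholder paths; a fairseq mBART checkpoint (model.pt) must already be on disk.
from transformers.convert_mbart_original_checkpoint_to_pytorch import (
    convert_fairseq_mbart_checkpoint_from_disk,
)

model = convert_fairseq_mbart_checkpoint_from_disk(
    "/path/to/mbart.cc25/model.pt",
    hf_config_path="facebook/mbart-large-cc25",  # HF config matching the fairseq checkpoint
)
model.save_pretrained("/path/to/mbart-cc25-hf")  # writes config.json and pytorch_model.bin
```

Because of the relative import at the top of the script, running it from the command line works best as a module, e.g. `python -m transformers.convert_mbart_original_checkpoint_to_pytorch /path/to/model.pt /path/to/mbart-cc25-hf --hf_config facebook/mbart-large-cc25`.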
From 6c9ba1d8fc3f0957ea7a1a59ef8622f8d54ac89a Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Tue, 4 Aug 2020 19:22:43 +0200
Subject: [PATCH 100/127] [Reformer] Make the random seed depend on CUDA generator
 availability rather than on the model device (#6244)
* improve if else statement random seeds
* Apply suggestions from code review
* Update src/transformers/modeling_reformer.py
---
src/transformers/modeling_reformer.py | 15 ++++++++-------
1 file changed, 8 insertions(+), 7 deletions(-)
diff --git a/src/transformers/modeling_reformer.py b/src/transformers/modeling_reformer.py
index f9e8ac76d6d1..6049dc3ed086 100644
--- a/src/transformers/modeling_reformer.py
+++ b/src/transformers/modeling_reformer.py
@@ -1399,15 +1399,16 @@ def _init_attention_seed(self):
"""
# randomize seeds
- if next(self.parameters()).device.type == "cuda":
+ # use cuda generator if available
+ if len(torch.cuda.default_generators) > 0:
# GPU
device_idx = torch.cuda.current_device()
self.attention_seed = torch.cuda.default_generators[device_idx].seed()
- torch.cuda.manual_seed(self.attention_seed)
else:
# CPU
self.attention_seed = int(torch.seed() % sys.maxsize)
- torch.manual_seed(self.attention_seed)
+
+ torch.manual_seed(self.attention_seed)
def _init_feed_forward_seed(self):
"""
@@ -1417,17 +1418,17 @@ def _init_feed_forward_seed(self):
call and 1 forward call in backward
to recalculate activations.
"""
-
# randomize seeds
- if next(self.parameters()).device.type == "cuda":
+ # use cuda generator if available
+ if len(torch.cuda.default_generators) > 0:
# GPU
device_idx = torch.cuda.current_device()
self.feed_forward_seed = torch.cuda.default_generators[device_idx].seed()
- torch.cuda.manual_seed(self.feed_forward_seed)
else:
# CPU
self.feed_forward_seed = int(torch.seed() % sys.maxsize)
- torch.manual_seed(self.feed_forward_seed)
+
+ torch.manual_seed(self.feed_forward_seed)
def forward(
self,
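The substance of the change: whether a CUDA default generator exists now decides where the seed comes from, instead of the device of the model parameters, and a single `torch.manual_seed` call then seeds both the CPU and CUDA RNGs. A standalone sketch of that pattern, not the model code itself, including the re-seeding trick the reversible layers rely on to recompute identical dropout masks during the backward pass:

```python
import sys
import torch
import torch.nn.functional as F

def draw_seed():
    # Prefer the CUDA default generator when one exists (same check as the patch);
    # on a CPU-only build, torch.cuda.default_generators is simply empty.
    if len(torch.cuda.default_generators) > 0:
        device_idx = torch.cuda.current_device()
        seed = torch.cuda.default_generators[device_idx].seed()
    else:
        seed = int(torch.seed() % sys.maxsize)
    torch.manual_seed(seed)  # seeds CPU and, if present, CUDA RNGs in one call
    return seed

# Forward pass: remember the seed, apply dropout.
seed = draw_seed()
first = F.dropout(torch.ones(4), p=0.5, training=True)

# Backward pass (recomputation): re-seed with the stored value to get the same mask.
torch.manual_seed(seed)
second = F.dropout(torch.ones(4), p=0.5, training=True)
assert torch.equal(first, second)
```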
From 5920a37a4c95b667f0a2962c4b4e727c323b07fa Mon Sep 17 00:00:00 2001
From: Timo Moeller
Date: Tue, 4 Aug 2020 19:40:49 +0200
Subject: [PATCH 101/127] Add license info to German Bert models (#6242)
* Add xlm-r QA model card
* Add tags
* Add license info to german bert
---
model_cards/bert-base-german-cased-README.md | 1 +
model_cards/deepset/bert-base-german-cased-oldvocab/README.md | 1 +
2 files changed, 2 insertions(+)
diff --git a/model_cards/bert-base-german-cased-README.md b/model_cards/bert-base-german-cased-README.md
index 8d616dc34c44..bb154a2ed74d 100644
--- a/model_cards/bert-base-german-cased-README.md
+++ b/model_cards/bert-base-german-cased-README.md
@@ -1,5 +1,6 @@
---
language: de
+license: mit
thumbnail: https://static.tildacdn.com/tild6438-3730-4164-b266-613634323466/german_bert.png
tags:
- exbert
diff --git a/model_cards/deepset/bert-base-german-cased-oldvocab/README.md b/model_cards/deepset/bert-base-german-cased-oldvocab/README.md
index 8fca3924bbd2..b1401379287c 100644
--- a/model_cards/deepset/bert-base-german-cased-oldvocab/README.md
+++ b/model_cards/deepset/bert-base-german-cased-oldvocab/README.md
@@ -1,5 +1,6 @@
---
language: de
+license: mit
thumbnail: https://static.tildacdn.com/tild6438-3730-4164-b266-613634323466/german_bert.png
tags:
- exbert
From 972535ea74c7b30987bc31c6621a2bbb58f82ca6 Mon Sep 17 00:00:00 2001
From: Joe Davison
Date: Tue, 4 Aug 2020 16:37:49 -0400
Subject: [PATCH 102/127] fix zero shot pipeline docs (#6245)
---
docs/source/main_classes/pipelines.rst | 8 ++++++++
src/transformers/__init__.py | 1 +
src/transformers/pipelines.py | 11 ++++++-----
3 files changed, 15 insertions(+), 5 deletions(-)
diff --git a/docs/source/main_classes/pipelines.rst b/docs/source/main_classes/pipelines.rst
index 067b7eca9308..6bcbd399e116 100644
--- a/docs/source/main_classes/pipelines.rst
+++ b/docs/source/main_classes/pipelines.rst
@@ -20,6 +20,7 @@ There are two categories of pipeline abstractions to be aware about:
- :class:`~transformers.TextGenerationPipeline`
- :class:`~transformers.TokenClassificationPipeline`
- :class:`~transformers.TranslationPipeline`
+ - :class:`~transformers.ZeroShotClassificationPipeline`
The pipeline abstraction
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -97,6 +98,13 @@ TokenClassificationPipeline
:special-members: __call__
:members:
+ZeroShotClassificationPipeline
+==========================================
+
+.. autoclass:: transformers.ZeroShotClassificationPipeline
+ :special-members: __call__
+ :members:
+
Parent class: :obj:`Pipeline`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 18f6d72cefa3..f14f032d191c 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -120,6 +120,7 @@
TextGenerationPipeline,
TokenClassificationPipeline,
TranslationPipeline,
+ ZeroShotClassificationPipeline,
pipeline,
)
diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py
index 3cd252fd8f4a..8538233b39b9 100755
--- a/src/transformers/pipelines.py
+++ b/src/transformers/pipelines.py
@@ -1033,30 +1033,31 @@ def __call__(self, sequences, candidate_labels, hypothesis_template="This exampl
Classify the sequence(s) given as inputs.
Args:
- sequences (:obj:`str` or obj:`List[str]`):
+ sequences (:obj:`str` or :obj:`List[str]`):
The sequence(s) to classify, will be truncated if the model input is too large.
- candidate_labels (:obj:`str` or obj:`List[str]`):
+ candidate_labels (:obj:`str` or :obj:`List[str]`):
The set of possible class labels to classify each sequence into. Can be a single label, a string of
comma-separated labels, or a list of labels.
- hypothesis_template (obj:`str`, `optional`, defaults to :obj:`"This example is {}."`):
+ hypothesis_template (:obj:`str`, `optional`, defaults to :obj:`"This example is {}."`):
The template used to turn each label into an NLI-style hypothesis. This template must include a {}
or similar syntax for the candidate label to be inserted into the template. For example, the default
template is :obj:`"This example is {}."` With the candidate label :obj:`"sports"`, this would be fed
into the model like :obj:`"<cls> sequence to classify <sep> This example is sports . <sep>"`. The
default template works well in many cases, but it may be worthwhile to experiment with different
templates depending on the task setting.
- multi_class (obj:`bool`, `optional`, defaults to :obj:`False`):
+ multi_class (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not multiple candidate labels can be true. If :obj:`False`, the scores are normalized
such that the sum of the label likelihoods for each sequence is 1. If :obj:`True`, the labels are
considered independent and probabilities are normalized for each candidate by doing a softmax of
the entailment score vs. the contradiction score.
+
Return:
A :obj:`dict` or a list of :obj:`dict`: Each result comes as a dictionary with the
following keys:
- **sequence** (:obj:`str`) -- The sequence for which this is the output.
- **labels** (:obj:`List[str]`) -- The labels sorted by order of likelihood.
- - **scores** (:obj:` List[float]`) -- The probabilities for each of the labels.
+ - **scores** (:obj:`List[float]`) -- The probabilities for each of the labels.
"""
outputs = super().__call__(sequences, candidate_labels, hypothesis_template)
num_sequences = 1 if isinstance(sequences, str) else len(sequences)
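To make the documented arguments concrete, a hedged usage example of the zero-shot pipeline; the checkpoint is only an example (any NLI-style model works) and the hypothesis template shown is illustrative:

```python
from transformers import pipeline

classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

result = classifier(
    "Who are you voting for in 2020?",
    candidate_labels=["politics", "public health", "economics"],
    hypothesis_template="This text is about {}.",  # optional; the default is "This example is {}."
)
# labels are sorted by likelihood; scores sum to 1 unless multi_class is enabled
print(result["labels"][0], result["scores"][0])
```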
From aa7c22a283083a1bf1e76eb6857fb8f934ddfdd6 Mon Sep 17 00:00:00 2001
From: Binny Mathew
Date: Wed, 5 Aug 2020 03:10:47 +0530
Subject: [PATCH 103/127] Update Model Card (#6246)
Added citation and paper links.
---
.../dehatebert-mono-arabic/README.md | 16 ++++++++++++++++
1 file changed, 16 insertions(+)
diff --git a/model_cards/Hate-speech-CNERG/dehatebert-mono-arabic/README.md b/model_cards/Hate-speech-CNERG/dehatebert-mono-arabic/README.md
index e90a6e80e9e0..4234735d79a2 100644
--- a/model_cards/Hate-speech-CNERG/dehatebert-mono-arabic/README.md
+++ b/model_cards/Hate-speech-CNERG/dehatebert-mono-arabic/README.md
@@ -1,2 +1,18 @@
This model is used for detecting **hatespeech** in the **Arabic language**. The "mono" in the name refers to the monolingual setting, where the model is trained using only Arabic-language data. It is fine-tuned on a multilingual BERT model.
The model is trained with different learning rates and the best validation score achieved is 0.877609 for a learning rate of 2e-5. Training code can be found at this [url](https://github.com/punyajoy/DE-LIMIT)
+
+### For more details about our paper
+
+Sai Saketh Aluru, Binny Mathew, Punyajoy Saha and Animesh Mukherjee. "[Deep Learning Models for Multilingual Hate Speech Detection](https://arxiv.org/abs/2004.06465)". Accepted at ECML-PKDD 2020.
+
+***Please cite our paper in any published work that uses any of these resources.***
+
+~~~
+@article{aluru2020deep,
+ title={Deep Learning Models for Multilingual Hate Speech Detection},
+ author={Aluru, Sai Saket and Mathew, Binny and Saha, Punyajoy and Mukherjee, Animesh},
+ journal={arXiv preprint arXiv:2004.06465},
+ year={2020}
+}
+
+~~~
From 3f30ebe6ca27b2cbad88c890ad5183b54f19db3c Mon Sep 17 00:00:00 2001
From: Manuel Romero