Commit

Merge pull request #6 from huggingface/master
Update from source
Pradhy729 authored Aug 6, 2020
2 parents 5a26a2d + ffceef2 commit c96bc33
Showing 231 changed files with 12,726 additions and 6,700 deletions.
25 changes: 16 additions & 9 deletions .circleci/config.yml
@@ -10,7 +10,8 @@ jobs:
 parallelism: 1
 steps:
 - checkout
-- run: sudo pip install .[sklearn,tf-cpu,torch,testing]
+- run: sudo pip install --upgrade pip
+- run: sudo pip install .[sklearn,tf-cpu,torch,testing] --no-cache-dir
 - run: sudo pip install codecov pytest-cov
 - run: python -m pytest -n 8 --dist=loadfile -s ./tests/ --cov | tee output.txt
 - run: codecov
@@ -27,7 +28,8 @@ jobs:
 parallelism: 1
 steps:
 - checkout
-- run: sudo pip install .[sklearn,torch,testing]
+- run: sudo pip install --upgrade pip
+- run: sudo pip install .[sklearn,torch,testing] --no-cache-dir
 - run: python -m pytest -n 8 --dist=loadfile -s ./tests/ | tee output.txt
 - store_artifacts:
 path: ~/transformers/output.txt
@@ -43,7 +45,8 @@ jobs:
 parallelism: 1
 steps:
 - checkout
-- run: sudo pip install .[sklearn,tf-cpu,testing]
+- run: sudo pip install --upgrade pip
+- run: sudo pip install .[sklearn,tf-cpu,testing] --no-cache-dir
 - run: python -m pytest -n 8 --dist=loadfile -s ./tests/ | tee output.txt
 - store_artifacts:
 path: ~/transformers/output.txt
@@ -56,7 +59,8 @@ jobs:
 RUN_CUSTOM_TOKENIZERS: yes
 steps:
 - checkout
-- run: sudo pip install .[mecab,testing]
+- run: sudo pip install --upgrade pip
+- run: sudo pip install .[ja,testing]
 - run: python -m pytest -s ./tests/test_tokenization_bert_japanese.py | tee output.txt
 - store_artifacts:
 path: ~/transformers/output.txt
@@ -71,9 +75,10 @@ jobs:
 parallelism: 1
 steps:
 - checkout
-- run: sudo pip install .[sklearn,torch,testing]
+- run: sudo pip install --upgrade pip
+- run: sudo pip install .[sklearn,torch,testing] --no-cache-dir
 - run: sudo pip install -r examples/requirements.txt
-- run: python -m pytest -n 8 --dist=loadfile -s ./examples/ | tee output.txt
+- run: python -m pytest -n 8 --dist=loadfile -rA -s ./examples/ | tee output.txt
 - store_artifacts:
 path: ~/transformers/output.txt
 destination: test_output.txt
@@ -83,7 +88,8 @@ jobs:
 - image: circleci/python:3.6
 steps:
 - checkout
-- run: sudo pip install .[tf,torch,docs]
+- run: sudo pip install --upgrade pip
+- run: sudo pip install .[tf,torch,docs] --no-cache-dir
 - run: cd docs && make html SPHINXOPTS="-W"
 - store_artifacts:
 path: ./docs/_build
@@ -96,7 +102,7 @@ jobs:
 fingerprints:
 - "5b:7a:95:18:07:8c:aa:76:4c:60:35:88:ad:60:56:71"
 - checkout
-- run: sudo pip install .[tf,torch,docs]
+- run: sudo pip install .[tf,torch,docs] --no-cache-dir
 - run: ./.circleci/deploy.sh
 check_code_quality:
 working_directory: ~/transformers
@@ -106,9 +112,10 @@ jobs:
 parallelism: 1
 steps:
 - checkout
+- run: sudo pip install --upgrade pip
 # we need a version of isort with https://github.com/timothycrosley/isort/pull/1000
 - run: sudo pip install git+git://github.com/timothycrosley/isort.git@e63ae06ec7d70b06df9e528357650281a3d3ec22#egg=isort
-- run: sudo pip install .[tf,torch,quality]
+- run: sudo pip install .[tf,torch,quality] --no-cache-dir
 - run: black --check --line-length 119 --target-version py35 examples templates tests src utils
 - run: isort --check-only --recursive examples templates tests src utils
 - run: flake8 examples templates tests src utils
55 changes: 40 additions & 15 deletions .github/ISSUE_TEMPLATE/bug-report.md
@@ -7,14 +7,51 @@ assignees: ''

 ---

 # 🐛 Bug

+## Environment info
+<!-- You can run the command `transformers-cli env` and copy-and-paste its output below.
+Don't forget to fill out the missing fields in that output! -->
+- `transformers` version:
+- Platform:
+- Python version:
+- PyTorch version (GPU?):
+- Tensorflow version (GPU?):
+- Using GPU in script?:
+- Using distributed or parallel set-up in script?:
+
+### Who can help
+<!-- Your issue will be replied to more quickly if you can figure out the right person to tag with @
+If you know how to use git blame, that is the easiest way, otherwise, here is a rough guide of **who to tag**.
+Please tag fewer than 3 people.
+albert, bert, GPT2, XLM: @LysandreJik
+tokenizers: @mfuntowicz
+Trainer: @sgugger
+Speed and Memory Benchmarks: @patrickvonplaten
+Model Cards: @julien-c
+Translation: @sshleifer
+Summarization: @sshleifer
+TextGeneration: @TevenLeScao
+examples/distillation: @VictorSanh
+nlp datasets: [different repo](https://github.com/huggingface/nlp)
+rust tokenizers: [different repo](https://github.com/huggingface/tokenizers)
+Text Generation: @TevenLeScao
+blenderbot: @mariamabarham
+Bart: @sshleifer
+Marian: @sshleifer
+T5: @patrickvonplaten
+Longformer/Reformer: @patrickvonplaten
+TransfoXL/XLNet: @TevenLeScao
+examples/seq2seq: @sshleifer
+tensorflow: @jplu
+documentation: @sgugger
+-->
+
 ## Information

 Model I am using (Bert, XLNet ...):

 Language I am using the model on (English, Chinese ...):

 The problem arises when using:
 * [ ] the official example scripts: (give details below)
 * [ ] my own modified scripts: (give details below)
@@ -38,15 +75,3 @@ Steps to reproduce the behavior:
 ## Expected behavior

 <!-- A clear and concise description of what you would expect to happen. -->
-
-## Environment info
-<!-- You can run the command `transformers-cli env` and copy-and-paste its output below.
-Don't forget to fill out the missing fields in that output! -->
-- `transformers` version:
-- Platform:
-- Python version:
-- PyTorch version (GPU?):
-- Tensorflow version (GPU?):
-- Using GPU in script?:
-- Using distributed or parallel set-up in script?:
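The template front-loads the output of `transformers-cli env`. As a rough sketch of the kind of report such a command assembles — field names taken from the template above, but the helper functions here are invented for illustration:

```python
import platform
import sys


def collect_env_info() -> dict:
    """Gather the fields the bug-report template asks for.

    Framework versions are probed defensively, since PyTorch and
    TensorFlow may not both be installed.
    """
    info = {
        "Platform": platform.platform(),
        "Python version": sys.version.split()[0],
        "PyTorch version (GPU?)": "not installed",
        "Tensorflow version (GPU?)": "not installed",
    }
    try:
        import torch  # optional dependency

        info["PyTorch version (GPU?)"] = f"{torch.__version__} ({torch.cuda.is_available()})"
    except ImportError:
        pass
    return info


def format_report(info: dict) -> str:
    # Render as the markdown bullet list the template expects.
    return "\n".join(f"- {key}: {value}" for key, value in info.items())


print(format_report(collect_env_info()))
```

The real CLI reports more fields (the installed `transformers` version, distributed setup, and so on); this only illustrates the copy-paste-ready shape the template asks for.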
3 changes: 2 additions & 1 deletion .github/workflows/github-torch-hub.yml
@@ -20,7 +20,8 @@ jobs:
 python-version: 3.7
 - name: Install dependencies
 run: |
-pip install torch
+pip install --upgrade pip
+pip install torch --no-cache-dir
 pip install numpy tokenizers filelock requests tqdm regex sentencepiece sacremoses packaging
 - name: Torch hub list
3 changes: 2 additions & 1 deletion .github/workflows/self-push.yml
@@ -35,7 +35,8 @@ jobs:
 - name: Install dependencies
 run: |
 source .env/bin/activate
-pip install torch
+pip install --upgrade pip
+pip install torch --no-cache-dir
 pip install .[sklearn,testing]
 - name: Are GPUs recognized by our DL frameworks
3 changes: 2 additions & 1 deletion .github/workflows/self-scheduled.yml
@@ -31,7 +31,8 @@ jobs:
 - name: Install dependencies
 run: |
 source .env/bin/activate
-pip install .[sklearn,torch,testing]
+pip install --upgrade pip
+pip install .[sklearn,torch,testing] --no-cache-dir
 - name: Are GPUs recognized by our DL frameworks
 run: |
15 changes: 11 additions & 4 deletions README.md
@@ -174,12 +174,19 @@ These implementations have been tested on several datasets (see the example scri

## Online demo

**[Write With Transformer](https://transformer.huggingface.co)**, built by the Hugging Face team at transformer.huggingface.co, is the official demo of this repo’s text generation capabilities.
You can use it to experiment with completions generated by `GPT2Model`, `TransfoXLModel`, and `XLNetModel`.
You can test our inference API on most model pages from the model hub: https://huggingface.co/models

For example:
- [Masked word completion with BERT](https://huggingface.co/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France)
- [NER with Electra](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city)
- [Text generation with GPT-2](https://huggingface.co/gpt2?text=A+long+time+ago%2C+)
- [NLI with RoBERTa](https://huggingface.co/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal)
- [Summarization with BART](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct)
- [Question answering with DistilBERT](https://huggingface.co/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species)
- [Translation with T5](https://huggingface.co/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin)

> “🦄 Write with transformer is to writing what calculators are to calculus.”

![write_with_transformer](https://transformer.huggingface.co/front/assets/thumbnail-large.png)
**[Write With Transformer](https://transformer.huggingface.co)**, built by the Hugging Face team at transformer.huggingface.co, is the official demo of this repo’s text generation capabilities.

## Quick tour

23 changes: 0 additions & 23 deletions deploy_multi_version_doc.sh

This file was deleted.

4 changes: 3 additions & 1 deletion docs/source/index.rst
@@ -157,8 +157,8 @@ conversion utilities for the following models:
 notebooks
 converting_tensorflow_models
 migration
-torchscript
 contributing
+serialization

.. toctree::
:maxdepth: 2
@@ -206,3 +206,5 @@ conversion utilities for the following models:
 model_doc/mobilebert
 model_doc/dpr
 internal/modeling_utils
+internal/tokenization_utils
+internal/pipelines_utils
40 changes: 40 additions & 0 deletions docs/source/internal/pipelines_utils.rst
@@ -0,0 +1,40 @@
Utilities for pipelines
-----------------------

This page lists all the utility functions the library provides for pipelines.

Most of those are only useful if you are studying the code of the models in the library.


Argument handling
~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.pipelines.ArgumentHandler

.. autoclass:: transformers.pipelines.ZeroShotClassificationArgumentHandler

.. autoclass:: transformers.pipelines.QuestionAnsweringArgumentHandler


Data format
~~~~~~~~~~~

.. autoclass:: transformers.pipelines.PipelineDataFormat
:members:

.. autoclass:: transformers.pipelines.CsvPipelineDataFormat
:members:

.. autoclass:: transformers.pipelines.JsonPipelineDataFormat
:members:

.. autoclass:: transformers.pipelines.PipedPipelineDataFormat
:members:


Utilities
~~~~~~~~~

.. autofunction:: transformers.pipelines.get_framework

.. autoclass:: transformers.pipelines.PipelineException
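The `ArgumentHandler` classes documented in this new page normalize the many call conventions a pipeline accepts into one flat list of inputs. A toy sketch of that pattern — an illustration of the idea, not the library's actual implementation:

```python
class SimpleArgumentHandler:
    """Normalize pipeline-style call arguments into a list of strings.

    Mirrors the role of transformers.pipelines.ArgumentHandler:
    accept a single string, several strings, or an explicit list of
    strings, and always hand the pipeline a flat list.
    """

    def __call__(self, *args):
        if len(args) == 1 and isinstance(args[0], list):
            inputs = args[0]        # called with an explicit list
        else:
            inputs = list(args)     # called with one or more strings
        if not inputs or not all(isinstance(x, str) for x in inputs):
            raise ValueError("expected one or more strings")
        return inputs


handler = SimpleArgumentHandler()
print(handler("hello"))             # a single positional string
print(handler("a", "b"))            # several positional strings
print(handler(["x", "y", "z"]))     # an explicit list
```

The task-specific handlers listed above (for question answering, zero-shot classification) follow the same shape but validate richer structures, such as question/context pairs.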
38 changes: 38 additions & 0 deletions docs/source/internal/tokenization_utils.rst
@@ -0,0 +1,38 @@
Utilities for Tokenizers
------------------------

This page lists all the utility functions used by the tokenizers, mainly the class
:class:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase` that implements the common methods between
:class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast` and the mixin
:class:`~transformers.tokenization_utils_base.SpecialTokensMixin`.

Most of those are only useful if you are studying the code of the tokenizers in the library.

``PreTrainedTokenizerBase``
~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.tokenization_utils_base.PreTrainedTokenizerBase
:special-members: __call__
:members:


``SpecialTokensMixin``
~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.tokenization_utils_base.SpecialTokensMixin
:members:


Enums and namedtuples
~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.tokenization_utils_base.ExplicitEnum

.. autoclass:: transformers.tokenization_utils_base.PaddingStrategy

.. autoclass:: transformers.tokenization_utils_base.TensorType

.. autoclass:: transformers.tokenization_utils_base.TruncationStrategy

.. autoclass:: transformers.tokenization_utils_base.CharSpan

.. autoclass:: transformers.tokenization_utils_base.TokenSpan
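`ExplicitEnum` and the strategy enums above exist so that a typo in a string argument (for example a `padding=` value) fails with a message listing the accepted choices. A minimal sketch of that idea — the class and member names here are illustrative, and the error wording is invented; see the autoclass entries for the real behavior:

```python
from enum import Enum


class ExplicitStrEnum(str, Enum):
    """Enum whose failed lookup lists the accepted values,
    in the spirit of the ExplicitEnum documented above."""

    @classmethod
    def _missing_(cls, value):
        raise ValueError(
            f"{value!r} is not a valid {cls.__name__}, "
            f"please select one of {[m.value for m in cls]}"
        )


class PaddingChoice(ExplicitStrEnum):
    LONGEST = "longest"
    MAX_LENGTH = "max_length"
    DO_NOT_PAD = "do_not_pad"


# Lookup by value works as usual; an unknown value raises a
# ValueError naming the valid options instead of a bare KeyError.
print(PaddingChoice("longest").value)
```

Subclassing `str` as well means members compare equal to their plain-string values, so user-facing APIs can accept either form.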
6 changes: 6 additions & 0 deletions docs/source/main_classes/model.rst
@@ -41,3 +41,9 @@ The other methods that are common to each model are defined in :class:`~transfor

 .. autoclass:: transformers.modeling_tf_utils.TFModelUtilsMixin
 :members:
+
+
+Generative models
+~~~~~~~~~~~~~~~~~
+
+Coming soon