Improve docs

huggingface · NielsRogge · Oct 13, 2021 · Sep 25, 2021 · Sep 29, 2021 · Sep 30, 2021
commit f3d9e9483d8d6b915260880312cdd58518e68cf4
diff --git a/docs/source/model_doc/trocr.rst b/docs/source/model_doc/trocr.rst
@@ -31,9 +31,10 @@ The original code can be found `here
 
 Tips:
 
-- TrOCR achieves state-of-the-art results on both printed and handwritten text recognition tasks, such as the `IAM
-  Handwriting dataset <https://fki.tic.heia-fr.ch/databases/iam-handwriting-database>`__. For more information, see the
-  `official models <https://huggingface.co/models?other=trocr>`__.
+- TrOCR is pre-trained in 2 stages before being fine-tuned on downstream datasets. It achieves state-of-the-art results
+  on both printed (e.g. the `SROIE dataset <https://paperswithcode.com/dataset/sroie>`__) and handwritten (e.g. the
+  `IAM Handwriting dataset <https://fki.tic.heia-fr.ch/databases/iam-handwriting-database>`__) text recognition tasks.
+  For more information, see the `official models <https://huggingface.co/models?other=trocr>`__.
 - TrOCR is always used within the :doc:`VisionEncoderDecoder <visionencoderdecoder>` framework.
 
 Inference
@@ -67,7 +68,7 @@ predicted token ids.
         >>> pixel_values = processor(image, return_tensors="pt").pixel_values
         >>> generated_ids = model.generate(pixel_values)
 
-        >>> generated_text = processor.batch_decode(generated_ids)
+        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
 
 See the `model hub <https://huggingface.co/models?filter=trocr>`__ to look for TrOCR checkpoints.

diff --git a/src/transformers/models/trocr/modeling_trocr.py b/src/transformers/models/trocr/modeling_trocr.py
@@ -152,10 +152,6 @@ def create_position_ids_from_input_ids(
         """
         Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
         symbols are ignored. This is modified from fairseq's `utils.make_positions`.
-
-        Args:
-            x: torch.Tensor x:
-        Returns: torch.Tensor
         """
         # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
         mask = input_ids.ne(padding_idx).int()

diff --git a/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py
@@ -245,11 +245,10 @@ def from_encoder_decoder_pretrained(
 
         Params:
             encoder_pretrained_model_name_or_path (:obj:`str`, `optional`):
-                Information necessary to initiate the encoder. Can be either:
+                Information necessary to initiate the image encoder. Can be either:
 
-                    - A string, the `model id` of a pretrained model hosted inside a model repo on huggingface.co.
-                      Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under
-                      a user or organization name, like ``dbmdz/bert-base-german-cased``.
+                    - A string, the `model id` of a pretrained model hosted inside a model repo on huggingface.co. An
+                      example is ``google/vit-base-patch16-224-in21k``.
                     - A path to a `directory` containing model weights saved using
                       :func:`~transformers.PreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``.
                     - A path or url to a `tensorflow index checkpoint file` (e.g., ``./tf_model/model.ckpt.index``). In
@@ -258,7 +257,7 @@ def from_encoder_decoder_pretrained(
                       a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
 
             decoder_pretrained_model_name_or_path (:obj:`str`, `optional`, defaults to `None`):
-                Information necessary to initiate the decoder. Can be either:
+                Information necessary to initiate the text decoder. Can be either:
 
                     - A string, the `model id` of a pretrained model hosted inside a model repo on huggingface.co.
                       Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under
@@ -400,10 +399,10 @@ def forward(
             >>> from PIL import Image
             >>> import torch
 
-            >>> processor = TrOCRProcessor.from_pretrained('microsoft/tr-ocr-base-iam')
-            >>> model = VisionEncoderDecoderModel.from_pretrained('microsoft/tr-ocr-base-iam')
+            >>> processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
+            >>> model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten')
 
-            >>> # load image
+            >>> # load image from the IAM dataset
             >>> url = "https://fki.tic.heia-fr.ch/static/img/a01-122-02.jpg"
             >>> image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
 
@@ -414,7 +413,7 @@ def forward(
 
             >>> # inference (generation)
             >>> generated_ids = model.generate(pixel_values)
-            >>> generated_text = processor.batch_decode(generated_ids)
+            >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
         """
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict