Support ONNX Runtime optimizations in exporters.onnx #807

Merged

Changes from 21 commits
89 changes: 89 additions & 0 deletions .github/workflows/test_exporters_gpu.yml
@@ -0,0 +1,89 @@
name: Exporters / Test GPU

on:
  workflow_dispatch:
  schedule:
    - cron: 0 1 */3 * * # at 1am every 3 days
  pull_request:
    types: [labeled]
  # uncomment to enable on PR merge on main branch:
  #push:
  #  branches:
  #    - main

jobs:
  start-runner:
    name: Start self-hosted EC2 runner
    runs-on: ubuntu-latest
    env:
      AWS_REGION: us-east-1
      EC2_AMI_ID: ami-0dc1c26161f869ed1
      EC2_INSTANCE_TYPE: g4dn.xlarge
      EC2_SUBNET_ID: subnet-859322b4,subnet-b7533b96,subnet-47cfad21,subnet-a396b2ad,subnet-06576a4b,subnet-df0f6180
      EC2_SECURITY_GROUP: sg-0bb210cd3ec725a13
      EC2_IAM_ROLE: optimum-ec2-github-actions-role
    outputs:
      label: ${{ steps.start-ec2-runner.outputs.label }}
      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v1
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ env.AWS_REGION }}
      - name: Start EC2 runner
        id: start-ec2-runner
        uses: philschmid/philschmid-ec2-github-runner@main
        with:
          mode: start
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          ec2-image-id: ${{ env.EC2_AMI_ID }}
          ec2-instance-type: ${{ env.EC2_INSTANCE_TYPE }}
          subnet-id: ${{ env.EC2_SUBNET_ID }}
          security-group-id: ${{ env.EC2_SECURITY_GROUP }}
          iam-role-name: ${{ env.EC2_IAM_ROLE }}
          aws-resource-tags: > # optional, requires additional permissions
            [
              {"Key": "Name", "Value": "ec2-optimum-github-runner"},
              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
            ]
  do-the-job:
    name: Setup
    needs: start-runner # required to start the main job when the runner is ready
    runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
    env:
      AWS_REGION: us-east-1
    steps:
      - name: Checkout
        uses: actions/checkout@v2
      - name: Build image
        run: |
          docker build -f tests/exporters/Dockerfile_exporters_gpu -t exporters-gpu .
      - name: Test with unittest within docker container
        run: |
          docker run --rm --gpus all -v $(pwd)/hf_cache:/root/.cache/huggingface --workdir=/workspace/optimum/tests exporters-gpu:latest

  stop-runner:
    name: Stop self-hosted EC2 runner
    needs:
      - start-runner # required to get output from the start-runner job
      - do-the-job # required to wait when the main job is done
    runs-on: ubuntu-latest
    env:
      AWS_REGION: us-east-1
    if: ${{ always() }} # required to stop the runner even if an error happened in the previous jobs
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v1
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ env.AWS_REGION }}
      - name: Stop EC2 runner
        uses: philschmid/philschmid-ec2-github-runner@main
        with:
          mode: stop
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          label: ${{ needs.start-runner.outputs.label }}
          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
4 changes: 2 additions & 2 deletions .github/workflows/test_exporters_slow.yml
@@ -30,11 +30,11 @@ jobs:
- name: Test with unittest
working-directory: tests
run: |
RUN_SLOW=1 pytest exporters -s -m "not tensorflow_test" --durations=0
RUN_SLOW=1 pytest exporters -s -m "not tensorflow_test and run_slow" --durations=0
Member:
Do we need to have both RUN_SLOW and run_slow?
I guess that run_slow is enough (meaning that we would not mark tests with @slow).

@fxmarty (Contributor, Author), Feb 23, 2023:
I think it is better to keep both, otherwise we would have to pass -m "not run_slow" in the other tests, which is a bit painful.

Having the run_slow mark allows running only the slow tests in the slow workflows (no need to run the others, since they already run on each commit).
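
For illustration only (a hedged sketch, not code from this PR; the test name and marker registration are invented), this is roughly how the two mechanisms discussed above combine, with a `skipif` standing in for optimum's own `slow` helper:

```python
# Illustrative sketch: a test gated both by the RUN_SLOW environment variable
# and by a `run_slow` pytest mark, so that `RUN_SLOW=1 pytest -m "run_slow"`
# both selects and enables it.
import os

import pytest

RUN_SLOW = os.environ.get("RUN_SLOW", "0") == "1"


@pytest.mark.skipif(not RUN_SLOW, reason="slow test, set RUN_SLOW=1 to run it")
@pytest.mark.run_slow  # the marker would normally be registered in the pytest config
def test_slow_exporter_case():
    # Placeholder body standing in for a slow ONNX export test.
    assert True
```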

- name: Install dependencies for tensorflow export
run: |
pip install .[tests,exporters-tf]
- name: Test with unittest
working-directory: tests
run: |
RUN_SLOW=1 pytest exporters -s -m "tensorflow_test" --durations=0
RUN_SLOW=1 pytest exporters -s -m "tensorflow_test and run_slow" --durations=0
30 changes: 16 additions & 14 deletions docs/source/exporters/onnx/usage_guides/export_a_model.mdx
@@ -72,9 +72,10 @@ optimum-cli export onnx --help

usage: optimum-cli <command> [<args>] export onnx [-h] -m MODEL [--task TASK] [--monolith] [--device DEVICE] [--opset OPSET] [--atol ATOL]
[--framework {pt,tf}] [--pad_token_id PAD_TOKEN_ID] [--cache_dir CACHE_DIR] [--trust-remote-code]
[--batch_size BATCH_SIZE] [--sequence_length SEQUENCE_LENGTH] [--num_choices NUM_CHOICES] [--width WIDTH]
[--height HEIGHT] [--num_channels NUM_CHANNELS] [--feature_size FEATURE_SIZE]
[--nb_max_frames NB_MAX_FRAMES] [--audio_sequence_length AUDIO_SEQUENCE_LENGTH]
[--no-post-process] [--optimize {O1,O2,O3,O4}] [--batch_size BATCH_SIZE]
[--sequence_length SEQUENCE_LENGTH] [--num_choices NUM_CHOICES] [--width WIDTH] [--height HEIGHT]
[--num_channels NUM_CHANNELS] [--feature_size FEATURE_SIZE] [--nb_max_frames NB_MAX_FRAMES]
[--audio_sequence_length AUDIO_SEQUENCE_LENGTH]
output

optional arguments:
@@ -86,24 +87,25 @@ Required arguments:
output Path indicating the directory where to store generated ONNX model.

Optional arguments:
--task TASK The task to export the model for. If not specified, the task will be auto-inferred based on the model. Available tasks depend on
the model, but are among: ['default', 'masked-lm', 'causal-lm', 'seq2seq-lm', 'sequence-classification', 'token-classification',
'multiple-choice', 'object-detection', 'question-answering', 'image-classification', 'image-segmentation', 'masked-im', 'semantic-
segmentation', 'speech2seq-lm', 'audio-classification', 'audio-frame-classification', 'audio-ctc', 'audio-xvector', 'vision2seq-
lm', 'stable-diffusion']. For decoder models, use `xxx-with-past` to export the model using past key values in the decoder.
--monolith Force to export the model as a single ONNX file. By default, the ONNX exporter may break the model in several ONNX files, for
example for encoder-decoder models where the encoder should be run only once while thedecoder is looped over.
--task TASK The task to export the model for. If not specified, the task will be auto-inferred based on the model. Available tasks depend on the model, but are among: ['default', 'masked-lm', 'causal-lm', 'seq2seq-lm', 'sequence-classification', 'token-classification', 'multiple-choice', 'object-detection', 'question-answering', 'image-classification', 'image-segmentation', 'masked-im', 'semantic-segmentation', 'speech2seq-lm', 'audio-classification', 'audio-frame-classification', 'audio-ctc', 'audio-xvector', 'vision2seq-lm', 'stable-diffusion', 'zero-shot-object-detection']. For decoder models, use `xxx-with-past` to export the model using past key values in the decoder.
--monolith Force to export the model as a single ONNX file. By default, the ONNX exporter may break the model in several ONNX files, for example for encoder-decoder models where the encoder should be run only once while the decoder is looped over.
--device DEVICE The device to use to do the export. Defaults to "cpu".
--opset OPSET If specified, ONNX opset version to export the model with. Otherwise, the default opset will be used.
--atol ATOL If specified, the absolute difference tolerance when validating the model. Otherwise, the default atol for the model will be used.
--framework {pt,tf} The framework to use for the ONNX export. If not provided, will attempt to use the local checkpoint's original framework or what is
available in the environment.
--framework {pt,tf} The framework to use for the ONNX export. If not provided, will attempt to use the local checkpoint's original framework or what is available in the environment.
--pad_token_id PAD_TOKEN_ID
This is needed by some models, for some tasks. If not provided, will attempt to use the tokenizer to guess it.
--cache_dir CACHE_DIR
Path indicating where to store cache.
--trust-remote-code Allow to use custom code for the modeling hosted in the model repository. This option should only be set for repositories you trust
and in which you have read the code, as it will execute on your local machine arbitrary code present in the model repository.
--trust-remote-code Allows to use custom code for the modeling hosted in the model repository. This option should only be set for repositories you trust and in which you have read the code, as it will execute on your local machine arbitrary code present in the model repository.
--no-post-process Allows to disable any post-processing done by default on the exported ONNX models. For example, the merging of decoder and decoder-with-past models into a single ONNX model file to reduce memory usage.
--optimize {O1,O2,O3,O4}
Allows to run ONNX Runtime optimizations directly during the export. Some of these optimizations are specific to ONNX Runtime, and the resulting ONNX will not be usable with other runtimes such as OpenVINO or TensorRT. Possible options:
- O1: Basic general optimizations
- O2: Basic and extended general optimizations, transformers-specific fusions
- O3: Same as O2 with GELU approximation
- O4: Same as O3 with mixed precision (fp16, GPU-only, requires `--device cuda`)

```

Exporting a checkpoint can be done as follows:
89 changes: 51 additions & 38 deletions docs/source/onnxruntime/usage_guides/optimization.mdx
@@ -14,46 +14,25 @@ specific language governing permissions and limitations under the License.

🤗 Optimum provides an `optimum.onnxruntime` package that enables you to apply graph optimization on many models hosted on the 🤗 hub using the [ONNX Runtime](https://github.com/microsoft/onnxruntime/tree/master/onnxruntime/python/tools/transformers) model optimization tool.

## Optimizing a model during the ONNX export

## Optimizing a model to be used with Optimum's CLI
The ONNX model can be optimized directly during the ONNX export by passing the argument `--optimize {O1,O2,O3,O4}` to the Optimum CLI, for example:

The Optimum ONNX Runtime optimization tool can be used through Optimum command-line interface:

```bash
optimum-cli onnxruntime optimize --help
usage: optimum-cli <command> [<args>] onnxruntime optimize [-h] --onnx_model ONNX_MODEL [-o OUTPUT] (-O1 | -O2 | -O3 | -O4)

options:
-h, --help show this help message and exit
-O1 Basic general optimizations (see: https://huggingface.co/docs/optimum/onnxruntime/usage_guides/optimization for more details).
-O2 Basic and extended general optimizations, transformers-specific fusions (see: https://huggingface.co/docs/optimum/onnxruntime/usage_guides/optimization for more details).
-O3 Same as O2 with Gelu approximation (see: https://huggingface.co/docs/optimum/onnxruntime/usage_guides/optimization for more details).
-O4 Same as O3 with mixed precision (see: https://huggingface.co/docs/optimum/onnxruntime/usage_guides/optimization for more details).
-c, --config `ORTConfig` file to use to optimize the model.

Required arguments:
--onnx_model ONNX_MODEL
Path indicating where the ONNX models to optimize are located.

Optional arguments:
-o OUTPUT, --output OUTPUT
Path indicating the directory where to store generated ONNX model. (defaults to --onnx_model value).
```

Optimizing an ONNX model can be done as follows:

```bash
optimum-cli onnxruntime optimize --onnx_model onnx_model_location/ -O1
optimum-cli export onnx --model gpt2 --optimize O3 gpt2_onnx/
```

This optimize all the ONNX files in `onnx_model_location` with the basic general optimizations. The optimized models will be created in the same directory by default unless the `--output` argument is specified.

The optimization levels are:
- O1: basic general optimizations.
- O2: basic and extended general optimizations, transformers-specific fusions.
- O3: same as O2 with GELU approximation.
- O4: same as O3 with mixed precision (fp16, GPU-only, requires `--device cuda`).
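
As an aside (an illustrative sketch that is not part of the diff; the `gpt2_onnx/` directory is the output of the export command above), the optimized export can then be loaded back for inference:

```python
# Minimal sketch, assuming the command above produced a gpt2_onnx/ directory
# containing the O3-optimized ONNX model along with its config and tokenizer files.
from transformers import AutoTokenizer

from optimum.onnxruntime import ORTModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("gpt2_onnx")
model = ORTModelForCausalLM.from_pretrained("gpt2_onnx")

inputs = tokenizer("Hello, my dog is", return_tensors="pt")
generated = model.generate(**inputs, max_new_tokens=10)
print(tokenizer.batch_decode(generated, skip_special_tokens=True))
```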

## Optimizing a model to be used with Optimum's `ORTOptimizer`
## Optimizing a model programmatically with `ORTOptimizer`

The [`~onnxruntime.ORTOptimizer`] class is used to optimize your ONNX model. The class can be initialized using the `from_pretrained()` method, which supports different checkpoint formats.
ONNX models can be optimized with the [`~onnxruntime.ORTOptimizer`]. The class can be initialized using the [`~onnxruntime.ORTOptimizer.from_pretrained`] method, which supports different checkpoint formats.

1. Using an already initialized `ORTModelForXXX` class.
1. Using an already initialized [`~onnxruntime.ORTModel`] class.

```python
>>> from optimum.onnxruntime import ORTOptimizer, ORTModelForSequenceClassification
@@ -76,7 +55,8 @@ The [`~onnxruntime.ORTOptimizer`] class is used to optimize your ONNX model. The
>>> optimizer = ORTOptimizer.from_pretrained("path/to/model") # doctest: +SKIP
```

## Optimization Configuration

### Optimization Configuration

The [`~onnxruntime.configuration.OptimizationConfig`] class allows to specify how the optimization should be performed by the [`~onnxruntime.ORTOptimizer`].

@@ -99,11 +79,11 @@ Here is a list of the possible optimizations you can enable:
- Add Bias and Gelu / FastGelu fusion with `disable_bias_gelu_fusion=False`,
- Gelu approximation with `enable_gelu_approximation=True`.
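
For instance (an illustrative sketch with arbitrarily chosen values), these flags map directly onto [`~onnxruntime.configuration.OptimizationConfig`] arguments:

```python
# Illustrative only: an OptimizationConfig combining some of the options above.
from optimum.onnxruntime.configuration import OptimizationConfig

optimization_config = OptimizationConfig(
    optimization_level=2,            # basic and extended general optimizations
    enable_gelu_approximation=True,  # Gelu approximation
    disable_bias_gelu_fusion=False,  # keep Add Bias and Gelu / FastGelu fusion enabled
)
```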

While [`~onnxruntime.configuration.OptimizationConfig`] gives you full control on how to do optimization, it can be hard to know what to enable / disable. Instead, you can use [`~onnxruntime.configuration.AutoOptimizationConfig`] which provides 3 common optimizations levels:
While [`~onnxruntime.configuration.OptimizationConfig`] gives you full control on how to do optimization, it can be hard to know what to enable / disable. Instead, you can use [`~onnxruntime.configuration.AutoOptimizationConfig`] which provides four common optimization levels:
- O1: basic general optimizations.
- O2: basic and extended general optimizations, `transformers`-specific fusions.
- O3: same as O2 with Gelu approximation.
- O4: same as O3 with mixed precision.
- O2: basic and extended general optimizations, transformers-specific fusions.
- O3: same as O2 with GELU approximation.
- O4: same as O3 with mixed precision (fp16, GPU-only).

Example: Loading a O2 [`~onnxruntime.configuration.OptimizationConfig`]

@@ -120,7 +100,7 @@ You can also specify custom argument that were not defined in the O2 configurati
```


## Optimization examples
### Optimization examples

Below you will find an easy end-to-end example on how to optimize [distilbert-base-uncased-finetuned-sst-2-english](https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).

@@ -176,3 +156,36 @@ Below you will find an easy end-to-end example on how to optimize a Seq2Seq mode
>>> tokens = tokenizer("This is a sample input", return_tensors="pt")
>>> outputs = optimized_model.generate(**tokens)
```

## Optimizing a model with Optimum CLI

The Optimum ONNX Runtime optimization tools can be used directly through the Optimum command-line interface:

```bash
optimum-cli onnxruntime optimize --help
usage: optimum-cli <command> [<args>] onnxruntime optimize [-h] --onnx_model ONNX_MODEL [-o OUTPUT] (-O1 | -O2 | -O3 | -O4)

options:
-h, --help show this help message and exit
-O1 Basic general optimizations (see: https://huggingface.co/docs/optimum/onnxruntime/usage_guides/optimization for more details).
-O2 Basic and extended general optimizations, transformers-specific fusions (see: https://huggingface.co/docs/optimum/onnxruntime/usage_guides/optimization for more details).
-O3 Same as O2 with Gelu approximation (see: https://huggingface.co/docs/optimum/onnxruntime/usage_guides/optimization for more details).
-O4 Same as O3 with mixed precision (see: https://huggingface.co/docs/optimum/onnxruntime/usage_guides/optimization for more details).
-c, --config `ORTConfig` file to use to optimize the model.

Required arguments:
--onnx_model ONNX_MODEL
Path indicating where the ONNX models to optimize are located.

Optional arguments:
-o OUTPUT, --output OUTPUT
Path indicating the directory where to store generated ONNX model. (defaults to --onnx_model value).
```

Optimizing an ONNX model can be done as follows:

```bash
optimum-cli onnxruntime optimize --onnx_model onnx_model_location/ -O1
```

This optimizes all the ONNX files in `onnx_model_location` with the basic general optimizations. The optimized models will be created in the same directory by default unless the `--output` argument is specified.
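
As a follow-up sketch (the directory and file names below are assumptions for illustration, not taken from the diff), the optimized file can then be loaded by pointing `from_pretrained` at it:

```python
# Minimal sketch, assuming the command above wrote model_optimized.onnx
# next to the original ONNX file in onnx_model_location/.
from optimum.onnxruntime import ORTModelForSequenceClassification

model = ORTModelForSequenceClassification.from_pretrained(
    "onnx_model_location/", file_name="model_optimized.onnx"
)
```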
6 changes: 4 additions & 2 deletions optimum/commands/export/__init__.py
@@ -13,7 +13,7 @@
# limitations under the License.

import sys
from argparse import ArgumentParser
from argparse import ArgumentParser, RawTextHelpFormatter

from .. import BaseOptimumCLICommand
from .onnx import ONNXExportCommand, parse_args_onnx
@@ -36,7 +36,9 @@ def register_subcommand(parser: ArgumentParser):
)
export_sub_parsers = export_parser.add_subparsers()

onnx_parser = export_sub_parsers.add_parser("onnx", help="Export PyTorch and TensorFlow to ONNX.")
onnx_parser = export_sub_parsers.add_parser(
"onnx", help="Export PyTorch and TensorFlow to ONNX.", formatter_class=RawTextHelpFormatter
)

parse_args_onnx(onnx_parser)
onnx_parser.set_defaults(func=onnx_export_factory)
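
For context, a standalone illustration (not code from this PR) of why `RawTextHelpFormatter` is used here: argparse's default formatter would reflow the multi-line `--optimize` help text into a single paragraph, while `RawTextHelpFormatter` preserves its newlines:

```python
# Standalone illustration: RawTextHelpFormatter keeps the explicit newlines
# used in a multi-line help string, which the default HelpFormatter would reflow.
from argparse import ArgumentParser, RawTextHelpFormatter

parser = ArgumentParser(prog="demo", formatter_class=RawTextHelpFormatter)
parser.add_argument(
    "--optimize",
    choices=["O1", "O2", "O3", "O4"],
    help=(
        "Optimization level:\n"
        " - O1: basic general optimizations\n"
        " - O2: O1 + transformers-specific fusions\n"
        " - O3: O2 + GELU approximation\n"
        " - O4: O3 + mixed precision (fp16)"
    ),
)
parser.print_help()
```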
13 changes: 13 additions & 0 deletions optimum/commands/export/onnx.py
@@ -100,6 +100,19 @@ def parse_args_onnx(parser):
" and decoder-with-past models into a single ONNX model file to reduce memory usage."
),
)
optional_group.add_argument(
"--optimize",
type=str,
default=None,
choices=["O1", "O2", "O3", "O4"],
Member:
So no possibility of providing an ORTConfig?

@fxmarty (Contributor, Author):
I will in a next PR.
help=(
"Allows to run ONNX Runtime optimizations directly during the export. Some of these optimizations are specific to ONNX Runtime, and the resulting ONNX will not be usable with other runtime as OpenVINO or TensorRT. Possible options:\n"
" - O1: Basic general optimizations\n"
" - O2: Basic and extended general optimizations, transformers-specific fusions\n"
" - O3: Same as O2 with GELU approximation\n"
" - O4: Same as O3 with mixed precision (fp16, GPU-only, requires `--device cuda`)"
),
)

input_group = parser.add_argument_group(
"Input shapes (if necessary, this allows to override the shapes of the input given to the ONNX exporter, that requires an example input.)"
Expand Down