Update ort CIs (slow, gpu, train) (#2024)
* update ort CIs

* fix train ci

* fix gpu ci

* gpus all

* devel

* enable trt

* fix

* fix

* fix

* test

* rename

* change instance

* test

* use available

* update

* shorter labels as well

* add onnxruntime-training

* fix onnxruntime package checking

* fix typo

* fix typo

* remove torch version

* fix trainer

* fixed trt ep by using trt docker image (the only way to make sure everything works)

* latest trt version

* remove pkv speedup timing since never used

* trust remote code for training datasets

* remove rocm from diffusers tests

* move ort training tests to onnxruntime-training

* fix ort training

* fix

* style

* always assert closeness and not equality (see the tolerance sketch after this list)

* fixed perceiver

* fixed missing position ids when attn mask is given (see the position-ids sketch after this list)

* remove num_labels from output shapes as it's not a dynamic axis

* raise error on missing mandatory inputs

* added atol and rtol as part of the ORTModelTestMixin class

* fix segformer image segmentation

* style

* fix vision encoder io binding

* hot fix io binding, remove its dependency on the order of inputs and make sure it's actually being tested

* fix

* typo

* unify io binding api with non io binding

* force evaluated shape to int

* mark pix2struct io binding tests

* force contiguity in forward pass (see the io-binding sketch after this list)

* fixed cryptic contiguity problems

* fix some

* fix vision2seq modeling and testing

* Update setup.py

* update import utils

* Update optimum/onnxruntime/modeling_ort.py

* fix vision encoder decoder io binding

* enable bigbird and bigbird pegasus and separate timm slow tests to untangle them

* use bigger machine for slow tests

* lower atol and rtol for image classification logits

* fix

* large

* enable more Longformer and MCTCT

* enable commented models in export as well

* uncomment timm slow models, big bird optimization and marian pkv comparison

* fix whisper/speech_to_text test and make convolution deterministic (see the determinism sketch after this list)

* pin torch for ort training

* ctc and speech also use convolution so have to be deterministic

* revert vision2seq atol
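A minimal sketch of the closeness-based assertion pattern referenced above, with atol/rtol held as class attributes the way a test mixin can carry them (the names and defaults here are illustrative, not the actual ORTModelTestMixin API):

import torch

class ToleranceMixin:
    # hypothetical defaults; task-specific test classes could override them
    ATOL = 1e-4
    RTOL = 1e-4

    def assert_outputs_close(self, ort_logits: torch.Tensor, pt_logits: torch.Tensor):
        # closeness rather than equality: ORT and PyTorch kernels accumulate
        # floating-point rounding differently, so bitwise equality is too strict
        torch.testing.assert_close(ort_logits, pt_logits, atol=self.ATOL, rtol=self.RTOL)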
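For the position-ids fix, the usual recipe when a model receives an attention mask but no position ids is to derive them from the mask's cumulative sum; a sketch of that common transformers-style pattern (not necessarily the exact code in this commit):

import torch

def position_ids_from_attention_mask(attention_mask: torch.Tensor) -> torch.Tensor:
    # count positions over non-padding tokens only
    position_ids = attention_mask.long().cumsum(-1) - 1
    # padding slots get a dummy (but valid) index so embedding lookups never go negative
    position_ids.masked_fill_(attention_mask == 0, 1)
    return position_ids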
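The contiguity fixes matter because IO binding hands ONNX Runtime a raw device pointer: a non-contiguous view would be read in the wrong memory order. A sketch of the binding pattern, assuming fp32 inputs (bind_torch_tensor is an illustrative helper, not optimum's API):

import numpy as np
import torch

def bind_torch_tensor(io_binding, name: str, tensor: torch.Tensor):
    # .contiguous() is required before handing out data_ptr(): ORT only sees
    # the pointer and the shape, not the tensor's strides
    tensor = tensor.contiguous()
    io_binding.bind_input(
        name,
        tensor.device.type,      # "cuda" or "cpu"
        tensor.device.index or 0,
        np.float32,              # assuming fp32 inputs
        tuple(tensor.shape),
        tensor.data_ptr(),
    )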
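Making convolution deterministic for the whisper/speech_to_text and CTC comparisons typically comes down to disabling cuDNN autotuning, e.g.:

import torch

torch.backends.cudnn.deterministic = True  # restrict cuDNN to deterministic conv algorithms
torch.backends.cudnn.benchmark = False     # autotuning can otherwise pick non-deterministic kernels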
IlyasMoutawwakil authored Jan 29, 2025
1 parent d1bcdf7 commit b755036
Showing 43 changed files with 1,550 additions and 1,478 deletions.
22 changes: 14 additions & 8 deletions .github/workflows/test_export_onnx_cli.yml
@@ -2,9 +2,11 @@ name: Exporters ONNX CLI / Python - Test

 on:
   push:
-    branches: [main]
+    branches:
+      - main
   pull_request:
-    branches: [main]
+    branches:
+      - main

 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -19,16 +21,20 @@ jobs:
         os: [ubuntu-20.04]

     runs-on: ${{ matrix.os }}
+
     steps:
-      - uses: actions/checkout@v2
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
       - name: Setup Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python-version }}
-      - name: Install dependencies for pytorch export
+
+      - name: Install dependencies
         run: |
           pip install .[tests,exporters,diffusers]
-      - name: Test with unittest
-        working-directory: tests
+
+      - name: Test with pytest
         run: |
-          pytest exporters/onnx/test_exporters_onnx_cli.py -n auto -m "not tensorflow_test and not timm_test" -s --durations=0
+          pytest tests/exporters/onnx/test_exporters_onnx_cli.py -n auto -m "not tensorflow_test and not timm_test" -s --durations=0
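The -m expression in this job relies on custom pytest markers; for the filter to work without warnings the markers have to be registered, along these lines (registering them in conftest.py is an assumption, they could equally live in setup.cfg or pyproject.toml):

# conftest.py (sketch)
def pytest_configure(config):
    config.addinivalue_line("markers", "tensorflow_test: test that requires tensorflow")
    config.addinivalue_line("markers", "timm_test: test that exercises timm models")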
12 changes: 6 additions & 6 deletions .github/workflows/test_onnxruntime.yml
@@ -1,12 +1,12 @@
-# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
-# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
 name: ONNX Runtime / Python - Test

 on:
   push:
-    branches: [main]
+    branches:
+      - main
   pull_request:
-    branches: [main]
+    branches:
+      - main

 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -58,10 +58,10 @@ jobs:
       - name: Test with pytest (in series)
         run: |
-          pytest tests/onnxruntime -m "run_in_series" --durations=0 -vvvv -s
+          pytest tests/onnxruntime -m "run_in_series" --durations=0 -vvvv
       - name: Test with pytest (in parallel)
         run: |
-          pytest tests/onnxruntime -m "not run_in_series" --durations=0 -vvvv -s -n auto
+          pytest tests/onnxruntime -m "not run_in_series" --durations=0 -vvvv -n auto
         env:
           HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
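On the test side, the series/parallel split is driven by the marker alone; a sketch of how a test opts out of pytest-xdist parallelism (test names are illustrative):

import pytest

@pytest.mark.run_in_series  # e.g. a test that monopolizes GPU memory or a fixed port
def test_large_model_end_to_end():
    ...

def test_config_parsing():  # unmarked, picked up by the "-n auto" parallel job
    ...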
58 changes: 41 additions & 17 deletions .github/workflows/test_onnxruntime_gpu.yml
@@ -1,30 +1,54 @@
-name: ONNX Runtime / Test GPU
+name: ONNX Runtime GPU / Python - Test

 on:
   workflow_dispatch:
   schedule:
-    - cron: 0 1 */3 * * # at 1am every 3 days
+    - cron: 0 7 * * * # every day at 7am UTC
   pull_request:
-    types: [opened, synchronize, reopened, labeled]
-  # uncomment to enable on PR merge on main branch:
-  #push:
-  #  branches:
-  #    - main
+    branches:
+      - main
+    types:
+      - opened
+      - labeled
+      - reopened
+      - unlabeled
+      - synchronize

 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
   cancel-in-progress: true

 jobs:
-  do-the-job:
-    if: ${{ (github.event_name == 'workflow_dispatch') || (github.event_name == 'schedule') || contains( github.event.pull_request.labels.*.name, 'gpu-test') }}
-    name: Start self-hosted EC2 runner
-    env:
-      AWS_REGION: us-east-1
+  build:
+    if: ${{
+      (github.event_name == 'push') ||
+      (github.event_name == 'workflow_dispatch') ||
+      contains(github.event.pull_request.labels.*.name, 'gpu') ||
+      contains(github.event.pull_request.labels.*.name, 'onnxruntime-gpu')
+      }}
+
+    runs-on:
+      group: aws-g6-4xlarge-plus
+
+    container:
+      image: nvcr.io/nvidia/tensorrt:24.12-py3
+      options: --gpus all

     steps:
       - name: Checkout
-        uses: actions/checkout@v2
-      - name: Build image
+        uses: actions/checkout@v4
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.9"
+
+      - name: Install dependencies
         run: |
-          docker build -f tests/onnxruntime/docker/Dockerfile_onnxruntime_gpu -t onnxruntime-gpu .
-      - name: Test with unittest within docker container
+          pip install --upgrade pip
+          pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
+          pip install .[tests,onnxruntime-gpu,diffusers]
+
+      - name: Test with pytest
         run: |
-          docker run --rm --gpus all -v /mnt/cache/.cache/huggingface:/root/.cache/huggingface --workdir=/workspace/optimum/tests onnxruntime-gpu:latest
+          pytest tests/onnxruntime -m "cuda_ep_test or trt_ep_test" --durations=0 -vvvv -n auto
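The cuda_ep_test/trt_ep_test markers assume the corresponding execution providers actually exist inside the TensorRT container; a quick sanity check, with an illustrative model id:

import onnxruntime

# expect something like ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']
print(onnxruntime.get_available_providers())

from optimum.onnxruntime import ORTModelForSequenceClassification

model = ORTModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english",  # illustrative model id
    export=True,
    provider="TensorrtExecutionProvider",
)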
57 changes: 37 additions & 20 deletions .github/workflows/test_onnxruntime_slow.yml
@@ -1,33 +1,50 @@
-name: ONNX Runtime slow / Python - Test
+name: ONNX Runtime Slow / Python - Test

 on:
   workflow_dispatch:
   schedule:
-    - cron: 0 7 * * * # every day at 7am
+    - cron: 0 7 * * * # every day at 7am UTC
+  pull_request:
+    branches:
+      - main
+    types:
+      - opened
+      - labeled
+      - reopened
+      - unlabeled
+      - synchronize

 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
   cancel-in-progress: true

 jobs:
   build:
-    strategy:
-      fail-fast: false
-      matrix:
-        python-version: ["3.9"]
-        os: [ubuntu-20.04]
+    if: ${{
+      (github.event_name == 'push') ||
+      (github.event_name == 'workflow_dispatch') ||
+      contains(github.event.pull_request.labels.*.name, 'slow') ||
+      contains(github.event.pull_request.labels.*.name, 'onnxruntime-slow')
+      }}
+
+    runs-on:
+      group: aws-general-8-plus

-    runs-on: ${{ matrix.os }}
     steps:
-      - uses: actions/checkout@v2
-      - name: Setup Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v2
-        with:
-          python-version: ${{ matrix.python-version }}
-      - name: Install dependencies for export
-        run: |
-          pip install .[tests,onnxruntime,diffusers]
-      - name: Test with unittest
-        working-directory: tests
-        run: |
-          RUN_SLOW=1 pytest onnxruntime -s -m "run_slow" --durations=0
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Setup Python 3.9
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.9"
+
+      - name: Install dependencies
+        run: |
+          pip install --upgrade pip
+          pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+          pip install .[tests,onnxruntime,diffusers]
+
+      - name: Test with pytest
+        run: |
+          RUN_SLOW=1 pytest tests/onnxruntime -m "run_slow" --durations=0 -vvvv
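RUN_SLOW=1 only has an effect if the tests gate themselves on it; one common way such gating is implemented (this helper is illustrative, not necessarily optimum's):

import os
import unittest

def require_slow(test_case):
    # run only when the environment explicitly opts into slow tests
    return unittest.skipUnless(os.environ.get("RUN_SLOW", "0") == "1", "slow test, set RUN_SLOW=1")(test_case)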
26 changes: 0 additions & 26 deletions .github/workflows/test_onnxruntime_train.yml

This file was deleted.

66 changes: 66 additions & 0 deletions .github/workflows/test_onnxruntime_training.yml
@@ -0,0 +1,66 @@
+name: ONNX Runtime Training / Python - Test
+
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: 0 7 * * * # every day at 7am UTC
+  pull_request:
+    branches:
+      - main
+    types:
+      - opened
+      - labeled
+      - reopened
+      - unlabeled
+      - synchronize
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  build:
+    if: ${{
+      (github.event_name == 'push') ||
+      (github.event_name == 'workflow_dispatch') ||
+      contains( github.event.pull_request.labels.*.name, 'training') ||
+      contains( github.event.pull_request.labels.*.name, 'onnxruntime-training')
+      }}
+
+    runs-on:
+      group: aws-g6-4xlarge-plus
+
+    container:
+      image: nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04
+      options: --gpus all
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.9"
+
+      - name: Install dependencies
+        env:
+          TORCH_CUDA_ARCH_LIST: "5.0 6.0 7.0 7.5 8.0 8.6 9.0+PTX"
+        run: |
+          pip install --upgrade pip
+          pip install --no-cache-dir "torch<2.6" torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
+          pip install --no-cache-dir torch-ort onnxruntime-training && python -m torch_ort.configure
+          pip install --no-cache-dir evaluate absl-py rouge_score seqeval sacrebleu nltk scikit-learn
+          pip install .[tests,onnxruntime-training]
+
+      - name: Test with pytest (trainer)
+        run: |
+          RUN_SLOW=1 pytest tests/onnxruntime-training/test_trainer.py --durations=0 -vvvv
+        env:
+          HF_DATASETS_TRUST_REMOTE_CODE: 1
+
+      - name: Test with pytest (examples)
+        run: |
+          RUN_SLOW=1 pytest tests/onnxruntime-training/test_examples.py --durations=0 -vvvv
+        env:
+          HF_DATASETS_TRUST_REMOTE_CODE: 1
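python -m torch_ort.configure builds the ONNX Runtime training extensions against the installed torch; the usual smoke test for a working install is wrapping a module in ORTModule, as in this minimal sketch:

import torch
from torch_ort import ORTModule

model = ORTModule(torch.nn.Linear(8, 2).cuda())  # forward/backward now run through onnxruntime-training
loss = model(torch.randn(4, 8, device="cuda")).sum()
loss.backward()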
@@ -333,6 +333,7 @@ def compute_metrics(p):
         token=model_args.token,
         trust_remote_code=model_args.trust_remote_code,
         ignore_mismatched_sizes=model_args.ignore_mismatched_sizes,
+        attn_implementation="eager",
     )
     image_processor = AutoImageProcessor.from_pretrained(
         model_args.image_processor_name or model_args.model_name_or_path,
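The training-example diffs in this commit all pin attn_implementation="eager". ORTModule traces the model to ONNX under the hood, and the eager attention path keeps that export straightforward, whereas sdpa/flash-attention kernels may not trace cleanly (the commit itself does not state the rationale, so this is inferred). The call pattern, with an illustrative model id:

from transformers import AutoModelForCausalLM

# request the plain eager attention implementation instead of sdpa/flash-attention
model = AutoModelForCausalLM.from_pretrained("gpt2", attn_implementation="eager")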
5 changes: 4 additions & 1 deletion examples/onnxruntime/training/language-modeling/run_clm.py
@@ -442,9 +442,12 @@ def main():
             trust_remote_code=model_args.trust_remote_code,
             torch_dtype=torch_dtype,
             low_cpu_mem_usage=model_args.low_cpu_mem_usage,
+            attn_implementation="eager",
         )
     else:
-        model = AutoModelForCausalLM.from_config(config, trust_remote_code=model_args.trust_remote_code)
+        model = AutoModelForCausalLM.from_config(
+            config, trust_remote_code=model_args.trust_remote_code, attn_implementation="eager"
+        )
         n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values())
         logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params")
5 changes: 4 additions & 1 deletion examples/onnxruntime/training/language-modeling/run_mlm.py
@@ -430,10 +430,13 @@ def main():
             token=model_args.token,
             trust_remote_code=model_args.trust_remote_code,
             low_cpu_mem_usage=model_args.low_cpu_mem_usage,
+            attn_implementation="eager",
         )
     else:
         logger.info("Training new model from scratch")
-        model = AutoModelForMaskedLM.from_config(config, trust_remote_code=model_args.trust_remote_code)
+        model = AutoModelForMaskedLM.from_config(
+            config, trust_remote_code=model_args.trust_remote_code, attn_implementation="eager"
+        )

     # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
     # on a small vocab and want a smaller embedding size, remove this test.
1 change: 1 addition & 0 deletions examples/onnxruntime/training/question-answering/run_qa.py
@@ -364,6 +364,7 @@ def main():
         revision=model_args.model_revision,
         token=model_args.token,
         trust_remote_code=model_args.trust_remote_code,
+        attn_implementation="eager",
     )

     # Tokenizer check: this script requires a fast tokenizer.
@@ -458,6 +458,7 @@ def main():
         revision=model_args.model_revision,
         token=model_args.token,
         trust_remote_code=model_args.trust_remote_code,
+        attn_implementation="eager",
     )

     if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)):
@@ -527,6 +527,7 @@ def main():
         token=model_args.token,
         trust_remote_code=model_args.trust_remote_code,
         ignore_mismatched_sizes=model_args.ignore_mismatched_sizes,
+        attn_implementation="eager",
     )
     model.config.pad_token_id = model.config.eos_token_id
@@ -404,6 +404,7 @@ def main():
         token=model_args.token,
         trust_remote_code=model_args.trust_remote_code,
         ignore_mismatched_sizes=model_args.ignore_mismatched_sizes,
+        attn_implementation="eager",
     )

     # Preprocessing the raw_datasets
@@ -405,6 +405,7 @@ def get_label_list(labels):
         token=model_args.token,
         trust_remote_code=model_args.trust_remote_code,
         ignore_mismatched_sizes=model_args.ignore_mismatched_sizes,
+        attn_implementation="eager",
     )

     if tokenizer.pad_token is None:
@@ -408,6 +408,7 @@ def main():
         revision=model_args.model_revision,
         token=model_args.token,
         trust_remote_code=model_args.trust_remote_code,
+        attn_implementation="eager",
     )

     # Set decoder_start_token_id