Sync transformers and accelerate versions (#562)

huggingface · May 16, 2024 · d15c130 · d15c130
1 parent 1e7d0f5
commit d15c130
Show file tree

Hide file tree

Showing 37 changed files with 1,215 additions and 1,264 deletions.
diff --git a/.github/workflows/test_trainium_common.yml b/.github/workflows/test_trainium_common.yml
@@ -16,28 +16,47 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
   cancel-in-progress: true
 
-
 jobs:
   optimum-neuron-tests:
-    runs-on: [self-hosted, 1-aws-trn, 8-cpu, ci] # run the job on the newly created runner
+    name: Run common tests on Trainium 1
+    runs-on: [self-hosted, 16-aws-trn, 128-cpu, ci]
     env:
       AWS_REGION: us-east-1
       TESTS_TO_IGNORE_FLAGS: --ignore tests/distributed/ --ignore tests/test_examples.py
     steps:
-      - name: Check AMI
-        run: dpkg -l | grep neuron
+      - name: Install Neuron runtime
+        run: |
+          . /etc/os-release
+          sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null <<EOF
+          deb https://apt.repos.neuron.amazonaws.com ${VERSION_CODENAME} main
+          EOF
+          wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
+          sudo apt-get update -y
+          sudo apt-get install aws-neuronx-tools=2.17.1.0 aws-neuronx-runtime-lib=2.20.22.0-1b3ca6425 aws-neuronx-collectives=2.20.22.0-c101c322e  -y
+          echo "/opt/aws/neuron/bin" >> "$GITHUB_PATH"  # a plain "export PATH" would not outlive this step
       - name: Checkout
         uses: actions/checkout@v2
-      - name: Setup PATH
-        run: echo "/home/ubuntu/.local/bin" >> $GITHUB_PATH
-      - name: Set pip repository pointing to the Neuron repository
-        run: pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
-      - name: Update pip
-        run: pip install -U pip
-      - name: Install Python dependencies
-        run: pip install .[tests,neuronx]
+      - name: Install python dependencies
+        run: |
+          sudo apt install python3.8-venv python3-dev -y
+          python3 -m venv aws_neuron_venv_pytorch
+          source aws_neuron_venv_pytorch/bin/activate
+          python -m pip install -U pip
+          python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
+          python -m pip install .[neuronx,tests]
+      - name: Collect tests on Neuron Cores
+        run: |
+          source aws_neuron_venv_pytorch/bin/activate
+          HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} USE_VENV="false" pytest -m "is_trainium_test" $TESTS_TO_IGNORE_FLAGS tests --collect-only
       - name: Run tests on Neuron cores
         run: |
-          HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} USE_VENV="false" pytest -m "is_trainium_test" $TESTS_TO_IGNORE_FLAGS tests
+          source aws_neuron_venv_pytorch/bin/activate
+          HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} USE_VENV="false" pytest -m "is_trainium_test" $TESTS_TO_IGNORE_FLAGS tests --durations=0
+      - name: Collect staging tests on Neuron Cores
+        run: |
+          source aws_neuron_venv_pytorch/bin/activate
+          HUGGINGFACE_CO_STAGING=1 pytest -m "is_trainium_test and is_staging_test" $TESTS_TO_IGNORE_FLAGS tests -s --collect-only
       - name: Run staging tests on Neuron cores
-        run: HUGGINGFACE_CO_STAGING=1 pytest -m "is_trainium_test and is_staging_test" $TESTS_TO_IGNORE_FLAGS tests -s
+        run: |
+          source aws_neuron_venv_pytorch/bin/activate
+          HUGGINGFACE_CO_STAGING=1 pytest -m "is_trainium_test and is_staging_test" $TESTS_TO_IGNORE_FLAGS tests -s
diff --git a/.github/workflows/test_trainium_distributed.yml b/.github/workflows/test_trainium_distributed.yml
@@ -19,21 +19,39 @@ concurrency:
 
 jobs:
   optimum-neuron-tests:
+    name: Run distributed tests on Trainium 1
     runs-on: [self-hosted, 16-aws-trn, 128-cpu, ci]
     env:
       AWS_REGION: us-east-1
     steps:
-      - name: Check AMI
-        run: dpkg -l | grep neuron
+      - name: Install Neuron runtime
+        run: |
+          . /etc/os-release
+          sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null <<EOF
+          deb https://apt.repos.neuron.amazonaws.com ${VERSION_CODENAME} main
+          EOF
+          wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
+          sudo apt-get update -y
+          sudo apt-get install aws-neuronx-tools=2.17.1.0 aws-neuronx-runtime-lib=2.20.22.0-1b3ca6425 aws-neuronx-collectives=2.20.22.0-c101c322e  -y
+          echo "/opt/aws/neuron/bin" >> "$GITHUB_PATH"  # a plain "export PATH" would not outlive this step
       - name: Checkout
         uses: actions/checkout@v2
       - name: Setup PATH
         run: echo "/home/ubuntu/.local/bin" >> $GITHUB_PATH
-      - name: Set pip repository pointing to the Neuron repository
-        run: pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
-      - name: Install Python dependencies
-        run: pip install .[tests,neuronx]
-      - name: Run tests on Neuron cores
+      - name: Install python dependencies
+        run: |
+          sudo apt install python3.8-venv python3-dev -y
+          python3 -m venv aws_neuron_venv_pytorch
+          source aws_neuron_venv_pytorch/bin/activate
+          python -m pip install -U pip
+          python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
+          python -m pip install .[neuronx,tests]
+      - name: Collect tests on Neuron Cores
+        run: |
+          source aws_neuron_venv_pytorch/bin/activate
+          HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m "is_trainium_test" tests/distributed/ --collect-only
+      - name: Run tests on Neuron Cores
         run: |
-          HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m "is_trainium_test" tests/distributed/ -v --durations=0 -x --ignore tests/distributed/test_training.py
+          source aws_neuron_venv_pytorch/bin/activate
+          HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m "is_trainium_test" tests/distributed/ -v --durations=0 -x
 
diff --git a/.github/workflows/test_trainium_examples.yml b/.github/workflows/test_trainium_examples.yml
@@ -26,76 +26,40 @@ concurrency:
 
 
 jobs:
-  start-runner:
-    name: Start self-hosted EC2 runner
-    runs-on: ubuntu-latest
-    env:
-      AWS_REGION: us-east-1
-      EC2_AMI_ID: ${{ vars.TRAINIUM_AMI_ID }} 
-      EC2_INSTANCE_TYPE: trn1.2xlarge 
-      EC2_SUBNET_ID: subnet-859322b4,subnet-b7533b96,subnet-47cfad21,subnet-a396b2ad,subnet-06576a4b,subnet-df0f6180
-      EC2_SECURITY_GROUP: sg-0bb210cd3ec725a13
-      EC2_IAM_ROLE: optimum-ec2-github-actions-role
-    outputs:
-      label: ${{ steps.start-ec2-runner.outputs.label }}
-      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
-    steps:
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v1
-        with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ env.AWS_REGION }}
-      - name: Start EC2 runner
-        id: start-ec2-runner
-        uses: philschmid/philschmid-ec2-github-runner@main
-        with:
-          mode: start
-          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
-          ec2-image-id: ${{ env.EC2_AMI_ID }}
-          ec2-instance-type: ${{ env.EC2_INSTANCE_TYPE }}
-          subnet-id: ${{ env.EC2_SUBNET_ID }}
-          security-group-id: ${{ env.EC2_SECURITY_GROUP }}
-          iam-role-name: ${{ env.EC2_IAM_ROLE }}
-          aws-resource-tags: > # optional, requires additional permissions
-            [
-              {"Key": "Name", "Value": "ec2-optimum-github-runner"},
-              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
-            ]
   optimum-neuron-tests:
-    needs: start-runner # required to start the main job when the runner is ready
-    runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
+    name: Run example script tests on Trainium 1
+    runs-on: [self-hosted, 1-aws-trn, 8-cpu, ci] # run the job on a self-hosted runner matching these labels
     env:
       AWS_REGION: us-east-1
       RUN_TINY: ${{ github.event.inputs.model_size == "tiny" && "1" || "0" }}
     steps:
+      - name: Install Neuron runtime
+        run: |
+          . /etc/os-release
+          sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null <<EOF
+          deb https://apt.repos.neuron.amazonaws.com ${VERSION_CODENAME} main
+          EOF
+          wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
+          sudo apt-get update -y
+          sudo apt-get install aws-neuronx-tools=2.17.1.0 aws-neuronx-runtime-lib=2.20.22.0-1b3ca6425 aws-neuronx-collectives=2.20.22.0-c101c322e  -y
+          echo "/opt/aws/neuron/bin" >> "$GITHUB_PATH"  # a plain "export PATH" would not outlive this step
       - name: Checkout
         uses: actions/checkout@v2
-      - name: Install Python dependencies
-        run: pip install .[tests,neuronx]
-      - name: Run example tests on Neuron cores
+      - name: Setup PATH
+        run: echo "/home/ubuntu/.local/bin" >> $GITHUB_PATH
+      - name: Install python dependencies
+        run: |
+          sudo apt install python3.8-venv python3-dev -y
+          python3 -m venv aws_neuron_venv_pytorch
+          source aws_neuron_venv_pytorch/bin/activate
+          python -m pip install -U pip
+          python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
+          python -m pip install .[neuronx,tests]
+      - name: Collect example tests on Neuron Cores
         run: |
+          source aws_neuron_venv_pytorch/bin/activate
+          HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} USE_VENV=false COVERAGE=${{ github.event.inputs.priority }} RUN_TINY=$RUN_TINY RUN_SLOW=1 pytest -m "is_trainium_test" tests/test_examples.py --collect-only
+      - name: Run example tests on Neuron Cores
+        run: |
+          source aws_neuron_venv_pytorch/bin/activate
           HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} USE_VENV=false COVERAGE=${{ github.event.inputs.priority }} RUN_TINY=$RUN_TINY RUN_SLOW=1 pytest -m "is_trainium_test" tests/test_examples.py -v
-  stop-runner:
-    name: Stop self-hosted EC2 runner
-    needs:
-      - start-runner
-      - optimum-neuron-tests
-    runs-on: ubuntu-latest
-    env:
-      AWS_REGION: us-east-1
-    if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs
-    steps:
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v1
-        with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ env.AWS_REGION }}
-      - name: Stop EC2 runner
-        uses: philschmid/philschmid-ec2-github-runner@main
-        with:
-          mode: stop
-          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
-          label: ${{ needs.start-runner.outputs.label }}
-          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }} 
diff --git a/optimum/exporters/neuron/model_configs.py b/optimum/exporters/neuron/model_configs.py
@@ -33,7 +33,6 @@
     NormalizedTextConfig,
     is_diffusers_available,
 )
-from ...utils.normalized_config import T5LikeNormalizedTextConfig
 from ..tasks import TasksManager
 from .config import (
     TextAndVisionNeuronConfig,
@@ -505,7 +504,7 @@ class T5DecoderNeuronConfig(TextSeq2SeqNeuronConfig):
     INPUT_ARGS = ("batch_size", "sequence_length", "num_beams")
     MODEL_TYPE = "t5-decoder"
     CUSTOM_MODEL_WRAPPER = T5DecoderWrapper
-    NORMALIZED_CONFIG_CLASS = T5LikeNormalizedTextConfig
+    NORMALIZED_CONFIG_CLASS = NormalizedSeq2SeqConfig
 
     @property
     def is_decoder(self) -> bool:

diff --git a/optimum/neuron/__init__.py b/optimum/neuron/__init__.py
@@ -13,8 +13,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
 from typing import TYPE_CHECKING
 
+from .utils.training_utils import patch_transformers_for_neuron_sdk
+
+
+if not os.environ.get("DISABLE_TRANSFORMERS_PATCHING", False):
+    patch_transformers_for_neuron_sdk()
+
 from transformers.utils import _LazyModule
 
 
@@ -93,11 +100,5 @@
     )
 
 
-import os
-
 from .utils import is_neuron_available, is_neuronx_available, patch_transformers_for_neuron_sdk
 from .version import __version__
-
-
-if not os.environ.get("DISABLE_TRANSFORMERS_PATCHING", False):
-    patch_transformers_for_neuron_sdk()
diff --git a/optimum/neuron/accelerate/accelerator.py b/optimum/neuron/accelerate/accelerator.py
@@ -54,10 +54,10 @@
     ModelParallelismPlugin,
     NeuronDistributedType,
     get_tied_parameters_dict,
-    patch_accelerate_is_tpu_available,
+    patch_accelerate_is_torch_xla_available,
     tie_parameters,
 )
-from .utils.misc import apply_activation_checkpointing, create_patched_finfo
+from .utils.misc import apply_activation_checkpointing, create_patched_finfo, create_patched_save_pretrained
 from .utils.operations import _xla_gather
 
 
@@ -100,14 +100,15 @@ def __init__(
         **kwargs,
     ):
         # Patches accelerate.utils.imports.is_tpu_available to match `is_torch_xla_available`
-        patch_accelerate_is_tpu_available()
+        # TODO: check whether this patch can be removed without breaking anything.
+        patch_accelerate_is_torch_xla_available()
 
         full_kwargs = args_and_kwargs_to_kwargs_only(
             super().__init__, args=args, kwargs=kwargs, include_default_values=True
         )
 
         # There is a check for gradient_accumulation_steps to be equal to 1 when
-        # DistributedType == DistributedType.TPU, so we change that for initialization
+        # DistributedType == DistributedType.XLA, so we change that for initialization
         # and restore it back afterwards.
         num_steps = 1
         gradient_accumulation_plugin = full_kwargs["gradient_accumulation_plugin"]
@@ -327,12 +328,21 @@ def patch_model_for_neuron(
             ),
         )
 
+        if hasattr(model, "save_pretrained"):
+            patching_specs.append(
+                (
+                    "save_pretrained",
+                    DynamicPatch(create_patched_save_pretrained),
+                ),
+            )
+
         prepared_patching_specs = []
         for spec in patching_specs:
             prepared_patching_specs.append((model,) + spec)
 
         model_patcher = ModelPatcher(prepared_patching_specs, ignore_missing_attributes=True)
         model_patcher.patch()
+
         return model
 
     @requires_neuronx_distributed
@@ -494,6 +504,7 @@ def _custom_save_state(
             ["Accelerator", "torch.optim.Optimizer", "PreTrainedModel", Union[str, Path], int], Any
         ],
         output_dir: Optional[str] = None,
+        safe_serialization: bool = True,
         **save_model_func_kwargs: Any,
     ) -> str:
         if self.project_configuration.automatic_checkpoint_naming:
@@ -545,6 +556,9 @@ def _inner(folder):
         # Save the lr schedulers taking care of DeepSpeed nuances
         schedulers = self._schedulers
 
+        # Save the samplers of the dataloaders
+        dataloaders = self._dataloaders
+
         # Setting those to be empty list so that `save_accelerator_state` does not redo the job.
         weights = []
         optimizers = []
@@ -555,10 +569,18 @@ def _inner(folder):
             hook(self._models, weights, output_dir)
 
         save_location = save_accelerator_state(
-            output_dir, weights, optimizers, schedulers, self.state.process_index, self.scaler
+            output_dir,
+            weights,
+            optimizers,
+            schedulers,
+            dataloaders,
+            self.state.process_index,
+            self.scaler,
+            save_on_each_node=self.project_configuration.save_on_each_node,
+            safe_serialization=safe_serialization,
         )
         for i, obj in enumerate(self._custom_objects):
-            save_custom_state(obj, output_dir, i)
+            save_custom_state(obj, output_dir, i, save_on_each_node=self.project_configuration.save_on_each_node)
         self.project_configuration.iteration += 1
         return save_location
 
@@ -580,14 +602,21 @@ def save_optimizer_func(accelerator, optimizer, model, output_dir, i):
             logger.info(f"Parallel model and optimizer saved to the directory {output_dir}")
 
         return self._custom_save_state(
-            save_model_func, save_optimizer_func, output_dir=output_dir, **save_model_func_kwargs
+            save_model_func,
+            save_optimizer_func,
+            output_dir=output_dir,
+            safe_serialization=False,
+            **save_model_func_kwargs,
         )
 
-    @patch_within_function(("accelerate.checkpointing.xm", xm), ignore_missing_attributes=True)
-    def save_state(self, output_dir: Optional[str] = None, **save_model_func_kwargs) -> str:
+    def save_state(
+        self, output_dir: Optional[str] = None, safe_serialization: bool = True, **save_model_func_kwargs
+    ) -> str:
         if self.distributed_type is NeuronDistributedType.MODEL_PARALLELISM:
             return self.save_state_for_mp(output_dir=output_dir, **save_model_func_kwargs)
-        return super().save_state(output_dir=output_dir, **save_model_func_kwargs)
+        return super().save_state(
+            output_dir=output_dir, safe_serialization=safe_serialization, **save_model_func_kwargs
+        )
 
     def gather(self, tensor, out_of_graph: bool = False):
         return _xla_gather(tensor, out_of_graph=out_of_graph)

diff --git a/optimum/neuron/accelerate/optimizer.py b/optimum/neuron/accelerate/optimizer.py
@@ -105,7 +105,7 @@ def step(self, closure=None):
                 # Resetting everything.
                 self.optimizer.grad_clipping = False
                 self.clip_grad_norm_to_perform = None
-            elif self.accelerator_state.distributed_type is DistributedType.TPU:
+            elif self.accelerator_state.distributed_type is DistributedType.XLA:
                 optimizer_args = {"closure": closure} if closure is not None else {}
                 # By default barrier=False, but making sure it's the case here since we use ParalleLoader.
                 xm.optimizer_step(self.optimizer, optimizer_args=optimizer_args, barrier=False)