Skip to content

Commit

Permalink
Sync transformers and accelerate versions (#562)
Browse files Browse the repository at this point in the history
  • Loading branch information
michaelbenayoun authored May 16, 2024
1 parent 1e7d0f5 commit d15c130
Show file tree
Hide file tree
Showing 37 changed files with 1,215 additions and 1,264 deletions.
47 changes: 33 additions & 14 deletions .github/workflows/test_trainium_common.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,28 +16,47 @@ concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true


jobs:
optimum-neuron-tests:
runs-on: [self-hosted, 1-aws-trn, 8-cpu, ci] # run the job on the newly created runner
name: Run common tests on Trainium 1
runs-on: [self-hosted, 16-aws-trn, 128-cpu, ci]
env:
AWS_REGION: us-east-1
TESTS_TO_IGNORE_FLAGS: --ignore tests/distributed/ --ignore tests/test_examples.py
steps:
- name: Check AMI
run: dpkg -l | grep neuron
- name: Install Neuron runtime
run: |
. /etc/os-release
sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null <<EOF
deb https://apt.repos.neuron.amazonaws.com ${VERSION_CODENAME} main
EOF
wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
sudo apt-get update -y
sudo apt-get install aws-neuronx-tools=2.17.1.0 aws-neuronx-runtime-lib=2.20.22.0-1b3ca6425 aws-neuronx-collectives=2.20.22.0-c101c322e -y
export PATH=/opt/aws/neuron/bin:$PATH
- name: Checkout
uses: actions/checkout@v2
- name: Setup PATH
run: echo "/home/ubuntu/.local/bin" >> $GITHUB_PATH
- name: Set pip repository pointing to the Neuron repository
run: pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
- name: Update pip
run: pip install -U pip
- name: Install Python dependencies
run: pip install .[tests,neuronx]
- name: Install python dependencies
run: |
sudo apt install python3.8-venv python3-dev -y
python3 -m venv aws_neuron_venv_pytorch
source aws_neuron_venv_pytorch/bin/activate
python -m pip install -U pip
python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
python -m pip install .[neuronx,tests]
- name: Collect tests on Neuron Cores
run: |
source aws_neuron_venv_pytorch/bin/activate
HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} USE_VENV="false" pytest -m "is_trainium_test" $TESTS_TO_IGNORE_FLAGS tests --collect-only
- name: Run tests on Neuron cores
run: |
HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} USE_VENV="false" pytest -m "is_trainium_test" $TESTS_TO_IGNORE_FLAGS tests
source aws_neuron_venv_pytorch/bin/activate
HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} USE_VENV="false" pytest -m "is_trainium_test" $TESTS_TO_IGNORE_FLAGS tests --durations=0
- name: Collect staging tests on Neuron Cores
run: |
source aws_neuron_venv_pytorch/bin/activate
HUGGINGFACE_CO_STAGING=1 pytest -m "is_trainium_test and is_staging_test" $TESTS_TO_IGNORE_FLAGS tests -s --collect-only
- name: Run staging tests on Neuron cores
run: HUGGINGFACE_CO_STAGING=1 pytest -m "is_trainium_test and is_staging_test" $TESTS_TO_IGNORE_FLAGS tests -s
run: |
source aws_neuron_venv_pytorch/bin/activate
HUGGINGFACE_CO_STAGING=1 pytest -m "is_trainium_test and is_staging_test" $TESTS_TO_IGNORE_FLAGS tests -s
34 changes: 26 additions & 8 deletions .github/workflows/test_trainium_distributed.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,21 +19,39 @@ concurrency:

jobs:
optimum-neuron-tests:
name: Run distributed tests on Trainium 1
runs-on: [self-hosted, 16-aws-trn, 128-cpu, ci]
env:
AWS_REGION: us-east-1
steps:
- name: Check AMI
run: dpkg -l | grep neuron
- name: Install Neuron runtime
run: |
. /etc/os-release
sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null <<EOF
deb https://apt.repos.neuron.amazonaws.com ${VERSION_CODENAME} main
EOF
wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
sudo apt-get update -y
sudo apt-get install aws-neuronx-tools=2.17.1.0 aws-neuronx-runtime-lib=2.20.22.0-1b3ca6425 aws-neuronx-collectives=2.20.22.0-c101c322e -y
export PATH=/opt/aws/neuron/bin:$PATH
- name: Checkout
uses: actions/checkout@v2
- name: Setup PATH
run: echo "/home/ubuntu/.local/bin" >> $GITHUB_PATH
- name: Set pip repository pointing to the Neuron repository
run: pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
- name: Install Python dependencies
run: pip install .[tests,neuronx]
- name: Run tests on Neuron cores
- name: Install python dependencies
run: |
sudo apt install python3.8-venv python3-dev -y
python3 -m venv aws_neuron_venv_pytorch
source aws_neuron_venv_pytorch/bin/activate
python -m pip install -U pip
python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
python -m pip install .[neuronx,tests]
- name: Collect tests on Neuron Cores
run: |
source aws_neuron_venv_pytorch/bin/activate
HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m "is_trainium_test" tests/distributed/ --collect-only
- name: Run tests on Neuron Cores
run: |
HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m "is_trainium_test" tests/distributed/ -v --durations=0 -x --ignore tests/distributed/test_training.py
source aws_neuron_venv_pytorch/bin/activate
HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m "is_trainium_test" tests/distributed/ -v --durations=0 -x
92 changes: 28 additions & 64 deletions .github/workflows/test_trainium_examples.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,76 +26,40 @@ concurrency:


jobs:
start-runner:
name: Start self-hosted EC2 runner
runs-on: ubuntu-latest
env:
AWS_REGION: us-east-1
EC2_AMI_ID: ${{ vars.TRAINIUM_AMI_ID }}
EC2_INSTANCE_TYPE: trn1.2xlarge
EC2_SUBNET_ID: subnet-859322b4,subnet-b7533b96,subnet-47cfad21,subnet-a396b2ad,subnet-06576a4b,subnet-df0f6180
EC2_SECURITY_GROUP: sg-0bb210cd3ec725a13
EC2_IAM_ROLE: optimum-ec2-github-actions-role
outputs:
label: ${{ steps.start-ec2-runner.outputs.label }}
ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
steps:
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v1
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ env.AWS_REGION }}
- name: Start EC2 runner
id: start-ec2-runner
uses: philschmid/philschmid-ec2-github-runner@main
with:
mode: start
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
ec2-image-id: ${{ env.EC2_AMI_ID }}
ec2-instance-type: ${{ env.EC2_INSTANCE_TYPE }}
subnet-id: ${{ env.EC2_SUBNET_ID }}
security-group-id: ${{ env.EC2_SECURITY_GROUP }}
iam-role-name: ${{ env.EC2_IAM_ROLE }}
aws-resource-tags: > # optional, requires additional permissions
[
{"Key": "Name", "Value": "ec2-optimum-github-runner"},
{"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
]
optimum-neuron-tests:
needs: start-runner # required to start the main job when the runner is ready
runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
name: Run example script tests on Trainium 1
runs-on: [self-hosted, 1-aws-trn, 8-cpu, ci] # run the job on the newly created runner
env:
AWS_REGION: us-east-1
RUN_TINY: ${{ github.event.inputs.model_size == "tiny" && "1" || "0" }}
steps:
- name: Install Neuron runtime
run: |
. /etc/os-release
sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null <<EOF
deb https://apt.repos.neuron.amazonaws.com ${VERSION_CODENAME} main
EOF
wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
sudo apt-get update -y
sudo apt-get install aws-neuronx-tools=2.17.1.0 aws-neuronx-runtime-lib=2.20.22.0-1b3ca6425 aws-neuronx-collectives=2.20.22.0-c101c322e -y
export PATH=/opt/aws/neuron/bin:$PATH
- name: Checkout
uses: actions/checkout@v2
- name: Install Python dependencies
run: pip install .[tests,neuronx]
- name: Run example tests on Neuron cores
- name: Setup PATH
run: echo "/home/ubuntu/.local/bin" >> $GITHUB_PATH
- name: Install python dependencies
run: |
sudo apt install python3.8-venv python3-dev -y
python3 -m venv aws_neuron_venv_pytorch
source aws_neuron_venv_pytorch/bin/activate
python -m pip install -U pip
python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
python -m pip install .[neuronx,tests]
- name: Collect example tests on Neuron Cores
run: |
source aws_neuron_venv_pytorch/bin/activate
HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} USE_VENV=false COVERAGE=${{ github.event.inputs.priority }} RUN_TINY=$RUN_TINY RUN_SLOW=1 pytest -m "is_trainium_test" tests/test_examples.py --collect-only
- name: Run example tests on Neuron Cores
run: |
source aws_neuron_venv_pytorch/bin/activate
HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} USE_VENV=false COVERAGE=${{ github.event.inputs.priority }} RUN_TINY=$RUN_TINY RUN_SLOW=1 pytest -m "is_trainium_test" tests/test_examples.py -v
stop-runner:
name: Stop self-hosted EC2 runner
needs:
- start-runner
- optimum-neuron-tests
runs-on: ubuntu-latest
env:
AWS_REGION: us-east-1
if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs
steps:
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v1
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ env.AWS_REGION }}
- name: Stop EC2 runner
uses: philschmid/philschmid-ec2-github-runner@main
with:
mode: stop
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
label: ${{ needs.start-runner.outputs.label }}
ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
3 changes: 1 addition & 2 deletions optimum/exporters/neuron/model_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@
NormalizedTextConfig,
is_diffusers_available,
)
from ...utils.normalized_config import T5LikeNormalizedTextConfig
from ..tasks import TasksManager
from .config import (
TextAndVisionNeuronConfig,
Expand Down Expand Up @@ -505,7 +504,7 @@ class T5DecoderNeuronConfig(TextSeq2SeqNeuronConfig):
INPUT_ARGS = ("batch_size", "sequence_length", "num_beams")
MODEL_TYPE = "t5-decoder"
CUSTOM_MODEL_WRAPPER = T5DecoderWrapper
NORMALIZED_CONFIG_CLASS = T5LikeNormalizedTextConfig
NORMALIZED_CONFIG_CLASS = NormalizedSeq2SeqConfig

@property
def is_decoder(self) -> bool:
Expand Down
13 changes: 7 additions & 6 deletions optimum/neuron/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from typing import TYPE_CHECKING

from .utils.training_utils import patch_transformers_for_neuron_sdk


if not os.environ.get("DISABLE_TRANSFORMERS_PATCHING", False):
patch_transformers_for_neuron_sdk()

from transformers.utils import _LazyModule


Expand Down Expand Up @@ -93,11 +100,5 @@
)


import os

from .utils import is_neuron_available, is_neuronx_available, patch_transformers_for_neuron_sdk
from .version import __version__


if not os.environ.get("DISABLE_TRANSFORMERS_PATCHING", False):
patch_transformers_for_neuron_sdk()
49 changes: 39 additions & 10 deletions optimum/neuron/accelerate/accelerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,10 @@
ModelParallelismPlugin,
NeuronDistributedType,
get_tied_parameters_dict,
patch_accelerate_is_tpu_available,
patch_accelerate_is_torch_xla_available,
tie_parameters,
)
from .utils.misc import apply_activation_checkpointing, create_patched_finfo
from .utils.misc import apply_activation_checkpointing, create_patched_finfo, create_patched_save_pretrained
from .utils.operations import _xla_gather


Expand Down Expand Up @@ -100,14 +100,15 @@ def __init__(
**kwargs,
):
# Patches accelerate.utils.imports.is_tpu_available to match `is_torch_xla_available`
patch_accelerate_is_tpu_available()
# TODO: check that removing it does not break anything.
patch_accelerate_is_torch_xla_available()

full_kwargs = args_and_kwargs_to_kwargs_only(
super().__init__, args=args, kwargs=kwargs, include_default_values=True
)

# There is a check for gradient_accumulation_steps to be equal to 1 when
# DistributedType == DistributedType.TPU, so we change that for initialization
# DistributedType == DistributedType.XLA, so we change that for initialization
# and restore it back afterwards.
num_steps = 1
gradient_accumulation_plugin = full_kwargs["gradient_accumulation_plugin"]
Expand Down Expand Up @@ -327,12 +328,21 @@ def patch_model_for_neuron(
),
)

if hasattr(model, "save_pretrained"):
patching_specs.append(
(
"save_pretrained",
DynamicPatch(create_patched_save_pretrained),
),
)

prepared_patching_specs = []
for spec in patching_specs:
prepared_patching_specs.append((model,) + spec)

model_patcher = ModelPatcher(prepared_patching_specs, ignore_missing_attributes=True)
model_patcher.patch()

return model

@requires_neuronx_distributed
Expand Down Expand Up @@ -494,6 +504,7 @@ def _custom_save_state(
["Accelerator", "torch.optim.Optimizer", "PreTrainedModel", Union[str, Path], int], Any
],
output_dir: Optional[str] = None,
safe_serialization: bool = True,
**save_model_func_kwargs: Any,
) -> str:
if self.project_configuration.automatic_checkpoint_naming:
Expand Down Expand Up @@ -545,6 +556,9 @@ def _inner(folder):
# Save the lr schedulers taking care of DeepSpeed nuances
schedulers = self._schedulers

# Save the samplers of the dataloaders
dataloaders = self._dataloaders

# Setting those to be empty lists so that `save_accelerator_state` does not redo the job.
weights = []
optimizers = []
Expand All @@ -555,10 +569,18 @@ def _inner(folder):
hook(self._models, weights, output_dir)

save_location = save_accelerator_state(
output_dir, weights, optimizers, schedulers, self.state.process_index, self.scaler
output_dir,
weights,
optimizers,
schedulers,
dataloaders,
self.state.process_index,
self.scaler,
save_on_each_node=self.project_configuration.save_on_each_node,
safe_serialization=safe_serialization,
)
for i, obj in enumerate(self._custom_objects):
save_custom_state(obj, output_dir, i)
save_custom_state(obj, output_dir, i, save_on_each_node=self.project_configuration.save_on_each_node)
self.project_configuration.iteration += 1
return save_location

Expand All @@ -580,14 +602,21 @@ def save_optimizer_func(accelerator, optimizer, model, output_dir, i):
logger.info(f"Parallel model and optimizer saved to the directory {output_dir}")

return self._custom_save_state(
save_model_func, save_optimizer_func, output_dir=output_dir, **save_model_func_kwargs
save_model_func,
save_optimizer_func,
output_dir=output_dir,
safe_serialization=False,
**save_model_func_kwargs,
)

@patch_within_function(("accelerate.checkpointing.xm", xm), ignore_missing_attributes=True)
def save_state(self, output_dir: Optional[str] = None, **save_model_func_kwargs) -> str:
def save_state(
self, output_dir: Optional[str] = None, safe_serialization: bool = True, **save_model_func_kwargs
) -> str:
if self.distributed_type is NeuronDistributedType.MODEL_PARALLELISM:
return self.save_state_for_mp(output_dir=output_dir, **save_model_func_kwargs)
return super().save_state(output_dir=output_dir, **save_model_func_kwargs)
return super().save_state(
output_dir=output_dir, safe_serialization=safe_serialization, **save_model_func_kwargs
)

def gather(self, tensor, out_of_graph: bool = False):
return _xla_gather(tensor, out_of_graph=out_of_graph)
Expand Down
2 changes: 1 addition & 1 deletion optimum/neuron/accelerate/optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ def step(self, closure=None):
# Resetting everything.
self.optimizer.grad_clipping = False
self.clip_grad_norm_to_perform = None
elif self.accelerator_state.distributed_type is DistributedType.TPU:
elif self.accelerator_state.distributed_type is DistributedType.XLA:
optimizer_args = {"closure": closure} if closure is not None else {}
# By default barrier=False, but making sure it's the case here since we use ParallelLoader.
xm.optimizer_step(self.optimizer, optimizer_args=optimizer_args, barrier=False)
Expand Down
Loading

0 comments on commit d15c130

Please sign in to comment.