diff --git a/.github/workflows/self-push-amd-mi300-caller.yml b/.github/workflows/self-push-amd-mi300-caller.yml new file mode 100644 index 000000000000..a8ee4e540ecf --- /dev/null +++ b/.github/workflows/self-push-amd-mi300-caller.yml @@ -0,0 +1,25 @@ +name: Self-hosted runner (AMD mi300 CI caller) + +on: + workflow_run: + workflows: ["Self-hosted runner (push-caller)"] + branches: ["main"] + types: [completed] + push: + branches: + - run_amd_push_ci_caller* + paths: + - "src/**" + - "tests/**" + - ".github/**" + - "templates/**" + - "utils/**" + +jobs: + run_amd_ci: + name: AMD mi300 + if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && (startsWith(github.ref_name, 'run_amd_push_ci_caller') || startsWith(github.ref_name, 'mi300-ci')))) + uses: ./.github/workflows/self-push-amd.yml + with: + gpu_flavor: mi300 + secrets: inherit diff --git a/.github/workflows/self-push-amd.yml b/.github/workflows/self-push-amd.yml index 8705f398b2b5..8d68002e3294 100644 --- a/.github/workflows/self-push-amd.yml +++ b/.github/workflows/self-push-amd.yml @@ -36,7 +36,7 @@ jobs: strategy: matrix: machine_type: [single-gpu, multi-gpu] - runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] container: image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -57,7 +57,7 @@ jobs: strategy: matrix: machine_type: [single-gpu, multi-gpu] - runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] container: image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -155,7 +155,7 @@ jobs: matrix: folders: ${{ fromJson(needs.setup_gpu.outputs.matrix) }} machine_type: [single-gpu, multi-gpu] - runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] container: image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -230,7 +230,7 @@ jobs: - name: Run all non-slow selected tests on GPU working-directory: /transformers run: | - python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports ${{ fromJson(needs.setup_gpu.outputs.test_map)[matrix.folders] }} + python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports ${{ fromJson(needs.setup_gpu.outputs.test_map)[matrix.folders] }} -m "not not_device_test" - name: Failure short reports if: ${{ failure() }} diff --git a/.github/workflows/self-scheduled-amd-mi210-caller.yml b/.github/workflows/self-scheduled-amd-mi210-caller.yml index cdb968901058..6abba6894aaf 100644 --- a/.github/workflows/self-scheduled-amd-mi210-caller.yml +++ b/.github/workflows/self-scheduled-amd-mi210-caller.yml @@ -16,4 +16,5 @@ jobs: uses: ./.github/workflows/self-scheduled-amd.yml with: gpu_flavor: mi210 + slack_report_channel: "#transformers-ci-daily-amd" secrets: inherit diff --git a/.github/workflows/self-scheduled-amd-mi250-caller.yml b/.github/workflows/self-scheduled-amd-mi250-caller.yml index dc7d12f17393..36365d4a67f1 100644 --- a/.github/workflows/self-scheduled-amd-mi250-caller.yml +++ b/.github/workflows/self-scheduled-amd-mi250-caller.yml @@ -16,4 +16,5 @@ jobs: uses: ./.github/workflows/self-scheduled-amd.yml with: gpu_flavor: mi250 + slack_report_channel: "#transformers-ci-daily-amd" secrets: inherit diff --git a/.github/workflows/self-scheduled-amd-mi300-caller.yml b/.github/workflows/self-scheduled-amd-mi300-caller.yml new file mode 100644 index 000000000000..a9e7b934c34b --- /dev/null +++ b/.github/workflows/self-scheduled-amd-mi300-caller.yml @@ -0,0 +1,21 @@ +name: Self-hosted runner (AMD mi300 scheduled CI caller) + +on: + workflow_run: + workflows: ["Self-hosted runner (AMD scheduled CI caller)"] + branches: ["main"] + types: [completed] + push: + branches: + - run_amd_scheduled_ci_caller* + +jobs: + run_amd_ci: + name: AMD mi300 + needs: build-docker-containers + if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && (startsWith(github.ref_name, 'run_amd_push_ci_caller') || startsWith(github.ref_name, 'mi300-ci')))) + uses: ./.github/workflows/self-scheduled-amd.yml + with: + gpu_flavor: mi300 + slack_report_channel: "#transformers-ci-daily-amd" + secrets: inherit diff --git a/.github/workflows/self-scheduled-amd.yml b/.github/workflows/self-scheduled-amd.yml index d2ab90d13318..e9f280f51ab4 100644 --- a/.github/workflows/self-scheduled-amd.yml +++ b/.github/workflows/self-scheduled-amd.yml @@ -34,7 +34,7 @@ jobs: fetch-depth: 2 - name: Check Runner Status - run: python utils/check_self_hosted_runner.py --target_runners hf-amd-mi210-ci-1gpu-1,hf-amd-mi250-ci-1gpu-1 --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }} + run: python utils/check_self_hosted_runner.py --target_runners hf-amd-mi210-ci-1gpu-1,hf-amd-mi250-ci-1gpu-1,hf-amd-mi300-ci-1gpu-1 --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }} check_runners: name: Check Runners @@ -42,7 +42,7 @@ jobs: strategy: matrix: machine_type: [single-gpu, multi-gpu] - runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] container: image: huggingface/transformers-pytorch-amd-gpu options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -63,7 +63,7 @@ jobs: strategy: matrix: machine_type: [single-gpu, multi-gpu] - runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] container: image: huggingface/transformers-pytorch-amd-gpu options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -116,7 +116,7 @@ jobs: matrix: folders: ${{ fromJson(needs.setup.outputs.matrix) }} machine_type: [single-gpu] - runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] container: image: huggingface/transformers-pytorch-amd-gpu options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -162,7 +162,7 @@ jobs: - name: Run all tests on GPU working-directory: /transformers - run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} + run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} -m "not not_device_test" - name: Failure short reports if: ${{ failure() }} @@ -184,7 +184,7 @@ jobs: matrix: folders: ${{ fromJson(needs.setup.outputs.matrix) }} machine_type: [multi-gpu] - runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] container: image: huggingface/transformers-pytorch-amd-gpu options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -230,7 +230,7 @@ jobs: - name: Run all tests on GPU working-directory: /transformers - run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} + run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} -m "not not_device_test" - name: Failure short reports if: ${{ failure() }} @@ -250,7 +250,7 @@ jobs: fail-fast: false matrix: machine_type: [single-gpu] - runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] container: image: huggingface/transformers-pytorch-amd-gpu options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -287,7 +287,7 @@ jobs: working-directory: /transformers run: | pip install -r examples/pytorch/_tests_requirements.txt - python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_examples_gpu_test_reports examples/pytorch + python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_examples_gpu_test_reports examples/pytorch -m "not not_device_test" - name: Failure short reports if: ${{ failure() }} @@ -307,7 +307,7 @@ jobs: fail-fast: false matrix: machine_type: [single-gpu, multi-gpu] - runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] container: image: huggingface/transformers-pytorch-amd-gpu options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -343,7 +343,7 @@ jobs: - name: Run all pipeline tests on GPU working-directory: /transformers run: | - python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports tests/pipelines + python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports tests/pipelines -m "not not_device_test" - name: Failure short reports if: ${{ failure() }} @@ -364,7 +364,7 @@ jobs: matrix: machine_type: [single-gpu, multi-gpu] - runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] needs: setup container: image: huggingface/transformers-pytorch-deepspeed-amd-gpu @@ -400,7 +400,7 @@ jobs: - name: Run all tests on GPU working-directory: /transformers - run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended + run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended -m "not not_device_test" - name: Failure short reports if: ${{ failure() }} diff --git a/docker/transformers-pytorch-amd-gpu/Dockerfile b/docker/transformers-pytorch-amd-gpu/Dockerfile index 0b070c93a64f..da91906d6214 100644 --- a/docker/transformers-pytorch-amd-gpu/Dockerfile +++ b/docker/transformers-pytorch-amd-gpu/Dockerfile @@ -1,24 +1,19 @@ -FROM rocm/dev-ubuntu-20.04:5.6 +FROM rocm/dev-ubuntu-22.04:6.0.2 # rocm/pytorch has no version with 2.1.0 LABEL maintainer="Hugging Face" ARG DEBIAN_FRONTEND=noninteractive -ARG PYTORCH='2.1.0' -ARG TORCH_VISION='0.16.0' -ARG TORCH_AUDIO='2.1.0' -ARG ROCM='5.6' - RUN apt update && \ - apt install -y --no-install-recommends git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-dev python3-pip ffmpeg && \ + apt install -y --no-install-recommends git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-dev python3-pip python3-dev ffmpeg && \ apt clean && \ rm -rf /var/lib/apt/lists/* -RUN python3 -m pip install --no-cache-dir --upgrade pip +RUN python3 -m pip install --no-cache-dir --upgrade pip numpy -RUN python3 -m pip install torch==$PYTORCH torchvision==$TORCH_VISION torchaudio==$TORCH_AUDIO --index-url https://download.pytorch.org/whl/rocm$ROCM +RUN python3 -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.0 -RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools ninja git+https://github.com/facebookresearch/detectron2.git pytesseract "itsdangerous<2.1.0" +RUN python3 -m pip install --no-cache-dir --upgrade importlib-metadata setuptools ninja git+https://github.com/facebookresearch/detectron2.git pytesseract "itsdangerous<2.1.0" ARG REF=main WORKDIR / @@ -35,5 +30,5 @@ RUN python3 -m pip uninstall -y tensorflow flax # this line must be added in order for python to be aware of transformers. RUN cd transformers && python3 setup.py develop -# Remove nvml as it is not compatible with ROCm -RUN python3 -m pip uninstall py3nvml pynvml -y +# Remove nvml as it is not compatible with ROCm. apex is not tested on NVIDIA either. +RUN python3 -m pip uninstall py3nvml pynvml apex -y diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md index de49d4427b56..c8e99c1d43f5 100644 --- a/docs/source/en/perf_infer_gpu_one.md +++ b/docs/source/en/perf_infer_gpu_one.md @@ -94,7 +94,7 @@ We strongly suggest referring to the detailed [installation instructions](https: -FlashAttention-2 is also supported on AMD GPUs and current support is limited to **Instinct MI210** and **Instinct MI250**. We strongly suggest using this [Dockerfile](https://github.com/huggingface/optimum-amd/tree/main/docker/transformers-pytorch-amd-gpu-flash/Dockerfile) to use FlashAttention-2 on AMD GPUs. +FlashAttention-2 is also supported on AMD GPUs and current support is limited to **Instinct MI210**, **Instinct MI250** and **Instinct MI300**. We strongly suggest using this [Dockerfile](https://github.com/huggingface/optimum-amd/tree/main/docker/transformers-pytorch-amd-gpu-flash/Dockerfile) to use FlashAttention-2 on AMD GPUs. diff --git a/src/transformers/integrations/integration_utils.py b/src/transformers/integrations/integration_utils.py index 4543cf9f98b5..39de7d6d326b 100755 --- a/src/transformers/integrations/integration_utils.py +++ b/src/transformers/integrations/integration_utils.py @@ -1545,6 +1545,11 @@ def __init__(self): raise RuntimeError( "CodeCarbonCallback requires `codecarbon` to be installed. Run `pip install codecarbon`." ) + elif torch.version.hip: + raise RuntimeError( + "CodeCarbonCallback requires `codecarbon` package, which is not compatible with AMD ROCm (https://github.com/mlco2/codecarbon/pull/490). When using the Trainer, please specify the `report_to` argument (https://huggingface.co/docs/transformers/v4.39.3/en/main_classes/trainer#transformers.TrainingArguments.report_to) to disable CodeCarbonCallback." + ) + import codecarbon self._codecarbon = codecarbon diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index 6ea2a6674b40..2807c9951aa6 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -1735,6 +1735,13 @@ def __post_init__(self): from .integrations import get_available_reporting_integrations self.report_to = get_available_reporting_integrations() + + if "codecarbon" in self.report_to and torch.version.hip: + logger.warning( + "When using the Trainer, CodeCarbonCallback requires the `codecarbon` package, which is not compatible with AMD ROCm (https://github.com/mlco2/codecarbon/pull/490). Automatically disabling the codecarbon callback. Reference: https://huggingface.co/docs/transformers/v4.39.3/en/main_classes/trainer#transformers.TrainingArguments.report_to." + ) + self.report_to.remove("codecarbon") + elif self.report_to == "none" or self.report_to == ["none"]: self.report_to = [] elif not isinstance(self.report_to, list): diff --git a/tests/extended/test_trainer_ext.py b/tests/extended/test_trainer_ext.py index 4bda892162fd..a35ea1a8e7eb 100644 --- a/tests/extended/test_trainer_ext.py +++ b/tests/extended/test_trainer_ext.py @@ -301,6 +301,7 @@ def run_trainer( --label_smoothing_factor 0.1 --target_lang ro_RO --source_lang en_XX + --report_to none """.split() args_eval = f""" diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index da6dcb2a4b72..c420da4052f1 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -607,7 +607,7 @@ def test_trainer_with_datasets(self): # Base training. Should have the same results as test_reproducible_training model = RegressionModel() - args = TrainingArguments("./regression", learning_rate=0.1) + args = TrainingArguments("./regression", learning_rate=0.1, report_to="none") trainer = Trainer(model, args, train_dataset=train_dataset) trainer.train() self.check_trained_model(trainer.model) @@ -629,7 +629,7 @@ def test_trainer_with_datasets(self): def test_model_init(self): train_dataset = RegressionDataset() - args = TrainingArguments("./regression", learning_rate=0.1) + args = TrainingArguments("./regression", learning_rate=0.1, report_to="none") trainer = Trainer(args=args, train_dataset=train_dataset, model_init=lambda: RegressionModel()) trainer.train() self.check_trained_model(trainer.model) @@ -692,7 +692,7 @@ def test_training_loss(self): def test_custom_optimizer(self): train_dataset = RegressionDataset() - args = TrainingArguments("./regression") + args = TrainingArguments("./regression", report_to="none") model = RegressionModel() optimizer = torch.optim.SGD(model.parameters(), lr=1.0) lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda x: 1.0) @@ -716,6 +716,7 @@ def test_lr_scheduler_kwargs(self): lr_scheduler_kwargs=extra_kwargs, learning_rate=0.2, warmup_steps=num_warmup_steps, + report_to="none", ) trainer = Trainer(model, args, train_dataset=train_dataset) trainer.create_optimizer_and_scheduler(num_training_steps=num_steps) @@ -742,6 +743,7 @@ def test_cosine_with_min_lr_scheduler(self): lr_scheduler_kwargs=extra_kwargs, learning_rate=0.2, warmup_steps=num_warmup_steps, + report_to="none", ) trainer = Trainer(model, args, train_dataset=train_dataset) trainer.create_optimizer_and_scheduler(num_training_steps=num_steps) @@ -762,6 +764,7 @@ def test_reduce_lr_on_plateau_args(self): "./regression", eval_strategy="epoch", metric_for_best_model="eval_loss", + report_to="none", ) model = RegressionModel() optimizer = torch.optim.SGD(model.parameters(), lr=1.0) @@ -796,6 +799,7 @@ def log(self, logs): metric_for_best_model="eval_loss", num_train_epochs=10, learning_rate=0.2, + report_to="none", ) model = RegressionModel() trainer = TrainerWithLRLogs(model, args, train_dataset=train_dataset, eval_dataset=eval_dataset) @@ -828,7 +832,7 @@ def test_adafactor_lr_none(self): from transformers.optimization import Adafactor, AdafactorSchedule train_dataset = RegressionDataset() - args = TrainingArguments("./regression") + args = TrainingArguments("./regression", report_to="none") model = RegressionModel() optimizer = Adafactor(model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None) lr_scheduler = AdafactorSchedule(optimizer) @@ -879,7 +883,7 @@ def test_trainer_works_with_dict(self): train_dataset = RegressionDataset() eval_dataset = RegressionDataset() model = RegressionDictModel() - args = TrainingArguments("./regression") + args = TrainingArguments("./regression", report_to="none") trainer = Trainer(model, args, train_dataset=train_dataset, eval_dataset=eval_dataset) trainer.train() _ = trainer.evaluate() @@ -890,7 +894,7 @@ def test_evaluation_with_keys_to_drop(self): tiny_gpt2 = GPT2LMHeadModel(config) x = torch.randint(0, 100, (128,)) eval_dataset = RepeatDataset(x) - args = TrainingArguments("./test") + args = TrainingArguments("./test", report_to="none") trainer = Trainer(tiny_gpt2, args, eval_dataset=eval_dataset) # By default the past_key_values are removed result = trainer.predict(eval_dataset) @@ -1100,7 +1104,12 @@ def test_neftune(self): # Trainer without inf/nan filter args = TrainingArguments( - "./test", learning_rate=1e-9, logging_steps=5, logging_nan_inf_filter=False, neftune_noise_alpha=0.4 + "./test", + learning_rate=1e-9, + logging_steps=5, + logging_nan_inf_filter=False, + neftune_noise_alpha=0.4, + report_to="none", ) trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset) @@ -1117,7 +1126,12 @@ def test_neftune(self): tiny_gpt2 = GPT2LMHeadModel(config) # Trainer without inf/nan filter args = TrainingArguments( - "./test", learning_rate=1e-9, logging_steps=5, logging_nan_inf_filter=False, neftune_noise_alpha=0.4 + "./test", + learning_rate=1e-9, + logging_steps=5, + logging_nan_inf_filter=False, + neftune_noise_alpha=0.4, + report_to="none", ) trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset) @@ -1143,13 +1157,17 @@ def test_logging_inf_nan_filter(self): train_dataset = RepeatDataset(x) # Trainer without inf/nan filter - args = TrainingArguments("./test", learning_rate=1e9, logging_steps=5, logging_nan_inf_filter=False) + args = TrainingArguments( + "./test", learning_rate=1e9, logging_steps=5, logging_nan_inf_filter=False, report_to="none" + ) trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset) trainer.train() log_history_no_filter = trainer.state.log_history # Trainer with inf/nan filter - args = TrainingArguments("./test", learning_rate=1e9, logging_steps=5, logging_nan_inf_filter=True) + args = TrainingArguments( + "./test", learning_rate=1e9, logging_steps=5, logging_nan_inf_filter=True, report_to="none" + ) trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset) trainer.train() log_history_filter = trainer.state.log_history @@ -1196,11 +1214,16 @@ def test_train_and_eval_dataloaders(self): # tests that we do not require dataloader to have a .dataset attribute def test_dataloader_without_dataset(self): train_dataset = RegressionDataset(length=128) - trainer = CustomDataloaderTrainer( - model=RegressionModel(), train_dataset=train_dataset, eval_dataset=train_dataset - ) - trainer.train() - trainer.evaluate() + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = CustomDataloaderTrainer( + model=RegressionModel(), + train_dataset=train_dataset, + eval_dataset=train_dataset, + args=TrainingArguments(output_dir=tmp_dir, report_to="none"), + ) + + trainer.train() + trainer.evaluate() def test_galore_matched_modules(self): regex_patterns = [r".*.attn.*", r".*.mlp.*"] @@ -1495,7 +1518,9 @@ def test_data_is_not_parallelized_when_model_is_parallel(self): # Make the Trainer believe it's a parallelized model model.is_parallelizable = True model.model_parallel = True - args = TrainingArguments("./regression", per_device_train_batch_size=16, per_device_eval_batch_size=16) + args = TrainingArguments( + "./regression", per_device_train_batch_size=16, per_device_eval_batch_size=16, report_to="none" + ) trainer = Trainer(model, args, train_dataset=RegressionDataset(), eval_dataset=RegressionDataset()) # Check the Trainer was fooled self.assertTrue(trainer.is_model_parallel) @@ -1849,7 +1874,7 @@ def test_predict_with_ipex(self): def test_dynamic_shapes(self): eval_dataset = DynamicShapesDataset(batch_size=self.batch_size) model = RegressionModel(a=2, b=1) - args = TrainingArguments("./regression") + args = TrainingArguments("./regression", report_to="none") trainer = Trainer(model, args, eval_dataset=eval_dataset) # Check evaluation can run to completion @@ -1866,7 +1891,7 @@ def test_dynamic_shapes(self): self.assertTrue(np.all(seen[expected.shape[0] :] == -100)) # Same tests with eval accumulation - args = TrainingArguments("./regression", eval_accumulation_steps=2) + args = TrainingArguments("./regression", eval_accumulation_steps=2, report_to="none") trainer = Trainer(model, args, eval_dataset=eval_dataset) # Check evaluation can run to completion @@ -2984,13 +3009,14 @@ def test_bf16_full_eval(self): def test_no_wd_param_group(self): model = nn.Sequential(TstLayer(128), nn.ModuleList([TstLayer(128), TstLayer(128)])) - trainer = Trainer(model=model) - trainer.create_optimizer_and_scheduler(10) - wd_names = ['0.linear1.weight', '0.linear2.weight', '1.0.linear1.weight', '1.0.linear2.weight', '1.1.linear1.weight', '1.1.linear2.weight'] # fmt: skip - wd_params = [p for n, p in model.named_parameters() if n in wd_names] - no_wd_params = [p for n, p in model.named_parameters() if n not in wd_names] - self.assertListEqual(trainer.optimizer.param_groups[0]["params"], wd_params) - self.assertListEqual(trainer.optimizer.param_groups[1]["params"], no_wd_params) + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = Trainer(model=model, args=TrainingArguments(output_dir=tmp_dir, report_to="none")) + trainer.create_optimizer_and_scheduler(10) + wd_names = ['0.linear1.weight', '0.linear2.weight', '1.0.linear1.weight', '1.0.linear2.weight', '1.1.linear1.weight', '1.1.linear2.weight'] # fmt: skip + wd_params = [p for n, p in model.named_parameters() if n in wd_names] + no_wd_params = [p for n, p in model.named_parameters() if n not in wd_names] + self.assertListEqual(trainer.optimizer.param_groups[0]["params"], wd_params) + self.assertListEqual(trainer.optimizer.param_groups[1]["params"], no_wd_params) @slow @require_torch_multi_accelerator @@ -4134,32 +4160,35 @@ def test_get_num_trainable_parameters(self): # in_features * out_features + bias layer_1 = 128 * 64 + 64 layer_2 = 64 * 32 + 32 - trainer = Trainer(model=model) - self.assertEqual(trainer.get_num_trainable_parameters(), layer_1 + layer_2) - # Freeze the last layer - for param in model[-1].parameters(): - param.requires_grad = False - self.assertEqual(trainer.get_num_trainable_parameters(), layer_1) + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = Trainer(model=model, args=TrainingArguments(output_dir=tmp_dir, report_to="none")) + self.assertEqual(trainer.get_num_trainable_parameters(), layer_1 + layer_2) + # Freeze the last layer + for param in model[-1].parameters(): + param.requires_grad = False + self.assertEqual(trainer.get_num_trainable_parameters(), layer_1) def test_get_learning_rates(self): model = nn.Sequential(nn.Linear(128, 64)) - trainer = Trainer(model=model) - with self.assertRaises(ValueError): - trainer.get_learning_rates() - trainer.create_optimizer() - self.assertEqual(trainer.get_learning_rates(), [5e-05, 5e-05]) + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = Trainer(model=model, args=TrainingArguments(output_dir=tmp_dir, report_to="none")) + with self.assertRaises(ValueError): + trainer.get_learning_rates() + trainer.create_optimizer() + self.assertEqual(trainer.get_learning_rates(), [5e-05, 5e-05]) def test_get_optimizer_group(self): model = nn.Sequential(nn.Linear(128, 64)) - trainer = Trainer(model=model) - # ValueError is raised if optimizer is None - with self.assertRaises(ValueError): - trainer.get_optimizer_group() - trainer.create_optimizer() - # Get groups - num_groups = len(trainer.get_optimizer_group()) - self.assertEqual(num_groups, 2) - # Get group of parameter - param = next(model.parameters()) - group = trainer.get_optimizer_group(param) - self.assertIn(param, group["params"]) + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = Trainer(model=model, args=TrainingArguments(output_dir=tmp_dir, report_to="none")) + # ValueError is raised if optimizer is None + with self.assertRaises(ValueError): + trainer.get_optimizer_group() + trainer.create_optimizer() + # Get groups + num_groups = len(trainer.get_optimizer_group()) + self.assertEqual(num_groups, 2) + # Get group of parameter + param = next(model.parameters()) + group = trainer.get_optimizer_group(param) + self.assertIn(param, group["params"]) diff --git a/tests/trainer/test_trainer_distributed.py b/tests/trainer/test_trainer_distributed.py index 8f867cf0beba..968f800174a6 100644 --- a/tests/trainer/test_trainer_distributed.py +++ b/tests/trainer/test_trainer_distributed.py @@ -153,7 +153,7 @@ def test_trainer(self): {self.test_file_dir}/test_trainer_distributed.py """.split() output_dir = self.get_auto_remove_tmp_dir() - args = f"--output_dir {output_dir}".split() + args = f"--output_dir {output_dir} --report_to none".split() cmd = ["torchrun"] + distributed_args + args execute_subprocess_async(cmd, env=self.get_env()) # successful return here == success - any errors would have caused an error in the sub-call diff --git a/tests/trainer/test_trainer_seq2seq.py b/tests/trainer/test_trainer_seq2seq.py index d8722c67836f..61d2163b9e81 100644 --- a/tests/trainer/test_trainer_seq2seq.py +++ b/tests/trainer/test_trainer_seq2seq.py @@ -119,6 +119,7 @@ def _compute_metrics(pred): warmup_steps=0, eval_steps=2, logging_steps=2, + report_to="none", ) # instantiate trainer @@ -152,7 +153,7 @@ def test_return_sequences(self): "google-t5/t5-small", max_length=None, min_length=None, max_new_tokens=256, min_new_tokens=1, num_beams=5 ) - training_args = Seq2SeqTrainingArguments(".", predict_with_generate=True) + training_args = Seq2SeqTrainingArguments(".", predict_with_generate=True, report_to="none") trainer = Seq2SeqTrainer( model=model, @@ -160,6 +161,7 @@ def test_return_sequences(self): tokenizer=tokenizer, data_collator=data_collator, compute_metrics=lambda x: {"samples": x[0].shape[0]}, + report_to="none", ) def prepare_data(examples): @@ -191,7 +193,9 @@ def test_bad_generation_config_fail_early(self): data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt", padding="longest") gen_config = GenerationConfig(do_sample=False, top_p=0.9) # bad: top_p is not compatible with do_sample=False - training_args = Seq2SeqTrainingArguments(".", predict_with_generate=True, generation_config=gen_config) + training_args = Seq2SeqTrainingArguments( + ".", predict_with_generate=True, generation_config=gen_config, report_to="none" + ) with self.assertRaises(ValueError) as exc: _ = Seq2SeqTrainer( model=model,