Skip to content

Commit

Permalink
Merge downstream main into tmp-main-20241118 with conflicts
Browse files Browse the repository at this point in the history
  • Loading branch information
github-actions[bot] committed Nov 18, 2024
2 parents 111932a + d806fa3 commit 015008e
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 0 deletions.
44 changes: 44 additions & 0 deletions .github/workflows/self-push-amd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,11 @@ jobs:
strategy:
matrix:
machine_type: [single-gpu, multi-gpu]
<<<<<<< HEAD
runs-on: [rocm, self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
=======
runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
>>>>>>> origin/upstream_sync
container:
image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now
options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
Expand All @@ -54,7 +58,11 @@ jobs:
strategy:
matrix:
machine_type: [single-gpu, multi-gpu]
<<<<<<< HEAD
runs-on: [rocm, self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
=======
runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
>>>>>>> origin/upstream_sync
container:
image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now
options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
Expand Down Expand Up @@ -159,7 +167,11 @@ jobs:
matrix:
folders: ${{ fromJson(needs.setup_gpu.outputs.matrix) }}
machine_type: [single-gpu, multi-gpu]
<<<<<<< HEAD
runs-on: [rocm, self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
=======
runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
>>>>>>> origin/upstream_sync
container:
image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now
options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
Expand Down Expand Up @@ -261,7 +273,10 @@ jobs:
needs: [
check_runners,
setup_gpu,
<<<<<<< HEAD
run_tests_amdgpu
=======
>>>>>>> origin/upstream_sync
run_models_gpu,
# run_tests_torch_cuda_extensions_single_gpu,
# run_tests_torch_cuda_extensions_multi_gpu
Expand Down Expand Up @@ -311,3 +326,32 @@ jobs:
echo "updated branch = $(git branch --show-current)"
git checkout ${{ env.CI_SHA }}
echo "log = $(git log -n 1)"
<<<<<<< HEAD
=======

- uses: actions/download-artifact@v4
- name: Send message to Slack
env:
CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
CI_SLACK_CHANNEL_ID_AMD: ${{ secrets.CI_SLACK_CHANNEL_ID_AMD }}
CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_AMD }}
ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
CI_EVENT: Push CI (AMD) - ${{ inputs.gpu_flavor }}
CI_TITLE_PUSH: ${{ github.event.head_commit.message }}
CI_TITLE_WORKFLOW_RUN: ${{ github.event.workflow_run.head_commit.message }}
CI_SHA: ${{ env.CI_SHA }}
RUNNER_STATUS: ${{ needs.check_runner_status.result }}
RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
SETUP_STATUS: ${{ needs.setup_gpu.result }}

# We pass `needs.setup_gpu.outputs.matrix` as the argument. `notification_service.py` must convert
# `models/bert` to `models_bert`, because the artifact names use `_` instead of `/`.
run: |
pip install huggingface_hub
pip install slack_sdk
pip show slack_sdk
python utils/notification_service.py "${{ needs.setup_gpu.outputs.matrix }}"
>>>>>>> origin/upstream_sync
7 changes: 7 additions & 0 deletions src/transformers/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1725,12 +1725,16 @@ def _wrap_model(self, model, training=True, dataloader=None):

# train/eval could be run multiple times - if the model is already wrapped, don't wrap it again
if self.accelerator.unwrap_model(model) is not model:
<<<<<<< HEAD
if self.args.ort:
from torch_ort import ORTModule
if type(model) is not ORTModule:
return model
else:
return model
=======
return model
>>>>>>> origin/upstream_sync

# Mixed precision training with apex (torch < 1.6)
if self.use_apex and training:
Expand Down Expand Up @@ -2450,12 +2454,15 @@ def _inner_training_loop(
self._total_loss_scalar += tr_loss.item()
effective_global_step = max(self.state.global_step, 0.001) # Avoid ZeroDivisionError
train_loss = self._total_loss_scalar / effective_global_step
<<<<<<< HEAD

metrics = speed_metrics("train", start_time, num_samples=num_train_samples, num_steps=self.state.max_steps,num_tokens=num_train_tokens,)

total_samples = self.state.global_step*total_train_batch_size if args.max_steps > 0 else num_examples*num_train_epochs
perf_samples = total_samples - self.args.warmup_steps*total_train_batch_size
stable_train_metrics = speed_metrics("stable_train", start_train_stable_time, perf_samples)
=======
>>>>>>> origin/upstream_sync

self.store_flos()
metrics["total_flos"] = self.state.total_flos
Expand Down
3 changes: 3 additions & 0 deletions src/transformers/training_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -568,9 +568,12 @@ class TrainingArguments:
*after* initializing the `TrainingArguments`, else it will not be applied.
</Tip>
<<<<<<< HEAD
ortmodule (:obj:`bool`, `optional`):
Use `ORTModule <https://github.com/microsoft/onnxruntime>`__.
=======
>>>>>>> origin/upstream_sync
accelerator_config (`str`, `dict`, or `AcceleratorConfig`, *optional*):
Config to be used with the internal `Accelerator` implementation. The value is either a location of
accelerator json config file (e.g., `accelerator_config.json`), an already loaded json file as `dict`,
Expand Down
2 changes: 2 additions & 0 deletions unit_test_summary.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Aggregated Unit Test Summary:
180 passed, 1 failed, 180 skipped, 180 deselected, 124 warnings

0 comments on commit 015008e

Please sign in to comment.