-
Notifications
You must be signed in to change notification settings - Fork 4.1k
89 lines (81 loc) · 3.48 KB
/
xpu-max1100.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
name: xpu-max1100
on:
workflow_dispatch:
schedule:
- cron: "0 0 * * *"
pull_request:
paths:
- ".github/workflows/xpu-max1100.yml"
- "accelerator/xpu_accelerator.py"
- "accelerator/abstract_accelerator.py"
- "accelerator/cpu_accelerator.py"
- "accelerator/real_accelerator.py"
- "csrc/xpu/**"
- "deepspeed/runtime/engine.py"
- "deepspeed/runtime/bf16_optimizer.py"
- "deepspeed/runtime/zero/stage_1_and_2.py"
- "deepspeed/runtime/zero/stage3.py"
- "deepspeed/runtime/zero/partition_parameters.py"
- "deepspeed/runtime/zero/partitioned_param_coordinator.py"
- "deepspeed/runtime/zero/parameter_offload.py"
- "deepspeed/runtime/pipe/engine.py"
- "deepspeed/runtime/utils.py"
- "op_builder/xpu/**"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
permissions:
contents: read
issues: write
jobs:
unit-tests:
runs-on: [self-hosted, intel, xpu]
container:
image: intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04
ports:
- 80
options: --privileged -it --rm --device /dev/dri:/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --ipc=host --cap-add=ALL
steps:
- uses: actions/checkout@v4
- name: Install prerequisite
run: |
apt-get update
apt-get install clinfo libaio-dev python3-pip -y
pip install torch==2.3.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/torch/
pip install intel-extension-for-pytorch==2.3.110+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/intel-extension-for-pytorch/
pip install oneccl_bind_pt==2.3.100+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/oneccl-bind-pt/
pip install torchvision==0.18.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/torchvision/
pip install py-cpuinfo numpy
pip install .[dev,autotuning]
- name: Check container state
run: |
ldd --version
ds_report
python3 -c "import torch; print('torch:', torch.__version__, torch)"
python3 -c "import torch; import intel_extension_for_pytorch; print('XPU available:', torch.xpu.is_available())"
python3 -c "from deepspeed.accelerator import get_accelerator; print('accelerator:', get_accelerator()._name)"
pip list
- name: Unit tests
run: |
cd tests/unit
pytest --verbose accelerator/*
pytest --verbose autotuning/*
pytest --verbose checkpoint/test_reshape_checkpoint.py
pytest --verbose checkpoint/test_moe_checkpoint.py
pytest --verbose checkpoint/test_shared_weights.py
pytest --verbose launcher/test_ds_arguments.py launcher/test_run.py
pytest --verbose model_parallelism/*
pytest --verbose moe/test_moe_tp.py
pytest --verbose monitor/*
pytest --verbose utils/*
pytest --verbose runtime/test_ds_config_model.py
pytest --verbose runtime/pipe/test_pipe_schedule.py
pytest --verbose runtime/zero/test_zero_config.py
pytest --verbose runtime/zero/test_zero_tiled.py
pytest --verbose runtime/zero/test_zeropp.py
pytest --verbose runtime/test_autocast.py
pytest --verbose runtime/test_data.py
pytest --verbose runtime/test_runtime_utils.py
pytest --verbose runtime/activation_checkpointing/*
pytest --verbose runtime/utils/*
pytest --verbose runtime/zero/test_zero_dynamic_class.py