forked from Lightning-AI/torchmetrics
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgpu-pipeline.yml
120 lines (103 loc) · 4.01 KB
/
gpu-pipeline.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# Python package
# Create and test a Python package on multiple Python versions.
# Add steps that analyze code, save the dist with the build record, publish to a PyPI-compatible index, and more:
# https://docs.microsoft.com/azure/devops/pipelines/languages/python
trigger:
tags:
include:
- '*'
branches:
include:
- master
- release/*
- refs/tags/*
pr:
- master
- release/*
jobs:
- job: pytest
# how long to run the job before automatically cancelling
timeoutInMinutes: "55"
# how much time to give 'run always even if cancelled tasks' before stopping them
cancelTimeoutInMinutes: "2"
pool: azure-gpus-spot
container:
image: "pytorch/pytorch:1.8.1-cuda11.1-cudnn8-runtime"
options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --name ci-container -v /usr/bin/docker:/tmp/docker:ro"
workspace:
clean: all
steps:
- bash: |
whoami
id
lspci | egrep 'VGA|3D'
whereis nvidia
nvidia-smi
python --version
pip --version
pip list
displayName: 'Image info & NVIDIA'
- script: |
/tmp/docker exec -t -u 0 ci-container \
sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -o Dpkg::Options::="--force-confold" -y install sudo"
displayName: 'Install Sudo in container (thanks Microsoft!)'
- bash: |
set -ex
sudo apt-get update -qq --fix-missing
sudo apt-get install -y build-essential gcc g++ cmake ffmpeg git libsndfile1 unzip --no-install-recommends
# python -m pip install "pip==20.1"
pip install --requirement ./requirements/devel.txt --upgrade-strategy only-if-needed
pip install mkl-service==2.4.0 # needed for the gpu multiprocessing
pip list
displayName: 'Install dependencies'
- bash: |
python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu >= 2, f'GPU: {mgpu}'"
displayName: 'Sanity check'
- bash: |
cd tests
# wget is simpler but does not work on Windows
python -c "from urllib.request import urlretrieve ; urlretrieve('https://pl-public-data.s3.amazonaws.com/metrics/data.zip', 'data.zip')"
unzip -o data.zip
ls -l data/*
displayName: 'Pull testing data from S3'
- bash: |
python -m pytest torchmetrics tests -v --cov=torchmetrics --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50
displayName: 'Testing'
- bash: |
python -m coverage report
python -m coverage xml
python -m coverage html
python -m codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) --flags=gpu,pytest --name="GPU-coverage" --env=linux,azure
ls -l
displayName: 'Statistics'
- task: PublishTestResults@2
displayName: 'Publish test results'
inputs:
testResultsFiles: '$(Build.StagingDirectory)/test-results.xml'
testRunTitle: '$(Agent.OS) - $(Build.DefinitionName) - Python $(python.version)'
condition: succeededOrFailed()
# todo: re-enable after schema check pass, also atm it seems does not have any effect
#- task: PublishCodeCoverageResults@1
# displayName: 'Publish coverage report'
# inputs:
# codeCoverageTool: 'Cobertura'
# summaryFileLocation: '$(Build.SourcesDirectory)/coverage.xml'
# reportDirectory: '$(Build.SourcesDirectory)/htmlcov'
# testRunTitle: '$(Agent.OS) - $(Build.BuildNumber)[$(Agent.JobName)] - Python $(python.version)'
# condition: succeededOrFailed()
- bash: |
set -e
pip install .
FILES="tm_examples/*.py"
for fn in $FILES
do
echo "Processing $fn example..."
python $fn
done
pip uninstall -y torchmetrics
displayName: 'Examples'
- bash: |
pip install -r requirements/integrate.txt --quiet --upgrade-strategy only-if-needed
pip uninstall -y torchmetrics
python -m pytest integrations -v --durations=25
displayName: 'Integrations'