Skip to content

Commit

Permalink
Use container job by default
Browse files Browse the repository at this point in the history
  • Loading branch information
haampie committed May 6, 2020
1 parent 2c49222 commit 3287f10
Show file tree
Hide file tree
Showing 5 changed files with 138 additions and 130 deletions.
171 changes: 41 additions & 130 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
@@ -1,156 +1,67 @@
include:
- remote: 'https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.cscs.yml'

stages:
- build
- test

##
## BUILDS
##

.kubernetes:
variables:
DOCKER_DRIVER: overlay2
DOCKER_HOST: tcp://localhost:2375
DOCKER_TLS_CERTDIR: ""
DOCKER_BUILDKIT: 1
BUILDKIT_PROGRESS: plain

image: docker:stable
only:
- master
- staging
- trying
tags:
- kubernetes
services:
- docker:19.03.1-dind # Important to keep the patch version here!

# Builds a Docker image for the current commit, cpu / gpu
build sanitizer cpu:
.build_common:
extends: .dind
stage: build
only: ['master', 'staging', 'trying']
variables:
GIT_SUBMODULE_STRATEGY: recursive
extends: .kubernetes
stage: build
before_script:
- docker login -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $CI_REGISTRY
- export BUILD_IMAGE=$CI_REGISTRY_IMAGE/build-env-asan:latest
- export IMAGE=$CI_REGISTRY_IMAGE/deploy-cpu-asan:$CI_COMMIT_SHA
script:
- docker build --network=host --cache-from $BUILD_IMAGE --build-arg BUILDKIT_INLINE_CACHE=1 -t $BUILD_IMAGE -f docker/asan/build-env.Dockerfile .
- docker build --network=host --cache-from $BUILD_IMAGE --build-arg BUILDKIT_INLINE_CACHE=1 -t $BUILD_IMAGE -f $BUILD_DOCKERFILE .
- docker push $BUILD_IMAGE
- docker build -t $IMAGE --network=host --build-arg BUILDKIT_INLINE_CACHE=1 --build-arg BUILD_ENV=$BUILD_IMAGE -f docker/asan/deploy.Dockerfile .
- docker push $IMAGE
- docker build -t $DEPLOY_IMAGE --network=host --build-arg BUILDKIT_INLINE_CACHE=1 --build-arg BUILD_ENV=$BUILD_IMAGE -f $DEPLOY_DOCKERFILE .
- docker push $DEPLOY_IMAGE

build cpu:
# Builds a Docker image for the current commit, cpu / gpu
build sanitizer cpu:
extends: .build_common
variables:
GIT_SUBMODULE_STRATEGY: recursive
extends: .kubernetes
stage: build
before_script:
- docker login -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $CI_REGISTRY
- export BUILD_IMAGE=$CI_REGISTRY_IMAGE/build-env-cpu:latest
- export IMAGE=$CI_REGISTRY_IMAGE/deploy-cpu:$CI_COMMIT_SHA
script:
- docker build --network=host --cache-from $BUILD_IMAGE --build-arg BUILDKIT_INLINE_CACHE=1 -t $BUILD_IMAGE -f docker/cpu-release/build-env.Dockerfile .
- docker push $BUILD_IMAGE
- docker build -t $IMAGE --network=host --build-arg BUILDKIT_INLINE_CACHE=1 --build-arg BUILD_ENV=$BUILD_IMAGE -f docker/cpu-release/deploy.Dockerfile .
- docker push $IMAGE
BUILD_DOCKERFILE: docker/asan/build-env.Dockerfile
BUILD_IMAGE: $CI_REGISTRY_IMAGE/build-env-asan:latest
DEPLOY_DOCKERFILE: docker/asan/deploy.Dockerfile
DEPLOY_IMAGE: $CI_REGISTRY_IMAGE/deploy-cpu-asan:$CI_COMMIT_SHA

build gpu:
build cpu:
extends: .build_common
variables:
GIT_SUBMODULE_STRATEGY: recursive
extends: .kubernetes
stage: build
before_script:
- docker login -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $CI_REGISTRY
- export BUILD_IMAGE=$CI_REGISTRY_IMAGE/build-env-gpu:latest
- export IMAGE=$CI_REGISTRY_IMAGE/deploy-gpu:$CI_COMMIT_SHA
script:
- docker build --network=host --cache-from $BUILD_IMAGE --build-arg BUILDKIT_INLINE_CACHE=1 -t $BUILD_IMAGE -f docker/gpu/build-env.Dockerfile .
- docker push $BUILD_IMAGE
- docker build -t $IMAGE --network=host --build-arg BUILDKIT_INLINE_CACHE=1 --build-arg BUILD_ENV=$BUILD_IMAGE -f docker/gpu/deploy.Dockerfile .
- docker push $IMAGE
BUILD_DOCKERFILE: docker/cpu-release/build-env.Dockerfile
BUILD_IMAGE: $CI_REGISTRY_IMAGE/build-env-cpu:latest
DEPLOY_DOCKERFILE: docker/cpu-release/deploy.Dockerfile
DEPLOY_IMAGE: $CI_REGISTRY_IMAGE/deploy-cpu:$CI_COMMIT_SHA

##
## RUNS
##

.daint-common:
build gpu:
extends: .build_common
variables:
GIT_SUBMODULE_STRATEGY: none # no need to clone as we have images
CRAY_CUDA_MPS: 1
COSMA_GPU_MAX_TILE_M: 100
COSMA_GPU_MAX_TILE_N: 100
COSMA_GPU_MAX_TILE_K: 100
only:
- master
- staging
- trying
tags:
- daint
BUILD_DOCKERFILE: docker/gpu/build-env.Dockerfile
BUILD_IMAGE: $CI_REGISTRY_IMAGE/build-env-gpu:latest
DEPLOY_DOCKERFILE: docker/gpu/deploy.Dockerfile
DEPLOY_IMAGE: $CI_REGISTRY_IMAGE/deploy-gpu:$CI_COMMIT_SHA

# Executes the docker images on Daint via Sarus
test sanitizer cpu:
extends: .daint-common
sanitize:
stage: test
before_script:
- module load sarus daint-mc
- export IMAGE=$CI_REGISTRY_IMAGE/deploy-cpu-asan:$CI_COMMIT_SHA
- sarus pull $IMAGE
- salloc --no-shell --job-name=cosma-ci-$CI_JOB_ID -N 2 -n 16 -C mc -p normal
- export JOBID=$(squeue -h --name=cosma-ci-$CI_JOB_ID --format=%A)
script:
- srun --jobid=$JOBID -N 2 -n 16 -C mc -p normal unbuffer sarus run -t --mpi $IMAGE test.pdgemm
- srun --jobid=$JOBID -N 2 -n 16 -C mc -p normal unbuffer sarus run -t --mpi $IMAGE test.multiply
- srun --jobid=$JOBID -N 1 -n 1 -C mc -p normal unbuffer sarus run -t --mpi $IMAGE test.mapper
- srun --jobid=$JOBID -N 1 -n 8 -C mc -p normal unbuffer sarus run -t --mpi $IMAGE test.scalar_matmul
- srun --jobid=$JOBID -N 1 -n 4 -C mc -p normal unbuffer sarus run -t --mpi $IMAGE test.multiply_using_layout
after_script:
- module load sarus
- export IMAGE=$CI_REGISTRY_IMAGE/deploy-cpu-asan:$CI_COMMIT_SHA
- export JOBID=$(squeue -h --name=cosma-ci-$CI_JOB_ID --format=%A)
- scancel $JOBID
only: ['master', 'staging', 'trying']
trigger:
include: /ci/sanitize.yml

test cpu:
extends: .daint-common
cpu test:
stage: test
before_script:
- module load sarus daint-mc
- export IMAGE=$CI_REGISTRY_IMAGE/deploy-cpu:$CI_COMMIT_SHA
- sarus pull $IMAGE
- salloc --no-shell --job-name=cosma-ci-$CI_JOB_ID -N 2 -n 16 -C mc -p normal
- export JOBID=$(squeue -h --name=cosma-ci-$CI_JOB_ID --format=%A)
script:
- srun --jobid=$JOBID -N 2 -n 16 -C mc -p normal unbuffer sarus run -t --mpi $IMAGE test.pdgemm
- srun --jobid=$JOBID -N 2 -n 16 -C mc -p normal unbuffer sarus run -t --mpi $IMAGE test.multiply
- srun --jobid=$JOBID -N 1 -n 1 -C mc -p normal unbuffer sarus run -t --mpi $IMAGE test.mapper
- srun --jobid=$JOBID -N 1 -n 8 -C mc -p normal unbuffer sarus run -t --mpi $IMAGE test.scalar_matmul
- srun --jobid=$JOBID -N 1 -n 4 -C mc -p normal unbuffer sarus run -t --mpi $IMAGE test.multiply_using_layout
after_script:
- module load sarus
- export IMAGE=$CI_REGISTRY_IMAGE/deploy-cpu:$CI_COMMIT_SHA
- export JOBID=$(squeue -h --name=cosma-ci-$CI_JOB_ID --format=%A)
- scancel $JOBID
- sarus rmi $IMAGE
only: ['master', 'staging', 'trying']
trigger:
include: /ci/cpu.yml

test gpu:
extends: .daint-common
gpu test:
stage: test
before_script:
- module load sarus daint-gpu
- export IMAGE=$CI_REGISTRY_IMAGE/deploy-gpu:$CI_COMMIT_SHA
- sarus pull $IMAGE
- salloc --no-shell --job-name=cosma-ci-$CI_JOB_ID -N 2 -n 16 -C gpu -p normal
- export JOBID=$(squeue -h --name=cosma-ci-$CI_JOB_ID --format=%A)
script:
- srun --jobid=$JOBID -N 2 -n 16 -C gpu -p normal unbuffer sarus run -t --mpi $IMAGE test.pdgemm
- srun --jobid=$JOBID -N 2 -n 16 -C gpu -p normal unbuffer sarus run -t --mpi $IMAGE test.multiply
- srun --jobid=$JOBID -N 1 -n 1 -C gpu -p normal unbuffer sarus run -t --mpi $IMAGE test.mapper
- srun --jobid=$JOBID -N 1 -n 8 -C gpu -p normal unbuffer sarus run -t --mpi $IMAGE test.scalar_matmul
- srun --jobid=$JOBID -N 1 -n 4 -C gpu -p normal unbuffer sarus run -t --mpi $IMAGE test.multiply_using_layout
after_script:
- module load sarus
- export IMAGE=$CI_REGISTRY_IMAGE/deploy-gpu:$CI_COMMIT_SHA
- export JOBID=$(squeue -h --name=cosma-ci-$CI_JOB_ID --format=%A)
- scancel $JOBID
- sarus rmi $IMAGE

only: ['master', 'staging', 'trying']
trigger:
include: /ci/gpu.yml
11 changes: 11 additions & 0 deletions ci/cpu.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Child pipeline started by the `cpu test` trigger job in .gitlab-ci.yml.
# The actual test jobs are defined in /ci/test.yml; this file only selects
# the image and the Slurm settings for the plain (non-sanitizer) CPU run.
include:
  - local: /ci/test.yml

# Deploy image that the `build cpu` job pushes for this exact commit.
image: $CI_REGISTRY_IMAGE/deploy-cpu:$CI_COMMIT_SHA

variables:
  # Was "cosma-test-sanitizer-cpu-…", copy-pasted from /ci/sanitize.yml, which
  # labelled this run as the sanitizer run. Presumably consumed by the included
  # CSCS template to name the Slurm allocation -- confirm against the template.
  ALLOCATION_NAME: cosma-test-cpu-$CI_PIPELINE_ID
  SLURM_CONSTRAINT: mc
  SLURM_JOB_NUM_NODES: 2
  SLURM_PARTITION: normal

11 changes: 11 additions & 0 deletions ci/gpu.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Child pipeline started by the `gpu test` trigger job in .gitlab-ci.yml.
# The actual test jobs are defined in /ci/test.yml; this file only selects
# the image and the Slurm settings for the GPU run.
include:
  - local: /ci/test.yml

# Run the deploy image that the `build gpu` job pushes for this exact commit.
# The previous value, build-env-gpu:latest, is the build-environment image
# (passed as BUILD_ENV when the deploy image is built), not the image the
# test jobs are meant to run, and it is not pinned to the commit under test.
image: $CI_REGISTRY_IMAGE/deploy-gpu:$CI_COMMIT_SHA

variables:
  # Was "cosma-test-sanitizer-gpu-…"; this pipeline is not a sanitizer run.
  # Presumably consumed by the included CSCS template to name the Slurm
  # allocation -- confirm against the template.
  ALLOCATION_NAME: cosma-test-gpu-$CI_PIPELINE_ID
  SLURM_CONSTRAINT: gpu
  SLURM_JOB_NUM_NODES: 2
  SLURM_PARTITION: normal

10 changes: 10 additions & 0 deletions ci/sanitize.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Child pipeline started by the `sanitize` trigger job in .gitlab-ci.yml.
# Reuses the shared test jobs from /ci/test.yml and runs them against the
# sanitizer (asan) CPU deploy image built for this commit.
include:
  - local: /ci/test.yml

variables:
  SLURM_PARTITION: normal
  SLURM_CONSTRAINT: mc
  SLURM_JOB_NUM_NODES: '2'
  ALLOCATION_NAME: cosma-test-sanitizer-cpu-$CI_PIPELINE_ID

image: $CI_REGISTRY_IMAGE/deploy-cpu-asan:$CI_COMMIT_SHA
65 changes: 65 additions & 0 deletions ci/test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Shared test pipeline. It is included by /ci/cpu.yml, /ci/gpu.yml and
# /ci/sanitize.yml, which provide the image and the Slurm partition/constraint.
# The hidden jobs extended below (.daint, .daint_alloc, .daint_dealloc) are not
# defined in this file; presumably they come from this remote template.
include:
  - remote: "https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.cscs.yml"

# Order matters: allocate first, run the tests, then clean up.
stages:
  - allocate
  - test
  - cleanup

# Pipeline-wide defaults; individual jobs override some of them.
variables:
  # Runtime settings exported to every job.
  CRAY_CUDA_MPS: '1'
  COSMA_GPU_MAX_TILE_M: '100'
  COSMA_GPU_MAX_TILE_N: '100'
  COSMA_GPU_MAX_TILE_K: '100'
  # Switches whose meaning is defined by the included template, not here.
  SLURM_TIMELIMIT: '15:00'
  USE_MPI: 'YES'
  PULL_IMAGE: 'NO'
  DISABLE_AFTER_SCRIPT: 'YES'

# First stage of the pipeline; the job body comes from the .daint_alloc template.
allocate:
  extends: .daint_alloc
  stage: allocate
  variables:
    # Overrides the pipeline-wide PULL_IMAGE: 'NO' for this job only.
    PULL_IMAGE: 'YES'

# Runs test.pdgemm and then test.multiply on 2 nodes with 16 tasks.
pdgemm and multiply:
  stage: test
  extends: .daint
  # Jobs sharing a resource_group are run one at a time by GitLab.
  resource_group: daint-job
  variables:
    SLURM_NTASKS: '16'
    SLURM_JOB_NUM_NODES: '2'
  script:
    - test.pdgemm
    - test.multiply

# Runs test.mapper as a single task on a single node.
mapper:
  stage: test
  extends: .daint
  # Jobs sharing a resource_group are run one at a time by GitLab.
  resource_group: daint-job
  variables:
    SLURM_NTASKS: '1'
    SLURM_JOB_NUM_NODES: '1'
  script:
    - test.mapper

# Runs test.scalar_matmul with 8 tasks on a single node.
scalar_matmul:
  extends: .daint
  stage: test
  # Jobs sharing a resource_group are run one at a time by GitLab.
  resource_group: daint-job
  # Was `test.mapper` (copy-pasted from the mapper job), so this job re-ran the
  # mapper test and test.scalar_matmul was never executed.
  script: test.scalar_matmul
  variables:
    SLURM_JOB_NUM_NODES: 1
    SLURM_NTASKS: 8

# Runs test.multiply_using_layout with 4 tasks on a single node.
multiply_using_layout:
  extends: .daint
  stage: test
  # Jobs sharing a resource_group are run one at a time by GitLab.
  resource_group: daint-job
  # Was `test.mapper` (copy-pasted from the mapper job), so this job re-ran the
  # mapper test and test.multiply_using_layout was never executed.
  script: test.multiply_using_layout
  variables:
    SLURM_JOB_NUM_NODES: 1
    SLURM_NTASKS: 4

# Last stage of the pipeline; the job body comes from the .daint_dealloc template.
deallocate:
  extends: .daint_dealloc
  stage: cleanup

0 comments on commit 3287f10

Please sign in to comment.