Skip to content

Commit

Permalink
Enable periodic builds for CUDA 11.7 (pytorch#81688)
Browse files Browse the repository at this point in the history
CC @atalman
Pull Request resolved: pytorch#81688
Approved by: https://github.com/atalman
  • Loading branch information
ptrblck authored and pytorchmergebot committed Aug 10, 2022
1 parent b236352 commit b4f7e22
Show file tree
Hide file tree
Showing 5 changed files with 60 additions and 7 deletions.
1 change: 1 addition & 0 deletions .circleci/cimodel/data/dimensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
"102",
"113",
"116",
"117",
]

ROCM_VERSIONS = [
Expand Down
4 changes: 3 additions & 1 deletion .github/scripts/install_nvidia_utils_linux.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@

set -eou pipefail


DISTRIBUTION=$(. /etc/os-release;echo $ID$VERSION_ID) \
DRIVER_FN="NVIDIA-Linux-x86_64-510.60.02.run"
DRIVER_FN="NVIDIA-Linux-x86_64-515.57.run"
YUM_REPO_URL="https://nvidia.github.io/nvidia-docker/${DISTRIBUTION}/nvidia-docker.repo"

install_nvidia_docker2_amzn2() {
Expand All @@ -24,6 +25,7 @@ install_nvidia_driver_amzn2() {
# ensure our kernel install is the same as our underlying kernel,
# groupinstall "Development Tools" has a habit of mismatching kernel headers
sudo yum install -y "kernel-devel-uname-r == $(uname -r)"
sudo modprobe backlight
sudo curl -fsL -o /tmp/nvidia_driver "https://s3.amazonaws.com/ossci-linux/nvidia_driver/$DRIVER_FN"
sudo /bin/bash /tmp/nvidia_driver -s --no-drm || (sudo cat /var/log/nvidia-installer.log && false)
sudo rm -fv /tmp/nvidia_driver
Expand Down
52 changes: 52 additions & 0 deletions .github/workflows/periodic.yml
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,58 @@ jobs:
{ config: "default", shard: 4, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
]}
# Periodic debug build job for CUDA 11.7 (Ubuntu bionic, Python 3.7, gcc7).
# Delegates to the reusable workflow _linux-build.yml, passing the
# build-environment name, the CUDA 11.7 / cuDNN 8 docker image name, and
# build-with-debug: true.
linux-bionic-cuda11_7-py3_7-gcc7-debug-build:
name: linux-bionic-cuda11.7-py3.7-gcc7-debug
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-bionic-cuda11.7-py3.7-gcc7-debug
docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7
build-with-debug: true

# Periodic test job paired with the CUDA 11.7 debug build job.
# Runs only after linux-bionic-cuda11_7-py3_7-gcc7-debug-build (see `needs`)
# and reuses the docker image reported in that job's outputs.
# The test matrix splits the "default" config into 4 shards, each on a
# linux.4xlarge.nvidia.gpu runner.
linux-bionic-cuda11_7-py3_7-gcc7-debug-test:
name: linux-bionic-cuda11.7-py3.7-gcc7-debug
uses: ./.github/workflows/_linux-test.yml
needs: linux-bionic-cuda11_7-py3_7-gcc7-debug-build
with:
build-environment: linux-bionic-cuda11.7-py3.7-gcc7-debug
docker-image: ${{ needs.linux-bionic-cuda11_7-py3_7-gcc7-debug-build.outputs.docker-image }}
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 3, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
]}
# Periodic libtorch build job for CUDA 11.7 (Ubuntu bionic, Python 3.7, gcc7).
# Uses the same reusable _linux-build.yml workflow and docker image name as
# the debug build job, with build-generates-artifacts set to false.
# No test job in this section declares a `needs` on it.
libtorch-linux-bionic-cuda11_7-py3_7-gcc7-build:
name: libtorch-linux-bionic-cuda11.7-py3.7-gcc7
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: libtorch-linux-bionic-cuda11.7-py3.7-gcc7
docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7
build-generates-artifacts: false

# Periodic Windows (VS2019) build job for CUDA 11.7.
# Delegates to the reusable workflow _win-build.yml; cuda-version is quoted
# so that it is passed as the string "11.7" rather than a YAML float.
win-vs2019-cuda11_7-py3-build:
name: win-vs2019-cuda11.7-py3
uses: ./.github/workflows/_win-build.yml
with:
build-environment: win-vs2019-cuda11.7-py3
cuda-version: "11.7"

# Periodic Windows (VS2019) test job for CUDA 11.7.
# Runs only after win-vs2019-cuda11_7-py3-build (see `needs`) and passes the
# same build-environment and cuda-version to the reusable _win-test.yml.
# The test matrix has 2 "default" shards on windows.8xlarge.nvidia.gpu
# runners plus a single "force_on_cpu" shard on a windows.4xlarge runner.
win-vs2019-cuda11_7-py3-test:
name: win-vs2019-cuda11.7-py3
uses: ./.github/workflows/_win-test.yml
needs: win-vs2019-cuda11_7-py3-build
with:
build-environment: win-vs2019-cuda11.7-py3
cuda-version: "11.7"
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 2, runner: "windows.8xlarge.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 2, runner: "windows.8xlarge.nvidia.gpu" },
{ config: "force_on_cpu", shard: 1, num_shards: 1, runner: "windows.4xlarge" },
]}
ios-12-5-1-x86-64-coreml:
name: ios-12-5-1-x86-64-coreml
uses: ./.github/workflows/_ios-build-test.yml
Expand Down
2 changes: 1 addition & 1 deletion test/run_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -820,7 +820,7 @@ def get_selected_tests(options):
options.exclude.extend(DISTRIBUTED_TESTS)

# These tests fail on CUDA 11.6, so they are temporarily disabled. See issue https://github.com/pytorch/pytorch/issues/75375
if torch.version.cuda is not None and LooseVersion(torch.version.cuda) == "11.6":
if torch.version.cuda is not None and LooseVersion(torch.version.cuda) >= "11.6":
options.exclude.extend(["distributions/test_constraints"])

selected_tests = exclude_tests(options.exclude, selected_tests)
Expand Down
8 changes: 3 additions & 5 deletions test/test_linalg.py
Original file line number Diff line number Diff line change
Expand Up @@ -2858,7 +2858,7 @@ def run_test_singular_input(batch_dim, n):
@skipCPUIfNoLapack
@onlyNativeDeviceTypes # TODO: XLA doesn't raise exception
@skipCUDAIfRocm
@skipCUDAVersionIn([(11, 3), (11, 5), (11, 6)]) # https://github.com/pytorch/pytorch/issues/57482
@skipCUDAVersionIn([(11, 3), (11, 6), (11, 7)]) # https://github.com/pytorch/pytorch/issues/57482
@dtypes(*floating_and_complex_types())
def test_inverse_errors_large(self, device, dtype):
# Test batched inverse of singular matrices reports errors without crashing (gh-51930)
Expand Down Expand Up @@ -3450,11 +3450,9 @@ def test_matrix_rank_atol_rtol(self, device, dtype):
result = torch.linalg.matrix_rank(a, atol=tol_value, rtol=tol_value)
self.assertEqual(result, 2) # there are 2 singular values above max(0.81, 1.5*0.81)

# CUDA 11.6 issue failure https://github.com/pytorch/pytorch/issues/75391
@skipCUDAIf(torch.version.cuda is not None
and torch.version.cuda.split(".") == ["11", "6"], "There's a bug in CUDA 11.6")
@skipCUDAIfNoMagma
@skipCPUIfNoLapack
@skipCUDAVersionIn([(11, 6), (11, 7)]) # https://github.com/pytorch/pytorch/issues/75391
@dtypes(*floating_and_complex_types())
def test_matrix_rank_empty(self, device, dtype):
matrix_rank = torch.linalg.matrix_rank
Expand Down Expand Up @@ -4154,7 +4152,7 @@ def test_linalg_solve_triangular(self, device, dtype):
@onlyCUDA
@skipCUDAIfNoMagma # Magma needed for the PLU decomposition
@skipCUDAIfRocm # There is a memory access bug in rocBLAS in the (non-batched) solve_triangular
@skipCUDAVersionIn([(11, 3), (11, 5), (11, 6)]) # Tracked in https://github.com/pytorch/pytorch/issues/70111
@skipCUDAVersionIn([(11, 3), (11, 6), (11, 7)]) # Tracked in https://github.com/pytorch/pytorch/issues/70111
@dtypes(*floating_and_complex_types())
@precisionOverride({torch.float32: 1e-2, torch.complex64: 1e-2,
torch.float64: 1e-8, torch.complex128: 1e-8})
Expand Down

0 comments on commit b4f7e22

Please sign in to comment.