Enable periodic builds for CUDA 11.7 (pytorch#81688)

CC @atalman

Pull Request resolved: pytorch#81688
Approved by: https://github.com/atalman
Mookel · Aug 10, 2022 · b4f7e22
1 parent b236352
commit b4f7e22
Show file tree

Hide file tree

Showing 5 changed files with 60 additions and 7 deletions.
diff --git a/.circleci/cimodel/data/dimensions.py b/.circleci/cimodel/data/dimensions.py
@@ -4,6 +4,7 @@
     "102",
     "113",
     "116",
+    "117",
 ]
 
 ROCM_VERSIONS = [

diff --git a/.github/scripts/install_nvidia_utils_linux.sh b/.github/scripts/install_nvidia_utils_linux.sh
@@ -2,8 +2,9 @@
 
 set -eou pipefail
 
+
 DISTRIBUTION=$(. /etc/os-release;echo $ID$VERSION_ID) \
-DRIVER_FN="NVIDIA-Linux-x86_64-510.60.02.run"
+DRIVER_FN="NVIDIA-Linux-x86_64-515.57.run"
 YUM_REPO_URL="https://nvidia.github.io/nvidia-docker/${DISTRIBUTION}/nvidia-docker.repo"
 
 install_nvidia_docker2_amzn2() {
@@ -24,6 +25,7 @@ install_nvidia_driver_amzn2() {
         # ensure our kernel install is the same as our underlying kernel,
         # groupinstall "Development Tools" has a habit of mismatching kernel headers
         sudo yum install -y "kernel-devel-uname-r == $(uname -r)"
+        sudo modprobe backlight
         sudo curl -fsL -o /tmp/nvidia_driver "https://s3.amazonaws.com/ossci-linux/nvidia_driver/$DRIVER_FN"
         sudo /bin/bash /tmp/nvidia_driver -s --no-drm || (sudo cat /var/log/nvidia-installer.log && false)
         sudo rm -fv /tmp/nvidia_driver

diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml
@@ -120,6 +120,58 @@ jobs:
           { config: "default", shard: 4, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
         ]}
 
+  linux-bionic-cuda11_7-py3_7-gcc7-debug-build:
+    name: linux-bionic-cuda11.7-py3.7-gcc7-debug
+    uses: ./.github/workflows/_linux-build.yml
+    with:
+      build-environment: linux-bionic-cuda11.7-py3.7-gcc7-debug
+      docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7
+      build-with-debug: true
+
+  linux-bionic-cuda11_7-py3_7-gcc7-debug-test:
+    name: linux-bionic-cuda11.7-py3.7-gcc7-debug
+    uses: ./.github/workflows/_linux-test.yml
+    needs: linux-bionic-cuda11_7-py3_7-gcc7-debug-build
+    with:
+      build-environment: linux-bionic-cuda11.7-py3.7-gcc7-debug
+      docker-image: ${{ needs.linux-bionic-cuda11_7-py3_7-gcc7-debug-build.outputs.docker-image }}
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 2, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 3, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 4, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
+        ]}
+
+  libtorch-linux-bionic-cuda11_7-py3_7-gcc7-build:
+    name: libtorch-linux-bionic-cuda11.7-py3.7-gcc7
+    uses: ./.github/workflows/_linux-build.yml
+    with:
+      build-environment: libtorch-linux-bionic-cuda11.7-py3.7-gcc7
+      docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7
+      build-generates-artifacts: false
+
+  win-vs2019-cuda11_7-py3-build:
+    name: win-vs2019-cuda11.7-py3
+    uses: ./.github/workflows/_win-build.yml
+    with:
+      build-environment: win-vs2019-cuda11.7-py3
+      cuda-version: "11.7"
+
+  win-vs2019-cuda11_7-py3-test:
+    name: win-vs2019-cuda11.7-py3
+    uses: ./.github/workflows/_win-test.yml
+    needs: win-vs2019-cuda11_7-py3-build
+    with:
+      build-environment: win-vs2019-cuda11.7-py3
+      cuda-version: "11.7"
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 2, runner: "windows.8xlarge.nvidia.gpu" },
+          { config: "default", shard: 2, num_shards: 2, runner: "windows.8xlarge.nvidia.gpu" },
+          { config: "force_on_cpu", shard: 1, num_shards: 1, runner: "windows.4xlarge" },
+        ]}
+
   ios-12-5-1-x86-64-coreml:
     name: ios-12-5-1-x86-64-coreml
     uses: ./.github/workflows/_ios-build-test.yml

diff --git a/test/run_test.py b/test/run_test.py
@@ -820,7 +820,7 @@ def get_selected_tests(options):
         options.exclude.extend(DISTRIBUTED_TESTS)
 
     # these tests failing in CUDA 11.6 temporary disabling. issue https://github.com/pytorch/pytorch/issues/75375
-    if torch.version.cuda is not None and LooseVersion(torch.version.cuda) == "11.6":
+    if torch.version.cuda is not None and LooseVersion(torch.version.cuda) >= "11.6":
         options.exclude.extend(["distributions/test_constraints"])
 
     selected_tests = exclude_tests(options.exclude, selected_tests)

diff --git a/test/test_linalg.py b/test/test_linalg.py
@@ -2858,7 +2858,7 @@ def run_test_singular_input(batch_dim, n):
     @skipCPUIfNoLapack
     @onlyNativeDeviceTypes   # TODO: XLA doesn't raise exception
     @skipCUDAIfRocm
-    @skipCUDAVersionIn([(11, 3), (11, 5), (11, 6)])  # https://github.com/pytorch/pytorch/issues/57482
+    @skipCUDAVersionIn([(11, 3), (11, 6), (11, 7)])  # https://github.com/pytorch/pytorch/issues/57482
     @dtypes(*floating_and_complex_types())
     def test_inverse_errors_large(self, device, dtype):
         # Test batched inverse of singular matrices reports errors without crashing (gh-51930)
@@ -3450,11 +3450,9 @@ def test_matrix_rank_atol_rtol(self, device, dtype):
             result = torch.linalg.matrix_rank(a, atol=tol_value, rtol=tol_value)
             self.assertEqual(result, 2)  # there are 2 singular values above max(0.81, 1.5*0.81)
 
-    # CUDA 11.6 issue failure https://github.com/pytorch/pytorch/issues/75391
-    @skipCUDAIf(torch.version.cuda is not None
-                and torch.version.cuda.split(".") == ["11", "6"], "There's a bug in CUDA 11.6")
     @skipCUDAIfNoMagma
     @skipCPUIfNoLapack
+    @skipCUDAVersionIn([(11, 6), (11, 7)])  # https://github.com/pytorch/pytorch/issues/75391
     @dtypes(*floating_and_complex_types())
     def test_matrix_rank_empty(self, device, dtype):
         matrix_rank = torch.linalg.matrix_rank
@@ -4154,7 +4152,7 @@ def test_linalg_solve_triangular(self, device, dtype):
     @onlyCUDA
     @skipCUDAIfNoMagma  # Magma needed for the PLU decomposition
     @skipCUDAIfRocm  # There is a memory access bug in rocBLAS in the (non-batched) solve_triangular
-    @skipCUDAVersionIn([(11, 3), (11, 5), (11, 6)])  # Tracked in https://github.com/pytorch/pytorch/issues/70111
+    @skipCUDAVersionIn([(11, 3), (11, 6), (11, 7)])  # Tracked in https://github.com/pytorch/pytorch/issues/70111
     @dtypes(*floating_and_complex_types())
     @precisionOverride({torch.float32: 1e-2, torch.complex64: 1e-2,
                         torch.float64: 1e-8, torch.complex128: 1e-8})
-Original file line number
+Diff line change
@@ Expand Up / @@ -4,6 +4,7 @@ @@
         "102",
         "113",
         "116",
+        "117",
     ]
     ROCM_VERSIONS = [
@@ Expand Down @@