
Commit

[Release] Upgrade instance types for xgboost gpu release tests (ray-project#24002)

In xgboost 1.6, support for older GPU architectures was removed (dmlc/xgboost#7767).

This PR updates the instance types used in our xgboost-ray GPU release tests from Kepler GPUs (p2 instances) to Volta GPUs (p3 instances) so that xgboost-ray can run successfully with xgboost v1.6.
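The Kepler-to-Volta move can be sketched as a compute-capability check. The instance-to-GPU mapping below reflects well-known AWS hardware (p2 carries K80/Kepler, p3 carries V100/Volta); the minimum capability of 5.0 is an assumption about xgboost 1.6's cutoff after dmlc/xgboost#7767, not a value taken from this PR:

```python
# Hand-maintained map of AWS instance types to their GPU and CUDA compute
# capability (well-known hardware facts, not queried from a live machine).
GPU_BY_INSTANCE = {
    "p2.xlarge": ("NVIDIA K80 (Kepler)", 3.7),
    "p3.2xlarge": ("NVIDIA V100 (Volta)", 7.0),
}

# Assumed minimum compute capability for xgboost >= 1.6 after the removal of
# older architectures in dmlc/xgboost#7767; treat this threshold as an
# assumption, not a documented constant.
MIN_COMPUTE_CAPABILITY = 5.0


def supports_xgboost_16(instance_type: str) -> bool:
    """Return True if the instance's GPU meets the assumed minimum."""
    _, capability = GPU_BY_INSTANCE[instance_type]
    return capability >= MIN_COMPUTE_CAPABILITY
```

Under this assumption the old p2.xlarge workers fail the check while the new p3.2xlarge workers pass it, which is why both release-test templates below swap the instance type.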

Closes ray-project#24048
amogkam authored Apr 20, 2022
1 parent 4680de8 commit 47243ac
Showing 4 changed files with 25 additions and 5 deletions.
2 changes: 1 addition & 1 deletion release/ml_user_tests/xgboost/tpl_gpu_small_scaling.yaml
@@ -9,7 +9,7 @@ head_node_type:

 worker_node_types:
 - name: worker_node
-  instance_type: p2.xlarge
+  instance_type: p3.2xlarge
   min_workers: 0
   max_workers: 4
   use_spot: false
14 changes: 12 additions & 2 deletions release/ml_user_tests/xgboost/train_gpu_connect.py
@@ -18,12 +18,22 @@

 addr = os.environ.get("RAY_ADDRESS")
 job_name = os.environ.get("RAY_JOB_NAME", "train_gpu_connect")

-runtime_env = {"env_vars": {"RXGB_PLACEMENT_GROUP_TIMEOUT_S": "1200"}}
+# Manually set NCCL_SOCKET_IFNAME to "ens3" so NCCL training works on
+# anyscale_default_cloud.
+# See https://github.com/pytorch/pytorch/issues/68893 for more details.
+# Passing in runtime_env to ray.init() will also set it for all the
+# workers.
+runtime_env = {
+    "env_vars": {
+        "RXGB_PLACEMENT_GROUP_TIMEOUT_S": "1200",
+        "NCCL_SOCKET_IFNAME": "ens3",
+    }
+}

 if addr.startswith("anyscale://"):
     ray.init(address=addr, job_name=job_name, runtime_env=runtime_env)
 else:
-    ray.init(address="auto")
+    ray.init(address="auto", runtime_env=runtime_env)

 from xgboost_ray import RayParams
 from ray.util.xgboost.release_test_util import train_ray, get_parquet_files
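The change above merges `NCCL_SOCKET_IFNAME` into a `runtime_env` that already carries `RXGB_PLACEMENT_GROUP_TIMEOUT_S` by editing the dict literal in place. The same merge can be expressed with a small helper; `with_env_vars` is a hypothetical name for illustration, not part of Ray's API:

```python
# Hypothetical helper (not part of Ray's API): merge extra env_vars into a
# runtime_env dict without mutating the original or dropping existing entries.
def with_env_vars(runtime_env: dict, extra: dict) -> dict:
    merged = dict(runtime_env)
    merged["env_vars"] = {**runtime_env.get("env_vars", {}), **extra}
    return merged


base = {"env_vars": {"RXGB_PLACEMENT_GROUP_TIMEOUT_S": "1200"}}
runtime_env = with_env_vars(base, {"NCCL_SOCKET_IFNAME": "ens3"})
```

The later entries win on key collisions, so re-running the merge with a new value simply overrides the old one, which matches how a dict literal edit would behave.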
2 changes: 1 addition & 1 deletion release/xgboost_tests/tpl_gpu_small.yaml
@@ -9,7 +9,7 @@ head_node_type:

 worker_node_types:
 - name: worker_node
-  instance_type: p2.xlarge
+  instance_type: p3.2xlarge
   min_workers: 4
   max_workers: 4
   use_spot: false
12 changes: 11 additions & 1 deletion release/xgboost_tests/workloads/train_gpu.py
@@ -29,7 +29,17 @@

 from ray.util.xgboost.release_test_util import train_ray

 if __name__ == "__main__":
-    ray.init(address="auto")
+    # Manually set NCCL_SOCKET_IFNAME to "ens3" so NCCL training works on
+    # anyscale_default_cloud.
+    # See https://github.com/pytorch/pytorch/issues/68893 for more details.
+    # Passing in runtime_env to ray.init() will also set it for all the
+    # workers.
+    runtime_env = {
+        "env_vars": {
+            "NCCL_SOCKET_IFNAME": "ens3",
+        }
+    }
+    ray.init(address="auto", runtime_env=runtime_env)

     ray_params = RayParams(
         elastic_training=False,
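Hard-coding `"ens3"` works because these tests run on anyscale_default_cloud, where that interface name is known; on other hosts the name may differ. A hedged sketch of how a script could guard the pin with only the standard library (`nccl_env_vars` is a hypothetical helper, not something this PR adds):

```python
import socket


def nccl_env_vars(preferred: str = "ens3") -> dict:
    """Pin NCCL_SOCKET_IFNAME only when the preferred network interface
    actually exists on this host; otherwise return no override and let
    NCCL auto-detect. Assumes a POSIX host where socket.if_nameindex()
    is available."""
    names = {name for _, name in socket.if_nameindex()}
    if preferred in names:
        return {"NCCL_SOCKET_IFNAME": preferred}
    return {}
```

The empty-dict fallback keeps the script usable on machines without an `ens3` interface, at the cost of relying on NCCL's own interface detection there.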
