[air/benchmark] Torch benchmarks for 4x4 (ray-project#26692)
Add benchmark data for 4x4 GPU setup.

Signed-off-by: Richard Liaw <rliaw@berkeley.edu>

Co-authored-by: Jimmy Yao <jiahaoyao.math@gmail.com>
Co-authored-by: Kai Fricke <kai@anyscale.com>
3 people authored Jul 19, 2022
1 parent 6b6d301 commit 7e62e11
Showing 7 changed files with 180 additions and 40 deletions.
36 changes: 33 additions & 3 deletions doc/source/ray-air/benchmarks.rst
@@ -104,14 +104,16 @@ XGBoost parameters were kept as defaults for xgboost==1.6.1 for this task.


GPU image batch prediction
----------------------------------------------------
--------------------------

This task uses the BatchPredictor module to process different amounts of data
using a pre-trained PyTorch ResNet model. A simplified sketch of the workload
follows the links below.

We test out the performance across different cluster sizes and data sizes.

- `GPU image batch prediction script`_
- `GPU training small cluster configuration`_
- `GPU training large cluster configuration`_

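The rough shape of this workload is shown below. This is a minimal, illustrative
sketch rather than the benchmark code itself: the ``BatchPredictor`` /
``TorchPredictor`` / ``TorchCheckpoint`` usage reflects the Ray AIR API as of this
release, and a small synthetic tensor dataset stands in for the real image data.

.. code-block:: python

    import numpy as np
    from torchvision import models

    import ray
    from ray.train.batch_predictor import BatchPredictor
    from ray.train.torch import TorchCheckpoint, TorchPredictor

    # Synthetic "images" in NCHW layout standing in for the real dataset.
    dataset = ray.data.from_numpy(
        np.random.rand(256, 3, 224, 224).astype(np.float32)
    )

    # Wrap a pre-trained ResNet in a checkpoint and build a batch predictor.
    checkpoint = TorchCheckpoint.from_model(models.resnet18(pretrained=True))
    predictor = BatchPredictor.from_checkpoint(checkpoint, TorchPredictor)

    # One GPU per prediction worker, mirroring the benchmark configuration.
    predictions = predictor.predict(dataset, num_gpus_per_worker=1)
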
.. list-table::

@@ -134,14 +136,16 @@


GPU image training
------------------------
------------------

This task uses the TorchTrainer module to train on different amounts of data
using a PyTorch ResNet model. A simplified sketch of the training setup follows
the links below.

We test out the performance across different cluster sizes and data sizes.

- `GPU image training script`_
- `GPU training small cluster configuration`_
- `GPU training large cluster configuration`_

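At a high level the benchmark drives Ray AIR's ``TorchTrainer`` with a
data-parallel training loop, roughly as sketched below. This is a simplified
illustration, not the benchmark script: a tiny linear model and random tensors
stand in for the ResNet and the image dataset, and the API names reflect Ray AIR
as of this release.

.. code-block:: python

    import torch

    from ray.air import session
    from ray.air.config import ScalingConfig
    from ray.train.torch import TorchTrainer, get_device, prepare_model

    def train_loop_per_worker(config):
        # prepare_model() wraps the model in DistributedDataParallel and moves
        # it to this worker's device.
        model = prepare_model(torch.nn.Linear(8, 1))
        optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
        loss_fn = torch.nn.MSELoss()
        device = get_device()

        for epoch in range(config["num_epochs"]):
            # Random tensors stand in for the real image batches.
            x = torch.randn(64, 8, device=device)
            y = torch.randn(64, 1, device=device)
            loss = loss_fn(model(x), y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            session.report({"epoch": epoch, "loss": float(loss)})

    trainer = TorchTrainer(
        train_loop_per_worker,
        train_loop_config={"num_epochs": 2},
        # e.g. 16 GPU workers for the largest configuration in the table below.
        scaling_config=ScalingConfig(num_workers=16, use_gpu=True),
    )
    result = trainer.fit()
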
.. note::

@@ -169,10 +173,36 @@
- `python pytorch_training_e2e.py --data-size-gb=100 --num-workers=16`


PyTorch Training Parity
-----------------------

This task checks performance parity between native PyTorch Distributed and
Ray Train's distributed TorchTrainer.

We demonstrate that performance is similar between the two frameworks; a
schematic of the native PyTorch baseline is shown after the results table
below.

.. list-table::

* - **Cluster Setup**
- **Dataset**
- **Performance**
- **Command**
* - 4 m5.2xlarge nodes (4 workers)
- FashionMNIST
- 144.75 s (vs. 154.35 s PyTorch)
- `python workloads/torch_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 4 --cpus-per-worker 8`
* - 4 g4dn.12xlarge nodes (16 workers)
- FashionMNIST
- TODO
- `python workloads/torch_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 16 --cpus-per-worker 4 --use-gpu`

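For reference, the "native PyTorch Distributed" side of this comparison is
ordinary ``torch.distributed`` data-parallel (DDP) training. The schematic below
shows the general shape of such a baseline; it is illustrative only (random
tensors instead of FashionMNIST, and it assumes a ``torchrun``-style launcher
that sets the usual ``RANK``/``WORLD_SIZE``/``MASTER_ADDR`` environment
variables). The actual comparison logic lives in ``workloads/torch_benchmark.py``.

.. code-block:: python

    import torch
    import torch.distributed as dist
    from torch.nn.parallel import DistributedDataParallel as DDP

    def main():
        # torchrun (or an equivalent launcher) provides rank/world-size env vars.
        dist.init_process_group(backend="gloo")  # "nccl" for the GPU runs

        model = DDP(torch.nn.Linear(28 * 28, 10))
        optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
        loss_fn = torch.nn.CrossEntropyLoss()

        for _ in range(20):  # --num-epochs 20 in the commands above
            x = torch.randn(64, 28 * 28)   # stands in for FashionMNIST batches
            y = torch.randint(0, 10, (64,))
            loss = loss_fn(model(x), y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        dist.destroy_process_group()

    if __name__ == "__main__":
        main()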

.. _`Bulk Ingest Script`: https://github.com/ray-project/ray/blob/a30bdf9ef34a45f973b589993f7707a763df6ebf/release/air_tests/air_benchmarks/workloads/data_benchmark.py#L25-L40
.. _`Bulk Ingest Cluster Configuration`: https://github.com/ray-project/ray/blob/a30bdf9ef34a45f973b589993f7707a763df6ebf/release/air_tests/air_benchmarks/data_20_nodes.yaml#L6-L15
.. _`XGBoost Training Script`: https://github.com/ray-project/ray/blob/a241e6a0f5a630d6ed5b84cce30c51963834d15b/release/air_tests/air_benchmarks/workloads/xgboost_benchmark.py#L40-L58
.. _`XGBoost Prediction Script`: https://github.com/ray-project/ray/blob/a241e6a0f5a630d6ed5b84cce30c51963834d15b/release/air_tests/air_benchmarks/workloads/xgboost_benchmark.py#L63-L71
.. _`XGBoost Cluster Configuration`: https://github.com/ray-project/ray/blob/a241e6a0f5a630d6ed5b84cce30c51963834d15b/release/air_tests/air_benchmarks/xgboost_compute_tpl.yaml#L6-L24
.. _`GPU image batch prediction script`: https://github.com/ray-project/ray/blob/cec82a1ced631525a4d115e4dc0c283fa4275a7f/release/air_tests/air_benchmarks/workloads/gpu_batch_prediction.py#L18-L49
.. _`GPU image training script`: https://github.com/ray-project/ray/blob/cec82a1ced631525a4d115e4dc0c283fa4275a7f/release/air_tests/air_benchmarks/workloads/pytorch_training_e2e.py#L95-L106
.. _`GPU training small cluster configuration`: https://github.com/ray-project/ray/blob/master/release/air_tests/air_benchmarks/compute_gpu_1.yaml#L6-L24
.. _`GPU training large cluster configuration`: https://github.com/ray-project/ray/blob/master/release/air_tests/air_benchmarks/compute_gpu_16.yaml#L5-L25
15 changes: 15 additions & 0 deletions release/air_tests/air_benchmarks/compute_gpu_2x2.yaml
@@ -0,0 +1,15 @@
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west-2

max_workers: 1

head_node_type:
name: head_node
instance_type: g3.8xlarge

worker_node_types:
- name: worker_node
instance_type: g3.8xlarge
max_workers: 1
min_workers: 1
use_spot: false
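
A g3.8xlarge instance has 2 GPUs, so this new config (one head node plus one
worker node) presumably provides the 2 nodes x 2 GPUs implied by the file name;
that reading is an inference from the instance type, not something stated in the
diff.
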
@@ -5,11 +5,11 @@ max_workers: 3

head_node_type:
name: head_node
instance_type: g4dn.xlarge
instance_type: g4dn.12xlarge

worker_node_types:
- name: worker_node
instance_type: g4dn.xlarge
instance_type: g4dn.12xlarge
max_workers: 3
min_workers: 3
use_spot: false
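
g4dn.12xlarge instances carry 4 GPUs each (g4dn.xlarge carries 1), so this
one-head-plus-three-worker cluster presumably provides the 4 nodes x 4 GPUs
referenced in the commit title; again an inference from the instance types
rather than something stated in the diff.
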
56 changes: 56 additions & 0 deletions release/air_tests/air_benchmarks/workloads/benchmark_util.py
@@ -1,5 +1,8 @@
import os
import socket
import subprocess
from collections import defaultdict
from contextlib import closing
from pathlib import Path

import ray
@@ -101,3 +104,56 @@ def run_fn_on_actors(
for actor in actors:
futures.append(actor.run_fn.remote(fn, *args, **kwargs))
return ray.get(futures)


def get_ip_port_actors(actors: List[ray.actor.ActorHandle]) -> List[str]:
# We need this wrapper to avoid deserialization issues with benchmark_util.py

def get_ip_port():
ip = ray.util.get_node_ip_address()
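        # Bind to port 0 so the OS hands out a free ephemeral port; the socket
        # is closed right away and only (node IP, port) is reported back for the
        # caller to use when wiring up the distributed workers.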
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
s.bind(("localhost", 0))
s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
port = s.getsockname()[1]
return ip, port

return run_fn_on_actors(actors=actors, fn=get_ip_port)


def get_gpu_ids_actors(actors: List[ray.actor.ActorHandle]) -> List[List[int]]:
# We need this wrapper to avoid deserialization issues with benchmark_util.py

def get_gpu_ids():
return ray.get_gpu_ids()

return run_fn_on_actors(actors=actors, fn=get_gpu_ids)


def map_ips_to_gpus(ips: List[str], gpus: List[List[int]]):
assert len(ips) == len(gpus)

map = defaultdict(set)
for ip, gpu in zip(ips, gpus):
map[ip].update(set(gpu))
return {ip: sorted(gpus) for ip, gpus in map.items()}


def set_cuda_visible_devices(
actors: List[ray.actor.ActorHandle],
actor_ips: List[str],
ip_to_gpus: Dict[str, set],
):
assert len(actors) == len(actor_ips)

def set_env(key: str, val: str):
os.environ[key] = val

futures = []
for actor, ip in zip(actors, actor_ips):
assert ip in ip_to_gpus

gpu_str = ",".join([str(device) for device in sorted(ip_to_gpus[ip])])
future = actor.run_fn.remote(set_env, "CUDA_VISIBLE_DEVICES", gpu_str)
futures.append(future)

ray.get(futures)
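
Taken together, these helpers can be wired up roughly as follows (a hypothetical
sketch, not code from this commit; it assumes a list of already-created worker
actors, one per training process):

    # Hypothetical glue code: discover where each actor runs and which GPUs it
    # owns, then pin CUDA_VISIBLE_DEVICES so every actor on a node sees all of
    # that node's GPUs.
    def setup_vanilla_workers(actors):
        ips_ports = get_ip_port_actors(actors)            # [(ip, free_port), ...]
        actor_ips = [ip for ip, _ in ips_ports]
        gpu_ids = get_gpu_ids_actors(actors)              # e.g. [[0], [1], [0], [1]]
        ip_to_gpus = map_ips_to_gpus(actor_ips, gpu_ids)  # {"10.0.0.1": [0, 1], ...}
        set_cuda_visible_devices(actors, actor_ips, ip_to_gpus)
        # Two actors on one 2-GPU node each end up with CUDA_VISIBLE_DEVICES="0,1".
        return [f"{ip}:{port}" for ip, port in ips_ports]
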
15 changes: 2 additions & 13 deletions release/air_tests/air_benchmarks/workloads/tensorflow_benchmark.py
@@ -1,8 +1,6 @@
import json
import os
import socket
import time
from contextlib import closing

import click
import numpy as np
@@ -152,12 +150,11 @@ def train_tf_vanilla(
) -> Tuple[float, float]:
# This function is kicked off by the main() function and subsequently kicks
# off tasks that run train_tf_vanilla_worker() on the worker nodes.
import ray
from benchmark_util import (
upload_file_to_all_nodes,
create_actors_with_resources,
run_commands_on_actors,
run_fn_on_actors,
get_ip_port_actors,
)

path = os.path.abspath(__file__)
@@ -173,15 +170,7 @@
},
)

def get_ip_port():
ip = ray.util.get_node_ip_address()
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
s.bind(("localhost", 0))
s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
port = s.getsockname()[1]
return ip, port

ips_ports = run_fn_on_actors(actors=actors, fn=get_ip_port)
ips_ports = get_ip_port_actors(actors=actors)
ip_port_list = [f"{ip}:{port}" for ip, port in ips_ports]
ip_port_str = ",".join(ip_port_list)
