Merged

32 commits
5d3a943 - test jax trainer (liulehui, Aug 5, 2025)
030e31a - CUDA support (liulehui, Oct 16, 2025)
9681874 - add unit test (liulehui, Oct 29, 2025)
dd877f1 - set JAX_PLATFORMS automatically (liulehui, Oct 16, 2025)
db73a15 - lint (liulehui, Oct 29, 2025)
cf8a942 - try to mock jax distributed (liulehui, Oct 29, 2025)
dcc757d - mock jax distributed (liulehui, Oct 29, 2025)
b651a8a - reset to cpu for vanilla tests (liulehui, Oct 29, 2025)
041d9aa - modify (liulehui, Oct 29, 2025)
8e43fc1 - remove (liulehui, Oct 29, 2025)
5f7eaf3 - experimental gpu jax (liulehui, Oct 30, 2025)
f2d4f48 - fix (liulehui, Oct 30, 2025)
97e4d18 - some fixs (liulehui, Nov 3, 2025)
78ba180 - shutdown gpu jax distributed (liulehui, Nov 21, 2025)
2b92238 - gpu cuda env var (liulehui, Nov 21, 2025)
f728a2b - unit tests (liulehui, Nov 22, 2025)
34c337d - try to install jax[cuda] for gpu unit test (liulehui, Nov 22, 2025)
6baa078 - try to fix cuda jaxlib (liulehui, Nov 22, 2025)
265a860 - try jax 0.4.23 (liulehui, Nov 22, 2025)
81ae1c0 - pin to jax 0.4.23 (liulehui, Nov 22, 2025)
9c19af0 - pin jax to 0.4.23 (liulehui, Nov 22, 2025)
6480128 - try pin to 0.4.20 (liulehui, Nov 22, 2025)
f7443f2 - one more time trying 0.4.20 (liulehui, Nov 22, 2025)
f9f4b66 - fix requirements_compiled.txt (liulehui, Nov 22, 2025)
fa31ac1 - pin to 0.3.27 (liulehui, Nov 22, 2025)
c9dd655 - pin to 0.4.13 (liulehui, Nov 23, 2025)
fe1d722 - remove from train-test-requirements.txt (liulehui, Nov 23, 2025)
e2fbcb7 - remove from train-test-requirements.txt (liulehui, Nov 23, 2025)
190f2c6 - remove duplicate in gpu (liulehui, Nov 23, 2025)
e9420f5 - limit python version (liulehui, Nov 23, 2025)
012e6ba - align with compiled (liulehui, Nov 23, 2025)
ca15115 - fix logging (liulehui, Nov 23, 2025)
16 changes: 16 additions & 0 deletions python/ray/train/v2/BUILD.bazel
@@ -245,6 +245,22 @@ py_test(
    ],
)

py_test(
    name = "test_jax_gpu",
    size = "medium",
    srcs = ["tests/test_jax_gpu.py"],
    env = {"RAY_TRAIN_V2_ENABLED": "1"},
    tags = [
        "exclusive",
        "team:ml",
        "train_v2_gpu",
    ],
    deps = [
        ":conftest",
        "//:ray_lib",
    ],
)

py_test(
    name = "test_lightgbm_trainer",
    size = "small",
39 changes: 34 additions & 5 deletions python/ray/train/v2/jax/config.py
@@ -20,14 +20,20 @@
@dataclass
class JaxConfig(BackendConfig):
    use_tpu: bool = False
    use_gpu: bool = False
Comment on lines 22 to +23

Contributor:

Nit: this JaxConfig has a few redundant params with the ScalingConfig, and both of these are passed through from the scaling config.

Plus, this is a public API that users can modify, so you could end up with ScalingConfig != JaxConfig, which is a bit confusing.

Let's discuss and address this in a follow-up PR. OK to merge for now.

Contributor Author:

sg!

    @property
    def backend_cls(self):
        return _JaxBackend


def _setup_jax_distributed_environment(
    master_addr_with_port: str, num_workers: int, index: int, use_tpu: bool
    master_addr_with_port: str,
    num_workers: int,
    index: int,
    use_tpu: bool,
    use_gpu: bool,
    resources_per_worker: dict,
):
    """Set up distributed Jax training information.

@@ -40,6 +46,9 @@ def _setup_jax_distributed_environment(
        index: Index of this worker.
        use_tpu: Whether to configure for TPU. If True and JAX_PLATFORMS is not
            already set, it will be set to "tpu".
        use_gpu: Whether to configure for GPU. If True and JAX_PLATFORMS is not
            already set, it will be set to "cuda".
        resources_per_worker: The resources per worker.
    """
    # Get JAX_PLATFORMS from environment if already set
    jax_platforms = os.environ.get("JAX_PLATFORMS", "").lower()
@@ -48,12 +57,31 @@
        os.environ["JAX_PLATFORMS"] = "tpu"
        jax_platforms = "tpu"

    # TODO(lehui): Add env vars for JAX on GPU.
    if not jax_platforms and use_gpu:
        os.environ["JAX_PLATFORMS"] = "cuda"
        jax_platforms = "cuda"

    if "cuda" in jax_platforms.split(","):
        num_gpus_per_worker = resources_per_worker.get("GPU", 0)
        os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
            str(i) for i in range(num_gpus_per_worker)
        )

    import jax

    if "tpu" in jax_platforms.split(","):
        jax.distributed.initialize(master_addr_with_port, num_workers, index)
        logger.info("Initialized JAX distributed on TPU.")

    if "cuda" in jax_platforms.split(","):
        if num_gpus_per_worker > 0:
            local_device_ids = list(range(num_gpus_per_worker))
        else:
            local_device_ids = 0
        jax.distributed.initialize(
            master_addr_with_port, num_workers, index, local_device_ids
        )
        logger.info("Initialized JAX distributed on CUDA.")


def _shutdown_jax_distributed():
@@ -72,14 +100,13 @@ def _shutdown_jax_distributed():

class _JaxBackend(Backend):
    def on_start(self, worker_group: WorkerGroup, backend_config: JaxConfig):
        if not backend_config.use_tpu:
        if not backend_config.use_tpu and not backend_config.use_gpu:
            return

        master_addr, master_port = worker_group.execute_single(0, get_address_and_port)
        master_addr_with_port = f"{master_addr}:{master_port}"

        # Set up JAX distributed environment on all workers
        # This sets JAX_PLATFORMS env var and initializes JAX distributed
        setup_futures = []
        for i in range(len(worker_group)):
            setup_futures.append(
@@ -90,13 +117,15 @@ def on_start(self, worker_group: WorkerGroup, backend_config: JaxConfig):
                    num_workers=len(worker_group),
                    index=i,
                    use_tpu=backend_config.use_tpu,
                    use_gpu=backend_config.use_gpu,
                    resources_per_worker=worker_group.get_resources_per_worker(),
                )
            )
        ray.get(setup_futures)

    def on_shutdown(self, worker_group: WorkerGroup, backend_config: JaxConfig):
        """Cleanup JAX distributed resources when shutting down worker group."""
        if not backend_config.use_tpu:
        if not backend_config.use_tpu and not backend_config.use_gpu:
            return

        # Shutdown JAX distributed on all workers
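Note (not part of the diff): the platform-selection precedence that the new code in _setup_jax_distributed_environment implements can be summarized in isolation. A minimal sketch, assuming a resources_per_worker dict shaped like the one the worker group reports (e.g. {"GPU": 1}); the helper name select_jax_platform is hypothetical:

import os


def select_jax_platform(use_tpu: bool, use_gpu: bool, resources_per_worker: dict) -> str:
    # An explicitly set JAX_PLATFORMS env var always wins.
    platform = os.environ.get("JAX_PLATFORMS", "").lower()
    if not platform and use_tpu:
        os.environ["JAX_PLATFORMS"] = platform = "tpu"
    if not platform and use_gpu:
        os.environ["JAX_PLATFORMS"] = platform = "cuda"
    # On CUDA, restrict visible devices to the GPUs reserved for this worker.
    if "cuda" in platform.split(","):
        num_gpus = int(resources_per_worker.get("GPU", 0))
        os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(i) for i in range(num_gpus))
    return platform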
1 change: 1 addition & 0 deletions python/ray/train/v2/jax/jax_trainer.py
@@ -132,6 +132,7 @@ def __init__(
        if not jax_config:
            jax_config = JaxConfig(
                use_tpu=scaling_config.use_tpu,
                use_gpu=scaling_config.use_gpu,
            )
        super(JaxTrainer, self).__init__(
            train_loop_per_worker=train_loop_per_worker,
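With the one-line passthrough above, enabling the GPU backend from user code only requires the scaling config; JaxTrainer then builds a JaxConfig(use_gpu=True) itself. A minimal usage sketch (worker count and the body of the train loop are illustrative; the new test below exercises the same path with assertions):

from ray.train import ScalingConfig
from ray.train.v2.jax import JaxTrainer


def train_loop_per_worker():
    import jax

    # After the backend has called jax.distributed.initialize(), each
    # worker sees the global device mesh, not just its local GPUs.
    print(jax.process_index(), jax.local_device_count(), jax.device_count())


trainer = JaxTrainer(
    train_loop_per_worker,
    scaling_config=ScalingConfig(num_workers=2, use_gpu=True),
)
result = trainer.fit()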
65 changes: 65 additions & 0 deletions python/ray/train/v2/tests/test_jax_gpu.py
@@ -0,0 +1,65 @@
import sys

import pytest

from ray.train import RunConfig, ScalingConfig
from ray.train.v2._internal.constants import (
    HEALTH_CHECK_INTERVAL_S_ENV_VAR,
    is_v2_enabled,
)
from ray.train.v2.jax import JaxTrainer

assert is_v2_enabled()


@pytest.fixture(autouse=True)
def reduce_health_check_interval(monkeypatch):
    monkeypatch.setenv(HEALTH_CHECK_INTERVAL_S_ENV_VAR, "0.2")
    yield


@pytest.mark.skipif(sys.platform == "darwin", reason="JAX GPU not supported on macOS")
def test_jax_distributed_gpu_training(ray_start_4_cpus_2_gpus, tmp_path):
    """Test multi-GPU JAX distributed training.

    This test verifies that JAX distributed initialization works correctly
    across multiple GPU workers and that they can coordinate.
    """

    def train_func():
        import jax

        from ray import train

        # Get JAX distributed info
        devices = jax.devices()
        world_rank = train.get_context().get_world_rank()
        world_size = train.get_context().get_world_size()

        # Verify distributed setup
        assert world_size == 2, f"Expected world size 2, got {world_size}"
        assert world_rank in [0, 1], f"Invalid rank {world_rank}"
        assert len(devices) == 2, f"Expected 2 devices, got {len(devices)}"

        train.report(
            {
                "world_rank": world_rank,
                "world_size": world_size,
                "num_devices": len(devices),
            }
        )

    trainer = JaxTrainer(
        train_func,
        scaling_config=ScalingConfig(num_workers=2, use_gpu=True),
        run_config=RunConfig(storage_path=str(tmp_path)),
    )

    result = trainer.fit()
    assert result.error is None


if __name__ == "__main__":
    import sys

    sys.exit(pytest.main(["-v", "-x", __file__]))
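The test above only checks that every worker sees the global device set. As an illustration of what the distributed initialization enables (not something this PR adds), a train loop could run a cross-worker collective; this sketch assumes one GPU per worker, as in the test fixture:

import jax
import jax.numpy as jnp


def all_reduce_ranks():
    # Each process maps over its local devices; psum with a named axis
    # reduces across all devices on all workers.
    local = jnp.ones((jax.local_device_count(),)) * jax.process_index()
    summed = jax.pmap(lambda x: jax.lax.psum(x, axis_name="i"), axis_name="i")(local)
    # With 2 workers and 1 GPU each, every device ends up holding 0 + 1 = 1.0.
    return summed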
4 changes: 4 additions & 0 deletions python/requirements/ml/dl-cpu-requirements.txt
@@ -27,3 +27,7 @@ torch-spline-conv==1.2.2
torch-geometric==2.5.3

cupy-cuda12x==13.1.0; sys_platform != 'darwin'

# Keep JAX version consistent with dl-gpu-requirements.txt
jax==0.4.13; python_version < '3.12' and sys_platform != 'darwin'
jaxlib==0.4.13; python_version < '3.12' and sys_platform != 'darwin'
4 changes: 4 additions & 0 deletions python/requirements/ml/dl-gpu-requirements.txt
@@ -17,3 +17,7 @@ torch-spline-conv==1.2.2+pt23cu121

cupy-cuda12x==13.1.0; sys_platform != 'darwin'
nixl==0.4.0; sys_platform != 'darwin'

--find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
# Downgrading to JAX 0.4.13 to be compatible with CUDA 12.1
jaxlib==0.4.13+cuda12.cudnn89; python_version < '3.12' and sys_platform != 'darwin'
2 changes: 0 additions & 2 deletions python/requirements/ml/train-test-requirements.txt
@@ -1,6 +1,4 @@
evaluate==0.4.3
mosaicml; python_version < "3.12"
sentencepiece==0.1.96
jax==0.4.25
jaxlib==0.4.25
s3torchconnector==1.4.3
8 changes: 4 additions & 4 deletions python/requirements_compiled.txt
@@ -862,10 +862,10 @@ isoduration==20.11.0
    # via jsonschema
itsdangerous==2.1.2
    # via flask
jax==0.4.25
    # via -r python/requirements/ml/train-test-requirements.txt
jaxlib==0.4.25
    # via -r python/requirements/ml/train-test-requirements.txt
jax==0.4.13 ; python_version < "3.12" and sys_platform != "darwin"
    # via -r python/requirements/ml/dl-cpu-requirements.txt
jaxlib==0.4.13 ; python_version < "3.12" and sys_platform != "darwin"
    # via -r python/requirements/ml/dl-cpu-requirements.txt
jedi==0.19.1
    # via ipython
jinja2==3.1.6