@@ -13,6 +13,7 @@ def __init__(
         optimizer_class: Type[Optimizer] = torch.optim.AdamW,
         *,
         offload_gradients: bool = False,
+        device: str = "cuda",
         **kwargs,
     ) -> None:
         """Offload optimizer to CPU for single-GPU training. This will reduce GPU memory by the size of optimizer state.
@@ -22,6 +23,7 @@ def __init__(
             params: a list of parameters or parameter groups.
             optimizer_class: constructor of the base optimizer. Defaults to :class:`torch.optim.AdamW`.
             offload_gradients: free GPU gradients once they are moved to CPU. Not compatible with gradient accumulation.
+            device: GPU device type. Choose from "cuda" and "xpu". Defaults to "cuda".
         """
         # default to fused CPU AdamW
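
For reference, a minimal construction sketch using the new argument. The class name and import path do not appear in this diff and are assumed here to be torchao's CPUOffloadOptimizer:

import torch
from torchao.prototype.low_bit_optim import CPUOffloadOptimizer  # assumed import path

# parameters live on the accelerator; optimizer state lives in pinned CPU memory
model = torch.nn.Linear(1024, 1024, device="xpu")
optim = CPUOffloadOptimizer(
    model.parameters(),
    torch.optim.AdamW,        # base optimizer, constructed per parameter on CPU
    offload_gradients=True,   # free accelerator grads once the D2H copy finishes
    device="xpu",             # new argument: "cuda" (default) or "xpu"
    lr=1e-4,
)
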
@@ -38,51 +40,60 @@ def __init__(
         if not isinstance(param_groups[0], dict):
             param_groups = [{"params": param_groups}]

-        self.param_cuda2cpu_map = dict()
+        self.param_d2h_map = dict()
         self.optim_dict = dict()
-        self.stream = torch.cuda.Stream()
+        self.device = device
+        if self.device == "cuda":
+            self.stream = torch.cuda.Stream()
+        elif self.device == "xpu":
+            self.stream = torch.xpu.Stream()

         # the queue maintains the order which param we should do optim step on first.
         self.queue = dict()

-        def backward_hook(p_cuda):
-            if p_cuda.grad is not None:
-                p_cpu = self.param_cuda2cpu_map[p_cuda]
+        def backward_hook(p_device):
+            if p_device.grad is not None:
+                p_host = self.param_d2h_map[p_device]

                 # make sure backward for this param finishes
-                self.stream.wait_stream(torch.cuda.current_stream())
-                with torch.cuda.stream(self.stream):
-                    p_cpu.grad.copy_(p_cuda.grad, non_blocking=True)
+                if self.device == "cuda":
+                    self.stream.wait_stream(torch.cuda.current_stream())
+                    with torch.cuda.stream(self.stream):
+                        p_host.grad.copy_(p_device.grad, non_blocking=True)
+                elif self.device == "xpu":
+                    self.stream.wait_stream(torch.xpu.current_stream())
+                    with torch.xpu.stream(self.stream):
+                        p_host.grad.copy_(p_device.grad, non_blocking=True)

                 # we rely on CPython implementation of dictionary, which preserves insertion order.
                 # if a param is added again (e.g. due to gradient accumulation), it is moved to the
                 # end of the queue by removing and inserting it again.
-                if p_cuda in self.queue:
-                    del self.queue[p_cuda]
-                self.queue[p_cuda] = self.stream.record_event()
+                if p_device in self.queue:
+                    del self.queue[p_device]
+                self.queue[p_device] = self.stream.record_event()

-                # deallocate CUDA gradients once D2H transfer finishes.
+                # deallocate DEVICE gradients once D2H transfer finishes.
                 if offload_gradients:
-                    p_cuda.grad.record_stream(self.stream)
-                    p_cuda.grad = None
+                    p_device.grad.record_stream(self.stream)
+                    p_device.grad = None

         for param_group in param_groups:
             params = param_group.pop("params")

-            for p_cuda in params:
-                if not p_cuda.requires_grad:
+            for p_device in params:
+                if not p_device.requires_grad:
                     continue

                 # pre-allocate CPU params and grads
-                p_cpu = torch.empty_like(p_cuda, device="cpu", pin_memory=True)
-                p_cpu.grad = torch.empty_like(p_cpu, pin_memory=True)
+                p_host = torch.empty_like(p_device, device="cpu", pin_memory=True)
+                p_host.grad = torch.empty_like(p_host, pin_memory=True)

-                p_cpu.copy_(p_cuda.detach(), non_blocking=True)
-                self.param_cuda2cpu_map[p_cuda] = p_cpu
+                p_host.copy_(p_device.detach(), non_blocking=True)
+                self.param_d2h_map[p_device] = p_host

-                p_cuda.register_post_accumulate_grad_hook(backward_hook)
-                self.optim_dict[p_cuda] = optimizer_class(
-                    [{"params": p_cpu, **param_group}], **kwargs
+                p_device.register_post_accumulate_grad_hook(backward_hook)
+                self.optim_dict[p_device] = optimizer_class(
+                    [{"params": p_host, **param_group}], **kwargs
                 )

     @torch.no_grad()
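
Editorial note: the cuda/xpu branches above repeat the same stream logic. A possible simplification, sketched under the assumption that torch.xpu mirrors the torch.cuda stream API (Stream, current_stream, stream), would resolve the device module once:

import torch

def _get_device_module(device: str):
    # hypothetical helper, not part of this PR: returns torch.cuda or torch.xpu,
    # both of which expose Stream(), current_stream(), and stream()
    if device not in ("cuda", "xpu"):
        raise ValueError(f"unsupported device: {device}")
    return getattr(torch, device)

# inside __init__ / backward_hook / step the branches would then collapse to:
#     mod = _get_device_module(self.device)
#     self.stream = mod.Stream()
#     self.stream.wait_stream(mod.current_stream())
#     with mod.stream(self.stream):
#         p_host.grad.copy_(p_device.grad, non_blocking=True)

Newer PyTorch versions also expose torch.get_device_module, which could serve the same purpose if the minimum supported version allows it.
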
@@ -91,26 +102,30 @@ def step(self, closure=None):
         if closure is not None:
             loss = closure()

-        for p_cuda, grad_d2h_event in self.queue.items():
+        for p_device, grad_d2h_event in self.queue.items():
             grad_d2h_event.synchronize()
-            self.optim_dict[p_cuda].step()
+            self.optim_dict[p_device].step()

             # submit more job to self.stream. it guarantees that we only start
             # moving param H2D once all backwards finish, since self.stream
             # will wait for current_stream when moving grad D2H.
-            p_cpu = self.param_cuda2cpu_map[p_cuda]
-            with torch.cuda.stream(self.stream):
-                p_cuda.copy_(p_cpu, non_blocking=True)
+            p_host = self.param_d2h_map[p_device]
+            if self.device == "cuda":
+                with torch.cuda.stream(self.stream):
+                    p_device.copy_(p_host, non_blocking=True)
+            elif self.device == "xpu":
+                with torch.xpu.stream(self.stream):
+                    p_device.copy_(p_host, non_blocking=True)

         self.queue.clear()
         return loss

     def zero_grad(self, set_to_none=True):
         assert set_to_none

-        # only clear CUDA grad. CPU grad will always be overwritten by CUDA grad.
-        for p_cuda in self.param_cuda2cpu_map.keys():
-            p_cuda.grad = None
+        # only clear DEVICE grad. CPU grad will always be overwritten by DEVICE grad.
+        for p_device in self.param_d2h_map.keys():
+            p_device.grad = None

     @property
     def param_groups(self):
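
To tie the pieces together, a short training-loop sketch (continuing the construction example above; names and shapes are illustrative) showing how step() and zero_grad() interact with the per-parameter queue:

for _ in range(10):
    x = torch.randn(16, 1024, device="xpu")
    loss = model(x).square().mean()
    loss.backward()    # post-accumulate hooks copy grads D2H on self.stream and enqueue events
    optim.step()       # per param: wait for its D2H event, run the CPU optimizer, copy the param back H2D
    optim.zero_grad()  # clears accelerator grads only; pinned CPU grad buffers are overwritten next step
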