Skip to content

Commit 092fe71

Browse files
drisspgpytorchmergebot
authored and committed
[Nested Tensor] detach (pytorch#84078)
## Summary Add a detach op for nested tensors. Nested tensors are not part of the composite explicit dispatch key set and therefore need to be added manually. The Detach test is failing only for dtype=torch.float32, torch.float16 and device=cuda. The chain of ops that are called is sum.backward() -> from_padded() -> unbind(). This populates the grad for a and b. Does this potentially indicate that the CUDA implementation for one of these ops, likely from_padded(), is incorrect? Pull Request resolved: pytorch#84078 Approved by: https://github.com/albanD
1 parent 43620b7 commit 092fe71

File tree

2 files changed

+32
-6
lines changed

2 files changed

+32
-6
lines changed

aten/src/ATen/native/native_functions.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4667,6 +4667,7 @@
46674667
variants: function, method
46684668
dispatch:
46694669
CompositeExplicitAutograd: detach
4670+
NestedTensorCPU, NestedTensorCUDA: detach
46704671

46714672
# Like `detach()`, but modifies this `Variable` in-place. This method may
46724673
# only be called on non-view `Variable`s. You can use `is_view()` to check

test/test_nestedtensor.py

Lines changed: 31 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
skipMeta,
1111
onlyCPU
1212
)
13+
from torch.testing._internal.common_dtype import floating_types_and_half
1314
from torch.testing._internal.common_utils import TestCase, IS_FBCODE, run_tests, freeze_rng_state, parametrize, gradcheck
1415
from torch import nested_tensor
1516

@@ -92,12 +93,6 @@ def test_unbind_1(self):
9293
torch.tensor([1]), torch.tensor([7]),
9394
)
9495

95-
# @torch.inference_mode()
96-
# def test_unbind_2(self):
97-
# self._test_unbind_case(
98-
# torch.tensor(1), torch.tensor(7),
99-
# )
100-
10196
@torch.inference_mode()
10297
def test_unbind_3(self):
10398
self._test_unbind_case(
@@ -302,6 +297,36 @@ def random_nt_pair(self, device, dtype, num_tensors, max_dims):
302297
return (torch.nested_tensor(ts1, device=device, dtype=dtype),
303298
torch.nested_tensor(ts2, device=device, dtype=dtype))
304299

300+
@dtypes(*floating_types_and_half())
301+
@dtypesIfCUDA(torch.float64)
302+
def test_detach(self, device, dtype):
303+
a = torch.randn(2, 4, device=device, dtype=dtype, requires_grad=False)
304+
b = torch.randn(5, 4, device=device, dtype=dtype, requires_grad=False)
305+
x = torch.nested_tensor([a, b]).requires_grad_()
306+
307+
x_detach = x.detach()
308+
309+
z = x_detach * 4
310+
self.assertFalse(x_detach.requires_grad)
311+
self.assertFalse(z.requires_grad)
312+
313+
a = torch.randn(2, 4, device=device, dtype=dtype, requires_grad=True)
314+
b = torch.randn(5, 4, device=device, dtype=dtype, requires_grad=True)
315+
x = torch.nested_tensor([a, b])
316+
317+
y = x * 2
318+
y = y.detach()
319+
self.assertFalse(y.requires_grad)
320+
self.assertIsNone(y.grad_fn)
321+
322+
z = x + y
323+
z.to_padded_tensor(0).sum().backward()
324+
# This is an incorrect gradient, but we assume that's what the user
325+
# wanted. detach() is an advanced option.
326+
self.assertEqual(a.grad, torch.ones(2, 4, device=device, dtype=dtype))
327+
self.assertEqual(b.grad, torch.ones(5, 4, device=device, dtype=dtype))
328+
329+
305330
@dtypes(torch.float, torch.float16, torch.double)
306331
def test_unbind_noncontiguous(self, device, dtype):
307332
nt_contiguous, nt_noncontiguous = random_nt_noncontiguous_pair((2, 3, 6, 7), device, dtype)

0 commit comments

Comments
 (0)