
Commit 5914cd3

q10 authored and facebook-github-bot committed
Replace AT_DISPATCH with FBGEMM_DISPATCH, pt 4 (#2385)
Summary: Pull Request resolved: #2385 - Replace AT_DISPATCH with FBGEMM_DISPATCH, pt 4
Reviewed By: spcyppt
Differential Revision: D54501814
1 parent d3a6166 · commit 5914cd3

3 files changed: +72 −62 lines changed


fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp

Lines changed: 2 additions & 2 deletions
@@ -1183,7 +1183,7 @@ Tensor asynchronous_exclusive_cumsum_cpu(const Tensor& t_in) {
 
   const auto t_in_contig = t_in.expect_contiguous();
   auto output = native_empty_like(*t_in_contig);
-  FBGEMM_DISPATCH_INTEGRAL_TYPES(
+  FBGEMM_DISPATCH_ALL_TYPES(
       t_in_contig->scalar_type(),
       "asynchronous_exclusive_cumsum_cpu_kernel",
       [&] {
@@ -1200,7 +1200,7 @@ Tensor asynchronous_inclusive_cumsum_cpu(const Tensor& t_in) {
 
   const auto t_in_contig = t_in.expect_contiguous();
   auto output = native_empty_like(*t_in_contig);
-  FBGEMM_DISPATCH_INTEGRAL_TYPES(
+  FBGEMM_DISPATCH_ALL_TYPES(
       t_in_contig->scalar_type(),
       "asynchronous_inclusive_cumsum_cpu_kernel",
       [&] {
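With the switch from FBGEMM_DISPATCH_INTEGRAL_TYPES to FBGEMM_DISPATCH_ALL_TYPES, the CPU cumsum operators now dispatch on floating-point inputs as well as integral ones, which is the behavior the updated test below exercises. A minimal sketch of what this enables, assuming fbgemm_gpu is installed so the torch.ops.fbgemm operators are registered; the input values are hypothetical:

import torch
import fbgemm_gpu  # noqa: F401  # importing registers the torch.ops.fbgemm operators

# A float32 input now dispatches on CPU instead of failing the dtype dispatch.
x = torch.tensor([1.5, 2.0, 3.5], dtype=torch.float32)

ze = torch.ops.fbgemm.asynchronous_exclusive_cumsum(x)  # tensor([0.0, 1.5, 3.5])
zi = torch.ops.fbgemm.asynchronous_inclusive_cumsum(x)  # tensor([1.5, 3.5, 7.0])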

fbgemm_gpu/src/sparse_ops/sparse_reorder_batched_ad.cu

Lines changed: 1 addition & 1 deletion
@@ -210,7 +210,7 @@ DLL_PUBLIC Tensor reorder_batched_ad_indices_gpu(
   const dim3 blocks(cuda_calc_xblock_count(
       reordered_cat_ad_offsets.numel() - 1,
       NUM_WARPS)); // one warp per sample
-  FBGEMM_DISPATCH_INTEGRAL_TYPES(
+  FBGEMM_DISPATCH_ALL_TYPES(
       cat_ad_indices.scalar_type(), "narrow_broadcast_indices_kernel_1", [&] {
         AT_DISPATCH_INDEX_TYPES(
             cat_ad_offsets.scalar_type(),

fbgemm_gpu/test/sparse/cumsum_test.py

Lines changed: 69 additions & 59 deletions
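The rewritten test below draws its device from a cpu_and_maybe_gpu() helper imported from test_utils. The helper's implementation is not part of this diff; a hypothetical sketch of such a Hypothesis strategy, assuming it simply samples from the devices available on the test machine:

import hypothesis.strategies as st
import torch

def cpu_and_maybe_gpu() -> st.SearchStrategy:
    # Hypothetical sketch: always offer the CPU device, and add CUDA only
    # when a GPU is actually present.
    devices = [torch.device("cpu")]
    if torch.cuda.is_available():
        devices.append(torch.device("cuda"))
    return st.sampled_from(devices)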
@@ -7,9 +7,10 @@
 
 # pyre-strict
 
-# pyre-ignore-all-errors[56]
+# pyre-ignore-all-errors[53,56]
 
 import unittest
+from typing import Tuple, Type
 
 import hypothesis.strategies as st
 import numpy as np
@@ -20,27 +21,45 @@
 
 if open_source:
     # pyre-ignore[21]
-    from test_utils import gpu_available
+    from test_utils import cpu_and_maybe_gpu, gpu_available
 else:
     import fbgemm_gpu.sparse_ops  # noqa: F401, E402
-    from fbgemm_gpu.test.test_utils import gpu_available
+    from fbgemm_gpu.test.test_utils import cpu_and_maybe_gpu, gpu_available
 
 
 class CumSumTest(unittest.TestCase):
     @given(
         n=st.integers(min_value=0, max_value=10),
-        long_index=st.booleans(),
+        index_types=st.sampled_from(
+            [
+                (torch.int64, np.int64),
+                (torch.int32, np.int32),
+                (torch.float32, np.float32),
+            ]
+        ),
+        device=cpu_and_maybe_gpu(),
     )
     @settings(verbosity=Verbosity.verbose, max_examples=20, deadline=None)
-    def test_cumsum(self, n: int, long_index: bool) -> None:
-        index_dtype = torch.int64 if long_index else torch.int32
-        np_index_dtype = np.int64 if long_index else np.int32
+    def test_cumsum(
+        self,
+        n: int,
+        index_types: Tuple[Type[object], Type[object]],
+        device: torch.device,
+    ) -> None:
+        (pt_index_dtype, np_index_dtype) = index_types
+
+        # The CPU variants of asynchronous_*_cumsum support floats, since some
+        # downstream tests appear to be relying on this behavior. As such, the
+        # test is disabled for GPU + float test cases.
+        if device == torch.device("cuda") and pt_index_dtype is torch.float32:
+            return
 
-        # cpu tests
-        x = torch.randint(low=0, high=100, size=(n,)).type(index_dtype)
+        # pyre-ignore-errors[16]
+        x = torch.randint(low=0, high=100, size=(n,)).type(pt_index_dtype).to(device)
         ze = torch.ops.fbgemm.asynchronous_exclusive_cumsum(x)
         zi = torch.ops.fbgemm.asynchronous_inclusive_cumsum(x)
         zc = torch.ops.fbgemm.asynchronous_complete_cumsum(x)
+
         torch.testing.assert_close(
             torch.from_numpy(np.cumsum(x.cpu().numpy()).astype(np_index_dtype)),
             zi.cpu(),
@@ -59,68 +78,59 @@ def test_cumsum(self, n: int, long_index: bool) -> None:
         )
 
         # meta tests
-        mx = torch.randint(low=0, high=100, size=(n,)).type(index_dtype).to("meta")
+        # pyre-ignore-errors[16]
+        mx = torch.randint(low=0, high=100, size=(n,)).type(pt_index_dtype).to("meta")
+
         mze = torch.ops.fbgemm.asynchronous_exclusive_cumsum(mx)
         self.assertEqual(ze.size(), mze.size())
-        # mzi = torch.ops.fbgemm.asynchronous_inclusive_cumsum(mx)
-        # self.assertEqual(zi.size(), mzi.size())
+
+        mzi = torch.ops.fbgemm.asynchronous_inclusive_cumsum(mx)
+        self.assertEqual(zi.size(), mzi.size())
+
         mzc = torch.ops.fbgemm.asynchronous_complete_cumsum(mx)
         self.assertEqual(zc.size(), mzc.size())
 
-        if gpu_available:
-            x = x.cuda()
-            ze = torch.ops.fbgemm.asynchronous_exclusive_cumsum(x)
-            zi = torch.ops.fbgemm.asynchronous_inclusive_cumsum(x)
-            zc = torch.ops.fbgemm.asynchronous_complete_cumsum(x)
-            torch.testing.assert_close(
-                torch.from_numpy(np.cumsum(x.cpu().numpy()).astype(np_index_dtype)),
-                zi.cpu(),
-            )
-            torch.testing.assert_close(
-                torch.from_numpy(
-                    (np.cumsum([0] + x.cpu().numpy().tolist())[:-1]).astype(
-                        np_index_dtype
-                    )
-                ),
-                ze.cpu(),
-            )
-            torch.testing.assert_close(
-                torch.from_numpy(
-                    (np.cumsum([0] + x.cpu().numpy().tolist())).astype(np_index_dtype)
-                ),
-                zc.cpu(),
-            )
-
     @given(
         n=st.integers(min_value=0, max_value=60),
         b=st.integers(min_value=0, max_value=10),
-        long_index=st.booleans(),
+        index_types=st.sampled_from(
+            [
+                (torch.int64, np.int64),
+                (torch.int32, np.int32),
+                (torch.float32, np.float32),
+            ]
+        ),
+        device=cpu_and_maybe_gpu(),
    )
     @settings(verbosity=Verbosity.verbose, max_examples=20, deadline=None)
     def test_asynchronous_complete_cumsum_2d(
-        self, n: int, b: int, long_index: bool
+        self,
+        n: int,
+        b: int,
+        index_types: Tuple[Type[object], Type[object]],
+        device: torch.device,
     ) -> None:
-        index_dtype = torch.int64 if long_index else torch.int32
-
-        def test_asynchronous_complete_cumsum_2d_helper(x: torch.Tensor) -> None:
-            np_index_dtype = np.int64 if long_index else np.int32
-            zc = torch.ops.fbgemm.asynchronous_complete_cumsum(x)
-            zeros = torch.zeros(b, 1)
-            torch.testing.assert_close(
-                torch.from_numpy(
-                    np.cumsum(
-                        torch.concat([zeros, x.cpu()], dim=1).numpy(), axis=1
-                    ).astype(np_index_dtype)
-                ),
-                zc.cpu(),
-            )
-
-        x = torch.randint(low=0, high=100, size=(b, n)).type(index_dtype)
-        # cpu test
-        test_asynchronous_complete_cumsum_2d_helper(x)
-        if gpu_available:
-            # gpu test
-            test_asynchronous_complete_cumsum_2d_helper(x.cuda())
+        (pt_index_dtype, np_index_dtype) = index_types
+
+        # The CPU variants of asynchronous_*_cumsum support floats, since some
+        # downstream tests appear to be relying on this behavior. As such, the
+        # test is disabled for GPU + float test cases.
+        if device == torch.device("cuda") and pt_index_dtype is torch.float32:
+            return
+
+        # pyre-ignore-errors[16]
+        x = torch.randint(low=0, high=100, size=(b, n)).type(pt_index_dtype).to(device)
+
+        zc = torch.ops.fbgemm.asynchronous_complete_cumsum(x)
+        zeros = torch.zeros(b, 1)
+        torch.testing.assert_close(
+            torch.from_numpy(
+                np.cumsum(torch.concat([zeros, x.cpu()], dim=1).numpy(), axis=1).astype(
+                    np_index_dtype
+                )
+            ),
+            zc.cpu(),
+        )
 
 
 extend_test_class(CumSumTest)
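For reference, the three cumsum variants asserted in this test differ only in the leading zero and whether the grand total is kept. A small NumPy illustration with hypothetical values, mirroring the expressions used in the assertions above:

import numpy as np

x = np.array([3, 1, 4], dtype=np.int64)

inclusive = np.cumsum(x)                      # [3, 4, 8]
exclusive = np.cumsum([0] + x.tolist())[:-1]  # [0, 3, 4]
complete = np.cumsum([0] + x.tolist())        # [0, 3, 4, 8], one element longer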
