
Commit 46f2559

sryap authored and facebook-github-bot committed
Add optimized TBE training forward (#1804)
Summary:
Pull Request resolved: #1804

This diff adds the frontend changes and tests for TBE v2 (D43634651).

The `FBGEMM_EXPERIMENTAL_TBE` environment variable flag is added for enabling/disabling the new implementation at runtime. If `FBGEMM_EXPERIMENTAL_TBE` is not set, TBE uses the original implementation. If `FBGEMM_EXPERIMENTAL_TBE=1`, TBE uses the new implementation. If a TBE use case is not supported by the new implementation, TBE falls back to the original implementation. By default, `FBGEMM_EXPERIMENTAL_TBE` is not set.

The new implementation can also be enabled by passing `use_experimental_tbe=True` when instantiating the TBE operator:

```
emb_op = SplitTableBatchedEmbeddingBagsCodegen(
    embedding_specs=...,
    ...,
    use_experimental_tbe=True,
)
```

Reviewed By: jianyuh

Differential Revision: D44479772

fbshipit-source-id: b961811488a25904a3f34660c553067b1ab93c95
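For illustration, a minimal sketch (not part of the commit message) of the environment-variable path described above. The flag has to be set before the TBE operator is constructed, since it is read once in `__init__` (see the diff in `split_table_batched_embeddings_ops_training.py` below):

```python
# Sketch under the assumptions above: enable the experimental TBE forward
# process-wide via the environment variable instead of the constructor flag.
import os

# Must be set before SplitTableBatchedEmbeddingBagsCodegen is instantiated,
# because the flag is resolved once in its __init__.
os.environ["FBGEMM_EXPERIMENTAL_TBE"] = "1"

# Leaving the variable unset (or setting it to anything other than "1")
# keeps the original implementation.
```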
1 parent 27ef9a0 commit 46f2559

5 files changed: 66 additions, 6 deletions

fbgemm_gpu/codegen/lookup_args.py

Lines changed: 1 addition & 0 deletions
@@ -40,6 +40,7 @@ class CommonArgs(NamedTuple):
     lxu_cache_locations: torch.Tensor
     output_dtype: int
     vbe_metadata: VBEMetadata
+    is_experimental: bool
 
 
 class OptimizerArgs(NamedTuple):

fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template

Lines changed: 1 addition & 0 deletions
@@ -277,5 +277,6 @@ def invoke(
             max_counter=max_counter,
         {% endif %}
         output_dtype=common_args.output_dtype,
+        is_experimental=common_args.is_experimental,
     )
 {% endif %}

fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_training.py

Lines changed: 19 additions & 0 deletions
@@ -9,6 +9,7 @@
 
 import enum
 import logging
+import os
 from dataclasses import dataclass, field
 from itertools import accumulate
 from math import log2
@@ -209,6 +210,7 @@ def __init__(  # noqa C901
         device: Optional[Union[str, int, torch.device]] = None,
         bounds_check_mode: BoundsCheckMode = BoundsCheckMode.WARNING,
         uvm_non_rowwise_momentum: bool = False,  # place non-rowwise momentum on UVM
+        use_experimental_tbe: bool = False,  # set to True to use TBE v2 (only support NVIDIA GPUs)
     ) -> None:
         super(SplitTableBatchedEmbeddingBagsCodegen, self).__init__()
 
@@ -595,6 +597,22 @@ def __init__(  # noqa C901
 
         self.step = 0
 
+        # Check whether to use TBE v2
+        is_experimental = False
+        fbgemm_exp_tbe = os.environ.get("FBGEMM_EXPERIMENTAL_TBE")
+        if use_experimental_tbe:
+            is_experimental = True
+            logging.info(
+                "use_experimental_tbe is set to True; Use experimental TBE: True"
+            )
+        elif fbgemm_exp_tbe is not None:
+            is_experimental = int(fbgemm_exp_tbe) == 1
+            logging.info(
+                f"FBGEMM_EXPERIMENTAL_TBE is set to {fbgemm_exp_tbe}; "
+                f"Use experimental TBE: {is_experimental}"
+            )
+        self.is_experimental: bool = is_experimental
+
     def _register_nonpersistent_buffers(self, prefix: str) -> None:
         # NOTE: make TorchScript work!
         self.register_buffer(
@@ -811,6 +829,7 @@ def forward(  # noqa: C901
             lxu_cache_locations=lxu_cache_locations,
             output_dtype=self.output_dtype,
             vbe_metadata=vbe_metadata,
+            is_experimental=self.is_experimental,
         )
 
         if self.optimizer == OptimType.EXACT_SGD:
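The flag-resolution logic added to `__init__` above reduces to roughly the following standalone rule (a paraphrase for illustration only; the helper name is made up and does not exist in this commit):

```python
import os


def resolve_is_experimental(use_experimental_tbe: bool) -> bool:
    """Paraphrase of the resolution logic added in __init__ above."""
    # The constructor argument takes priority and forces the new implementation on.
    if use_experimental_tbe:
        return True
    # Otherwise the FBGEMM_EXPERIMENTAL_TBE environment variable decides;
    # only the value 1 enables the new implementation.
    fbgemm_exp_tbe = os.environ.get("FBGEMM_EXPERIMENTAL_TBE")
    if fbgemm_exp_tbe is not None:
        return int(fbgemm_exp_tbe) == 1
    # Default: keep the original implementation.
    return False
```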

fbgemm_gpu/fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py

Lines changed: 1 addition & 0 deletions
@@ -451,6 +451,7 @@ def forward(
                 max_B_feature_rank=-1,
                 output_size=-1,
             ),
+            is_experimental=False,
         )
 
         momentum1 = invokers.lookup_args.Momentum(

fbgemm_gpu/test/split_table_batched_embeddings_test.py

Lines changed: 44 additions & 6 deletions
@@ -156,6 +156,7 @@ def execute_forward_(  # noqa C901
         pooling_mode: PoolingMode,
         use_cpu: bool,
         output_dtype: SparseType,
+        use_experimental_tbe: bool,
     ) -> None:
         # NOTE: cache is not applicable to CPU version.
         assume(not use_cpu or not use_cache)
@@ -324,6 +325,7 @@ def execute_forward_(  # noqa C901
             cache_algorithm=cache_algorithm,
             pooling_mode=pooling_mode,
             output_dtype=output_dtype,
+            use_experimental_tbe=use_experimental_tbe,
         )
         # NOTE: test TorchScript-compatible!
         cc = torch.jit.script(cc)
@@ -412,6 +414,7 @@ def test_forward_cpu_int8(
             pooling_mode,
             use_cpu,
             SparseType.FP32,
+            False,  # use_experimental_tbe
         )
 
     def test_forward_cpu_fp32(
@@ -456,6 +459,7 @@ def test_forward_cpu_fp32(
             pooling_mode,
             use_cpu,
             SparseType.FP32,
+            False,  # use_experimental_tbe
         )
 
     @unittest.skipIf(*gpu_unavailable)
@@ -505,11 +509,22 @@ def test_forward_gpu_no_cache_int8(
             pooling_mode,
             use_cpu,
             SparseType.FP32,
+            False,  # use_experimental_tbe
         )
 
     @unittest.skipIf(*gpu_unavailable)
+    @given(
+        use_experimental_tbe=st.booleans() if not TEST_WITH_ROCM else st.just(False),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=MAX_EXAMPLES_LONG_RUNNING,
+        deadline=None,
+        suppress_health_check=[HealthCheck.filter_too_much, HealthCheck.data_too_large],
+    )
     def test_forward_gpu_no_cache_fp16(
         self,
+        use_experimental_tbe: bool,
     ) -> None:
         weights_precision = SparseType.FP16
         use_cpu = False
@@ -527,15 +542,17 @@ def test_forward_gpu_no_cache_fp16(
             [
                 PoolingMode.SUM,
                 PoolingMode.MEAN,
-                PoolingMode.NONE,
             ]
+            + ([PoolingMode.NONE] if not use_experimental_tbe else [])
         )
         if pooling_mode == PoolingMode.NONE:
             mixed = False
             mixed_B = False
         else:
             mixed = random.choice([True, False])
-            mixed_B = random.choice([True, False])
+            mixed_B = (
+                random.choice([True, False]) if not use_experimental_tbe else False
+            )
         if pooling_mode == PoolingMode.SUM:
             weighted = random.choice([True, False])
         else:
@@ -555,11 +572,22 @@ def test_forward_gpu_no_cache_fp16(
             pooling_mode,
             use_cpu,
             SparseType.FP32,
+            use_experimental_tbe,
         )
 
     @unittest.skipIf(*gpu_unavailable)
+    @given(
+        use_experimental_tbe=st.booleans() if not TEST_WITH_ROCM else st.just(False),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=MAX_EXAMPLES_LONG_RUNNING,
+        deadline=None,
+        suppress_health_check=[HealthCheck.filter_too_much, HealthCheck.data_too_large],
+    )
     def test_forward_gpu_no_cache_fp32(
         self,
+        use_experimental_tbe: bool,
     ) -> None:
         weights_precision = SparseType.FP32
         use_cpu = False
@@ -577,15 +605,17 @@ def test_forward_gpu_no_cache_fp32(
             [
                 PoolingMode.SUM,
                 PoolingMode.MEAN,
-                PoolingMode.NONE,
             ]
+            + ([PoolingMode.NONE] if not use_experimental_tbe else [])
         )
         if pooling_mode == PoolingMode.NONE:
             mixed = False
             mixed_B = False
         else:
             mixed = random.choice([True, False])
-            mixed_B = random.choice([True, False])
+            mixed_B = (
+                random.choice([True, False]) if not use_experimental_tbe else False
+            )
         if pooling_mode == PoolingMode.SUM:
             weighted = random.choice([True, False])
         else:
@@ -605,6 +635,7 @@ def test_forward_gpu_no_cache_fp32(
             pooling_mode,
             use_cpu,
             SparseType.FP32,
+            use_experimental_tbe,
         )
 
     @unittest.skipIf(*gpu_unavailable)
@@ -668,11 +699,13 @@ def test_forward_gpu_uvm_cache_int8(
             pooling_mode,
             use_cpu,
             output_dtype,
+            False,  # use_experimental_tbe
         )
 
     @unittest.skipIf(*gpu_unavailable)
     @given(
         cache_algorithm=st.sampled_from(CacheAlgorithm),
+        use_experimental_tbe=st.booleans() if not TEST_WITH_ROCM else st.just(False),
     )
     @settings(
         verbosity=Verbosity.verbose,
@@ -683,6 +716,7 @@ def test_forward_gpu_uvm_cache_int8(
     def test_forward_gpu_uvm_cache_fp16(
         self,
         cache_algorithm: CacheAlgorithm,
+        use_experimental_tbe: bool,
     ) -> None:
         weights_precision = SparseType.FP16
         use_cpu = False
@@ -698,8 +732,8 @@ def test_forward_gpu_uvm_cache_fp16(
             [
                 PoolingMode.SUM,
                 PoolingMode.MEAN,
-                PoolingMode.NONE,
             ]
+            + ([PoolingMode.NONE] if not use_experimental_tbe else [])
         )
         output_dtype = random.choice(
             [
@@ -731,11 +765,13 @@ def test_forward_gpu_uvm_cache_fp16(
             pooling_mode,
             use_cpu,
             output_dtype,
+            use_experimental_tbe,
         )
 
     @unittest.skipIf(*gpu_unavailable)
     @given(
         cache_algorithm=st.sampled_from(CacheAlgorithm),
+        use_experimental_tbe=st.booleans() if not TEST_WITH_ROCM else st.just(False),
     )
     @settings(
         verbosity=Verbosity.verbose,
@@ -746,6 +782,7 @@ def test_forward_gpu_uvm_cache_fp16(
     def test_forward_gpu_uvm_cache_fp32(
         self,
         cache_algorithm: CacheAlgorithm,
+        use_experimental_tbe: bool,
     ) -> None:
         weights_precision = SparseType.FP32
         use_cpu = False
@@ -761,8 +798,8 @@ def test_forward_gpu_uvm_cache_fp32(
             [
                 PoolingMode.SUM,
                 PoolingMode.MEAN,
-                PoolingMode.NONE,
             ]
+            + ([PoolingMode.NONE] if not use_experimental_tbe else [])
         )
         output_dtype = random.choice(
             [
@@ -794,6 +831,7 @@ def test_forward_gpu_uvm_cache_fp32(
             pooling_mode,
             use_cpu,
             output_dtype,
+            use_experimental_tbe,
         )
 
     @unittest.skipIf(*gpu_unavailable)
