
Commit 807a214

Levy Zhao authored and facebook-github-bot committed
Multiple fixes to MC modules to facilitate integration (#1391)
Summary: Some bug fixes found during the integration test in PyPER O3:

### Fix #1
`_embedding_bag_collection` (`ShardedEmbeddingBagCollection`) is not actually called by input_dist (the same input is already distributed by ShardedManagedCollisionCollection), so it never gets a chance to initiate `_input_dist`. As a result, TREC pipelining thinks it is not ready for input distribution. This is not expected, since the module is not used in that stage anyway, nor should it be put into the fused a2a communication. With this change, it satisfies the assertion at https://fburl.com/code/ud8lnixv while carrying no `_input_dists`, so it won't be included in the fused a2a.

### Fix #2
`ManagedCollisionCollection.forward` is not traceable because it uses an unwrapped `KeyedJaggedTensor.from_jt_dict`. We don't care about its internal details, so just keep it atomic.

### Fix #3
Due to how the remap table is set up, `MCHManagedCollisionModule` doesn't support i32 ID lists for now. An easy fix is to convert to i64 regardless; a more memory-efficient fix would probably be to change the remapper to i32 where necessary.

Differential Revision: D48804332
1 parent 1ffff9b commit 807a214

File tree: 4 files changed (+63, -2)

torchrec/distributed/mc_embeddingbag.py

Lines changed: 3 additions & 0 deletions
@@ -91,6 +91,9 @@ def __init__(
                 device=device,
             )
         )
+        # TODO: This is a hack since _embedding_bag_collection doesn't need input
+        # dist, so eliminating it so all fused a2a will ignore it.
+        self._embedding_bag_collection._has_uninitialized_input_dist = False
         self._managed_collision_collection: ShardedManagedCollisionCollection = mc_sharder.shard(
             module._managed_collision_collection,
             table_name_to_parameter_sharding,
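For context on fix #1, the sketch below is a hypothetical illustration (not actual TorchRec/TREC pipeline code) of the kind of readiness check and fused-a2a selection the summary describes; the helper `modules_to_fuse` and its exact attribute checks are assumptions made purely for illustration.

```python
# Hypothetical sketch only -- this pipeline helper does not exist in TorchRec.
# Fix #1 pre-sets _has_uninitialized_input_dist = False on _embedding_bag_collection,
# so a readiness assertion like this one passes, while the module exposes no
# _input_dists and therefore contributes nothing to the fused a2a.
from typing import List

import torch.nn as nn


def modules_to_fuse(sharded_modules: List[nn.Module]) -> List[nn.Module]:
    fusable: List[nn.Module] = []
    for module in sharded_modules:
        # A module still reporting an uninitialized input dist would fail here.
        assert not getattr(module, "_has_uninitialized_input_dist", False), (
            f"{type(module).__name__} has not initialized its input dist"
        )
        # Only modules that actually carry input dists join the fused a2a.
        if getattr(module, "_input_dists", []):
            fusable.append(module)
    return fusable
```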

torchrec/distributed/tests/test_mc_embeddingbag.py

Lines changed: 17 additions & 0 deletions
@@ -11,6 +11,7 @@
 
 import torch
 import torch.nn as nn
+from torchrec.distributed.embeddingbag import ShardedEmbeddingBagCollection
 from torchrec.distributed.mc_embeddingbag import (
     ManagedCollisionEmbeddingBagCollectionSharder,
     ShardedManagedCollisionEmbeddingBagCollection,
@@ -160,6 +161,22 @@ def _test_sharding(  # noqa C901
     assert isinstance(
         sharded_sparse_arch._mc_ebc, ShardedManagedCollisionEmbeddingBagCollection
     )
+    assert isinstance(
+        sharded_sparse_arch._mc_ebc._embedding_bag_collection,
+        ShardedEmbeddingBagCollection,
+    )
+    assert (
+        sharded_sparse_arch._mc_ebc._embedding_bag_collection._has_uninitialized_input_dist
+        is False
+    )
+    assert (
+        not hasattr(
+            sharded_sparse_arch._mc_ebc._embedding_bag_collection, "_input_dists"
+        )
+        or len(sharded_sparse_arch._mc_ebc._embedding_bag_collection._input_dists)
+        == 0
+    )
+
     assert isinstance(
         sharded_sparse_arch._mc_ebc._managed_collision_collection,
         ShardedManagedCollisionCollection,

torchrec/modules/mc_modules.py

Lines changed: 10 additions & 2 deletions
@@ -43,6 +43,11 @@ def apply_mc_method_to_jt_dict(
     return mc_output
 
 
+@torch.fx.wrap
+def coalesce_feature_dict(features_dict: Dict[str, JaggedTensor]) -> KeyedJaggedTensor:
+    return KeyedJaggedTensor.from_jt_dict(features_dict)
+
+
 class ManagedCollisionModule(nn.Module):
     """
     Abstract base class for ManagedCollisionModule.
@@ -190,7 +195,7 @@ def forward(
             table_to_features=self._table_to_features,
             managed_collisions=self._managed_collision_modules,
         )
-        return KeyedJaggedTensor.from_jt_dict(features_dict)
+        return coalesce_feature_dict(features_dict)
 
     def evict(self) -> Dict[str, Optional[torch.Tensor]]:
         evictions: Dict[str, Optional[torch.Tensor]] = {}
@@ -818,7 +823,10 @@ def remap(self, features: Dict[str, JaggedTensor]) -> Dict[str, JaggedTensor]:
 
         remapped_features: Dict[str, JaggedTensor] = {}
         for name, feature in features.items():
-            values = feature.values()
+            # TODO: This is a temporary hack to support i32 ID list so it could
+            # match remapper size. A more memory-efficient fix would be make
+            # remapper i32-tensor instead.
+            values = feature.values().to(torch.int64)
             remapped_ids = torch.empty_like(values)
 
             # compute overlap between incoming IDs and remapping table
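Fix #2 works because `torch.fx.wrap` registers a module-level function as a leaf: symbolic tracing records the call as a single node instead of tracing into its body. Below is a minimal, self-contained sketch of that idea; the `coalesce` helper and `Collection` module are made up for illustration and are not TorchRec code.

```python
from typing import Dict

import torch
import torch.fx


@torch.fx.wrap
def coalesce(tensors: Dict[str, torch.Tensor]) -> torch.Tensor:
    # Iterating over a runtime dict is data-dependent Python control flow that
    # symbolic tracing cannot follow; wrapping keeps this whole call atomic.
    return torch.cat([tensors[k] for k in sorted(tensors)])


class Collection(torch.nn.Module):
    def forward(self, tensors: Dict[str, torch.Tensor]) -> torch.Tensor:
        return coalesce(tensors)


gm = torch.fx.symbolic_trace(Collection())
print(gm.graph)  # coalesce shows up as a single call_function node
```

Fix #3 can be read the same way: assuming the MCH remap table is built as an int64 tensor (as the summary implies), the incoming ID list is simply upcast before remapping. A tiny sketch with made-up values:

```python
import torch

remap_table = torch.tensor([7, 11, 13], dtype=torch.int64)  # table built as int64
ids_i32 = torch.tensor([2, 0, 1], dtype=torch.int32)        # i32 ID list from input

values = ids_i32.to(torch.int64)      # the fix: upcast regardless of input dtype
remapped = torch.empty_like(values)   # int64 output buffer, matching the table
remapped.copy_(remap_table[values])   # -> tensor([13, 7, 11])
```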

torchrec/modules/tests/test_mc_embedding_modules.py

Lines changed: 33 additions & 0 deletions
@@ -256,6 +256,39 @@ def test_zch_ebc_eval(self) -> None:
 
         assert torch.all(remapped_kjt4["f2"].values() == remapped_kjt2["f2"].values())
 
+    def test_mc_collection_traceable(self) -> None:
+        device = torch.device("cpu")
+        zch_size = 20
+        update_interval = 2
+
+        embedding_configs = [
+            EmbeddingBagConfig(
+                name="t1",
+                embedding_dim=8,
+                num_embeddings=zch_size,
+                feature_names=["f1", "f2"],
+            ),
+        ]
+        mc_modules = {
+            "t1": cast(
+                ManagedCollisionModule,
+                MCHManagedCollisionModule(
+                    zch_size=zch_size,
+                    device=device,
+                    input_hash_size=2 * zch_size,
+                    eviction_interval=update_interval,
+                    eviction_policy=DistanceLFU_EvictionPolicy(),
+                ),
+            ),
+        }
+        mcc = ManagedCollisionCollection(
+            managed_collision_modules=mc_modules,
+            # pyre-ignore[6]
+            embedding_configs=embedding_configs,
+        )
+        gm: torch.fx.GraphModule = torch.fx.symbolic_trace(mcc)
+        gm.print_readable()
+
     def test_mch_ebc(self) -> None:
         device = torch.device("cpu")
         zch_size = 10
