Skip to content

Commit fd49090

Browse files
suo authored and facebook-github-bot committed
improve granularity of PooledEmbeddingArchAwaitable (pytorch#1267)
Summary: Pull Request resolved: pytorch#1267 Today, if I do a `__getitem__` call on `PooledEmbeddingArchAwaitable`, it triggers the wait. We'd like to defer that further to when the result of `__getitem__` is actually used. So instead, have `__getitem__` return another `LazyAwaitable` which represents the pooled embedding. Usage of that value in the context of a torchfunction will trigger the wait as desired. This ends up being important for PT2 IR integration, which eagerly dumps a bunch of `__getitem__` calls right after the sparse arch because PT2 IR prefers to operate on "flat" values. With improved granularity, we still get the desired lazy behavior. For pure eager users, this should be a no-op (we generally only call `__getitem__` right before use, so this doesn't reorder anything). The laziness affects the ordering of comms/compute, which is important in two ways: 1. PEA design means that the per-rank feature processing behavior causes the specific order of execution to be load-bearing. Without the laziness, the execution order of ranks with vs. without feature processing will diverge, causing training hangs. 2. getting comms/compute overlapping for the all to all comms vs. dense compute is likely to be a performance improvement, although it is hard to make a direct comparison because of issue pytorch#1. Further details can be found in: https://fb.workplace.com/groups/319878845696681/posts/1017888535895705 Reviewed By: dstaay-fb Differential Revision: D47272219 fbshipit-source-id: e3250caf23d800783202c07ae669c2e00708ab6e
1 parent 2aed06b commit fd49090

File tree

3 files changed

+79
-3
lines changed

3 files changed

+79
-3
lines changed

torchrec/distributed/embeddingbag.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
EmbeddingModuleShardingPlan,
5151
EnumerableShardingSpec,
5252
LazyAwaitable,
53+
LazyGetItemMixin,
5354
NullShardedModuleContext,
5455
ParameterSharding,
5556
QuantizedCommCodecs,
@@ -289,7 +290,9 @@ def construct_output_kt(
289290
)
290291

291292

292-
class EmbeddingBagCollectionAwaitable(LazyAwaitable[KeyedTensor]):
293+
class EmbeddingBagCollectionAwaitable(
294+
LazyGetItemMixin[str, Tensor], LazyAwaitable[KeyedTensor]
295+
):
293296
def __init__(
294297
self,
295298
awaitables: List[Awaitable[torch.Tensor]],

torchrec/distributed/tests/test_lazy_awaitable.py

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
import torch
1212
import torch.fx
13-
from torchrec.distributed.types import LazyAwaitable
13+
from torchrec.distributed.types import LazyAwaitable, LazyGetItemMixin
1414

1515

1616
class NeedWait(LazyAwaitable[torch.Tensor]):
@@ -252,3 +252,33 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
252252
self.assertTrue(torch.equal(ref_res, 17 * torch.ones(3, 4)))
253253

254254
tempFile.close()
255+
256+
def test_lazy_getitem_mixin(self) -> None:
    """Verify that LazyGetItemMixin defers wait(): __getitem__ must return
    another LazyAwaitable, and only the use of that value inside a torch
    function (torch.add below) triggers the parent's wait()."""

    class LazyGetItemAwaitable(
        # NOTE: LazyGetItemMixin precedes LazyAwaitable so the MRO selects
        # the mixin's lazy __getitem__ implementation.
        LazyGetItemMixin[str, torch.Tensor], LazyAwaitable[Dict[str, torch.Tensor]]
    ):
        def __init__(self, actual_value: Dict[str, torch.Tensor]):
            super().__init__()
            self.actual_value = actual_value

        def _wait_impl(self) -> Dict[str, torch.Tensor]:
            # Tripling the values is an observable side effect that marks
            # the wait as having run; the final assertion depends on it.
            for v in self.actual_value.values():
                v *= 3
            return self.actual_value

    actual_value = {"foo": torch.tensor(1), "bar": torch.tensor(2)}
    a = LazyGetItemAwaitable(actual_value)
    lazy_foo = a["foo"]
    lazy_bar = a["bar"]
    # The returned value should be lazy
    self.assertIsInstance(lazy_foo, LazyAwaitable)
    self.assertIsInstance(lazy_bar, LazyAwaitable)

    # Our lazy values should not have been waited yet
    # (NOTE(review): peeks at the private _result slot of LazyAwaitable —
    # assumes None means "not yet waited"; confirm against types.py.)
    self.assertIsNone(lazy_foo._result)
    self.assertIsNone(lazy_bar._result)
    self.assertIsNone(a._result)

    # The use of a torch op should trigger exactly one wait on the parent object.
    # The expected value reflects the *=3 side effect applied once per key.
    result = torch.add(lazy_foo, lazy_bar)
    self.assertEqual(result, torch.tensor(1 * 3 + 2 * 3))

torchrec/distributed/types.py

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -342,8 +342,9 @@ def _wait_async(obj: Any) -> Any:
342342
else:
343343
return obj
344344

345+
@classmethod
345346
# pyre-ignore [2, 3]
346-
def __torch_function__(self, func, types, args=(), kwargs=None):
347+
def __torch_function__(cls, func, types, args=(), kwargs=None):
347348
"""
348349
The LazyAwaitable type has a `__torch_function__` implementation.
349350
This means when this type is seens as an argument to a PyTorch
@@ -391,6 +392,48 @@ def _wait_impl(self) -> W:
391392
return self._obj
392393

393394

395+
KT = TypeVar("KT")
396+
VT_co = TypeVar("VT_co")
397+
ParentW = TypeVar("ParentW")
398+
399+
400+
class LazyGetItemMixin(Generic[KT, VT_co]):
    """Augments the base LazyAwaitable with a lazy __getitem__ method.

    Instead of triggering a wait() on a __getitem__ call, classes using this
    mixin will return another awaitable. This can achieve better
    communication/computation overlap by deferring the wait() until the
    tensor data is actually needed.

    This is intended for Awaitables that model keyed collections, like
    dictionaries or EmbeddingBagCollectionAwaitable.

    NOTE: if using this mixin, please include it before LazyAwaitable in the
    inheritance list, so that Python MRO can properly select this __getitem__
    implementation.
    """

    def __getitem__(self, key: KT) -> LazyAwaitable[VT_co]:
        # Do not wait here: wrap (self, key) in a new awaitable that performs
        # the actual indexing only when its own value is demanded.
        return GetItemLazyAwaitable(self, key)
418+
419+
420+
class GetItemLazyAwaitable(LazyAwaitable[W], Generic[W, ParentW, KT]):
    """The LazyAwaitable produced by a __getitem__ call on `LazyGetItemMixin`.

    Holds a reference to the parent awaitable together with the requested
    key; only when this value is actually demanded does it wait on the
    parent and index into the result.
    """

    def __init__(self, parent: LazyAwaitable[ParentW], key: KT) -> None:
        super().__init__()
        self._parent = parent
        self._key = key

    def _wait_impl(self) -> W:
        # Resolve the parent first (triggering its wait if it has not run
        # yet), then extract just the entry this awaitable represents.
        parent_value = LazyAwaitable._wait_async(self._parent)
        return parent_value[self._key]
435+
436+
394437
# install magic methods
395438
for orig_method_name in torch.fx.graph.magic_methods:
396439
as_magic = f"__{orig_method_name}__"

0 commit comments

Comments
 (0)