
Commit d3f7c34

eellison authored and pytorchmergebot committed
Enable aten-aten decomps (pytorch#85921)
Invokes aten-aten decomps with a re-entrant FakeMode. These decomps are already used in other places, so it is good to unify the path that (static-shape) fake tensor takes and to get additional testing on them. This also fixes an instance where we returned different devices between CPU and CUDA ([batch_norm](https://github.com/pytorch/pytorch/blob/master/torch/_decomp/decompositions.py#L1374)).

Pull Request resolved: pytorch#85921
Approved by: https://github.com/ezyang
1 parent af9c6bc commit d3f7c34
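
The batch_norm device fix can be exercised directly with the CrossRefFakeMode check added in test/test_fake_tensor.py; the following is a standalone version of that test (only the imports are added here):

    import torch
    import torch._subclasses

    # CrossRefFakeMode runs real kernels and cross-checks them against fake tensors;
    # before this change the batch_norm decomposition could report a mismatched
    # device between CPU and CUDA.
    with torch._subclasses.CrossRefFakeMode():
        m = torch.nn.Sequential(
            torch.nn.BatchNorm2d(10),
            torch.nn.ReLU(),
        )
        m.eval()
        out = m(torch.randn([2, 10, 8, 8]))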

File tree

8 files changed, +64 −35 lines changed

functorch/test/test_aotdispatch.py

Lines changed: 0 additions & 2 deletions
@@ -880,7 +880,6 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('mvlgamma', 'mvlgamma_p_3'), # aten.digamma_.default - couldn't find symbolic meta function/decom...
     xfail('mvlgamma', 'mvlgamma_p_5'), # aten.digamma_.default - couldn't find symbolic meta function/decom...
     xfail('nanmedian', ''), # aten.logical_or_.default - couldn't find symbolic meta function/decomposition
-    xfail('native_layer_norm', ''), # could not find kernel
     xfail('nn.functional._scaled_dot_product_attention', ''), # Cannot call sizes() on tensor with symbolic ...
     xfail('nn.functional.adaptive_avg_pool1d', ''), # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('nn.functional.adaptive_avg_pool2d', ''), # aten._adaptive_avg_pool2d_backward.default - couldn't ...
@@ -923,7 +922,6 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('nn.functional.interpolate', 'trilinear'), # Cannot call sizes() on tensor with symbolic sizes/st...
     xfail('nn.functional.kl_div', ''), # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('nn.functional.l1_loss', ''), # Cannot call sizes() on tensor with symbolic sizes/strides
-    xfail('nn.functional.layer_norm', ''), # could not find kernel
     xfail('nn.functional.linear', ''), # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('nn.functional.local_response_norm', ''), # aten.fill.Scalar - couldn't find symbolic meta functio...
     xfail('nn.functional.max_pool1d', ''), # Cannot call sizes() on tensor with symbolic sizes/strides

test/test_fake_tensor.py

Lines changed: 9 additions & 0 deletions
@@ -464,6 +464,15 @@ def fn(tensors):
         inputs = [a, b]
         ref = fn(inputs)

+    def test_fake_tensor_batch_norm_cpu(self):
+        with torch._subclasses.CrossRefFakeMode():
+            m = torch.nn.Sequential(
+                torch.nn.BatchNorm2d(10),
+                torch.nn.ReLU(),
+            )
+            m.eval()
+            out = m(torch.randn([2, 10, 8, 8]))
+
     def test_shared_storage_invalidation(self):
         with FakeTensorMode():
             x = torch.tensor([1.])

test/test_ops.py

Lines changed: 0 additions & 4 deletions
@@ -1811,10 +1811,6 @@ def test_refs_are_in_decomp_table(self, op):
     "linalg.norm",
     "linalg.svd",
     "linalg.svdvals",
-    "nn.functional.binary_cross_entropy_with_logits",
-    "nn.functional.huber_loss",
-    "nn.functional.logsigmoid",
-    "nn.functional.multilabel_soft_margin_loss",
     "pca_lowrank",
     "roll",
     "svd_lowrank",

torch/_C/__init__.pyi.in

Lines changed: 1 addition & 2 deletions
@@ -843,9 +843,8 @@ def _get_cublas_allow_fp16_reduced_precision_reduction() -> _bool: ... #THPModul
 def _set_cublas_allow_fp16_reduced_precision_reduction(arg: _bool) -> None: ... #THPModule_setAllowFP16ReductionCuBLAS
 def _set_conj(x: Tensor, conj: _bool) -> None: ...
 def _set_neg(x: Tensor, neg: _bool) -> None: ...
-def _add_meta_to_tls_dispatch_include() -> None: ...
+def _set_meta_in_tls_dispatch_include(meta_in_tls: _bool) -> None: ...
 def _meta_in_tls_dispatch_include() -> _bool: ...
-def _remove_meta_from_tls_dispatch_include() -> None: ...
 def _has_storage(x: Tensor) -> _bool: ...
 def _should_allow_numbers_as_tensors(func_name: str) -> _bool: ...
 # NB: There is no Capsule type in typing, see
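
The add/remove pair collapses into one boolean setter. A hedged usage sketch of the new interface (the two `torch._C` functions appear in the diff above; the surrounding save/restore pattern mirrors `in_kernel_invocation_manager` further below and is illustrative only):

    import torch

    # Save, set, and restore the Meta key in the thread-local dispatch include set.
    prev = torch._C._meta_in_tls_dispatch_include()
    torch._C._set_meta_in_tls_dispatch_include(True)
    try:
        pass  # code that should dispatch to Meta kernels would run here
    finally:
        torch._C._set_meta_in_tls_dispatch_include(prev)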

torch/_decomp/__init__.py

Lines changed: 9 additions & 0 deletions
@@ -18,6 +18,10 @@

 meta_lib = torch.library.Library("aten", "IMPL", "Meta")

+# decompositions which have been disabled as meta kernel implementations,
+# usually due to mismatching strides, aliasing, or other inconsistent property
+_disabled_meta_decomps = set()
+

 def register_decomposition(aten_op, registry=None, *, disable_meta: bool = False):
     """
@@ -105,6 +109,11 @@ def add_op_to_table(aten_op):
         name = op_overload._schema.name
         if op_overload._schema.overload_name:
             name += "." + op_overload._schema.overload_name
+
+        if disable_meta:
+            global _disabled_meta_decomps
+            _disabled_meta_decomps.add(op_overload)
+
         if (
             not disable_meta
             # TorchScript dumps a bunch of extra nonsense overloads
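
A hedged sketch of inspecting the new bookkeeping from Python (aten.native_batch_norm is used only as an example of an op with a Python decomposition; whether it appears in either container depends on how it was registered):

    import torch
    from torch._decomp import decomposition_table, _disabled_meta_decomps

    op = torch.ops.aten.native_batch_norm.default
    # does this overload have a registered Python decomposition?
    print(op in decomposition_table)
    # was it registered with disable_meta=True, i.e. kept out of the Meta kernels
    # and, with this change, out of the fake tensor decomposition fallback?
    print(op in _disabled_meta_decomps)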

torch/_decomp/decompositions.py

Lines changed: 2 additions & 6 deletions
@@ -1284,12 +1284,8 @@ def native_layer_norm_backward(
     if M <= 0 or N <= 0:
         return (
             input.new_zeros(input_shape) if output_mask[0] else None,
-            input.new_zeros(input_shape[axis:])
-            if output_mask[1] and weight_cast
-            else None,
-            input.new_zeros(input_shape[axis:])
-            if output_mask[2] and bias_cast
-            else None,
+            input.new_zeros(input_shape[axis:]) if output_mask[1] else None,
+            input.new_zeros(input_shape[axis:]) if output_mask[2] else None,
         )

     x_hat = (input_cast - mean) * rstd

torch/_subclasses/fake_tensor.py

Lines changed: 35 additions & 9 deletions
@@ -115,6 +115,20 @@ def get_schema_info(func):
     return torch._C._SchemaInfo(func._schema) # type: ignore[attr-defined]


+# many of the decompositions registered to torch/_prims do not at the moment model
+# aliasing or strides, so as an incremental step, just enable the decompositions in
+# torch/_decomp/decompositions.py.
+# decomps are used for aot autograd tracing so we would like to unify on their
+# implementation and add additional testing to them
+@functools.lru_cache(None)
+def torch_decomp_decompositions(func):
+    from torch._decomp import decomposition_table
+
+    decompositions = torch._decomp.decompositions
+    decomp_attrs = [getattr(decompositions, attr) for attr in dir(decompositions)]
+    return decomposition_table[func] in decomp_attrs
+
+
 def tree_flatten_only(ty: Type[T], pytree: PyTree):
     flat_vals, _ = tree_flatten(pytree)
     return [elem for elem in flat_vals if isinstance(elem, ty)]
@@ -302,7 +316,8 @@ def non_kwarg_to(fake_mode, func, *args, **kwargs):
     input_device = new_kwargs["device"]
     out_device = input_device if input_device else new_kwargs["input"].device
     new_kwargs["device"] = torch.device("meta")
-    r = func(*args, **new_kwargs)
+    inp = new_kwargs.pop("input")
+    r = func(inp, **new_kwargs)
     return fake_mode.fake_tensor_converter(fake_mode, r, out_device)


@@ -329,7 +344,7 @@ def to_copy(fake_mode, func, *args, **kwargs):

     input_device = new_kwargs.pop("device", None)
     out_device = input_device if input_device else new_kwargs["input"].device
-    with no_dispatch(), in_kernel_invocation_manager(fake_mode):
+    with in_kernel_invocation_manager(fake_mode):
         input = new_kwargs.pop("input").to("meta")
         return FakeTensor(fake_mode, aten._to_copy(input, **new_kwargs), out_device)

@@ -417,18 +432,19 @@ def nyi(fake_mode, func, *args, **kwargs):
 @contextlib.contextmanager
 def in_kernel_invocation_manager(fake_mode):
     # See: note [Fake Tensor Dispatch Keys]
+    prev_in_kernel = fake_mode.in_kernel_invocation
     meta_in_tls = torch._C._meta_in_tls_dispatch_include()
-    prev = fake_mode.in_kernel_invocation
+    assert meta_in_tls == prev_in_kernel, f"{meta_in_tls}, {prev_in_kernel}"

+    guard = torch._C._DisableTorchDispatch() # type: ignore[attr-defined]
     fake_mode.in_kernel_invocation = True
-    if not meta_in_tls:
-        torch._C._add_meta_to_tls_dispatch_include()
+    torch._C._set_meta_in_tls_dispatch_include(True)
     try:
         yield
     finally:
-        fake_mode.in_kernel_invocation = prev
-        if not meta_in_tls:
-            torch._C._remove_meta_from_tls_dispatch_include()
+        fake_mode.in_kernel_invocation = prev_in_kernel
+        torch._C._set_meta_in_tls_dispatch_include(prev_in_kernel)
+        del guard


 class FakeTensor(torch.Tensor):
@@ -728,14 +744,15 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
         # is written to must be invalidated
         self.invalidate_written_to_constants(func, flat_arg_fake_tensors, args, kwargs)

+        from torch._decomp import _disabled_meta_decomps, decomposition_table
+
         # IDK: feels bad man, sym_numel on as_strided infinite loops otherwise
         if (
             has_symbolic_sizes
             and func not in self.functions_with_cpp_meta_impl_that_support_symint
         ):
             # TODO: Find better approach for this
             # Avoid circular import
-            from torch._decomp import decomposition_table
             from torch._meta_registrations import meta_table

             with no_dispatch():
@@ -759,6 +776,15 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
                 if r is not NotImplemented:
                     return r

+        if (
+            func in decomposition_table
+            and torch_decomp_decompositions(func)
+            and func not in _disabled_meta_decomps
+            and all(not e.is_sparse for e in flat_arg_fake_tensors)
+        ):
+            with self:
+                return decomposition_table[func](*args, **kwargs)
+
         # prims already wrap FakeTensor inputs to FakeTensor outputs
         # and do device logic, we dont need do anything but run them
         # and ensure that Meta kernels are dispatched to (see)
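
For illustration, a hedged sketch of the filter that torch_decomp_decompositions applies, using an op from this commit's own diff as the example (the membership check is guarded in case its registration changes):

    import torch
    from torch._decomp import decomposition_table
    from torch._subclasses.fake_tensor import torch_decomp_decompositions

    # An overload qualifies for the new fallback only when its table entry is a
    # function defined in torch/_decomp/decompositions.py rather than a torch/_prims ref.
    op = torch.ops.aten.native_layer_norm_backward.default
    if op in decomposition_table:
        print(torch_decomp_decompositions(op))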

torch/csrc/Module.cpp

Lines changed: 8 additions & 12 deletions
@@ -1386,25 +1386,21 @@ Call this whenever a new thread is created in order to propagate values from
   py_module.def(
       "_has_storage", [](const at::Tensor& x) { return x.has_storage(); });

-  py_module.def("_add_meta_to_tls_dispatch_include", []() {
+  py_module.def("_set_meta_in_tls_dispatch_include", [](bool meta_in_tls) {
     auto local_keyset = c10::impl::tls_local_dispatch_key_set();
     c10::DispatchKeySet key_set({at::DispatchKey::Meta});
-    local_keyset.included_ = local_keyset.included_ | key_set;
-    c10::impl::_force_tls_local_dispatch_key_set(local_keyset);
-  });
-  py_module.def("_remove_meta_from_tls_dispatch_include", []() {
-    auto local_keyset = c10::impl::tls_local_dispatch_key_set();
-    c10::DispatchKeySet key_set({at::DispatchKey::Meta});
-    auto k = key_set.highestBackendKey();
-    local_keyset.included_ = local_keyset.included_.remove_backend(k);
+    if (meta_in_tls) {
+      local_keyset.included_ = local_keyset.included_ | key_set;
+    } else {
+      local_keyset.included_ =
+          local_keyset.included_.remove_backend(c10::BackendComponent::MetaBit);
+    }
     c10::impl::_force_tls_local_dispatch_key_set(local_keyset);
   });

   py_module.def("_meta_in_tls_dispatch_include", []() {
     auto local_keyset = c10::impl::tls_local_dispatch_key_set();
-    c10::DispatchKeySet key_set({at::DispatchKey::Meta});
-    auto k = key_set.highestBackendKey();
-    return local_keyset.included_.has_backend(k);
+    return local_keyset.included_.has_backend(c10::BackendComponent::MetaBit);
   });

   py_module.def("_dump_local_tls_set", []() {
