[inductor] add conv_transpose2d unary fusion for cpu in inference mode (#90265)

chunyuan-w · pytorchmergebot · commit d6fe9838d19a · 2022-12-15T14:22:04.000Z
This commit adds an FX transformation that fuses ConvTranspose2d with element-wise (eltwise) ops in torchinductor for CPU in inference mode, following the implementation in #87063. The fused op is implemented in #90264 and is treated as an extern kernel call in torchinductor.

Fusion of ConvTranspose2d with the following ops is supported:
- relu
- sigmoid
- tanh
- hardswish
- leaky_relu
- hardtanh
- gelu

Pull Request resolved: #90265
Approved by: https://github.com/jgong5, https://github.com/jansel
diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
@@ -1856,6 +1856,53 @@ def forward(self, x, y):
                 with torch.no_grad():
                     self.common(mod, (v, other), atol=2e-3, rtol=0.016)
 
+    @unittest.skipIf(HAS_CUDA, "only support cpu conv_transpose2d unary test")
+    def test_conv_transpose2d_unary(self):
+        """Check ConvTranspose2d followed by each module in ``unary_list``.
+
+        Sweeps bias on/off, kernel size, dilation, groups, padding and the
+        input memory format, and hands every resulting eval-mode model to
+        ``self.common`` under ``torch.no_grad()``.
+        """
+        memory_formats = [torch.contiguous_format, torch.channels_last]
+
+        for (
+            unary_fn,
+            bias,
+            kernel_size,
+            dilation,
+            groups,
+            padding,
+            memory_format,
+        ) in itertools.product(
+            unary_list,
+            [True, False],  # bias
+            [1, 3],  # kernel_size
+            [1, 2],  # dilation
+            [1, 4],  # groups
+            [0, 1],  # padding
+            memory_formats,
+        ):
+            # Channel counts scale with groups so they stay divisible by it.
+            in_channels = 3 * groups
+            out_channels = 32 * groups
+
+            conv_transpose = torch.nn.ConvTranspose2d(
+                in_channels,
+                out_channels,
+                kernel_size=kernel_size,
+                padding=padding,
+                dilation=dilation,
+                groups=groups,
+                bias=bias,
+            )
+            mod = torch.nn.Sequential(conv_transpose, unary_fn).eval()
+
+            v = torch.randn((1, in_channels, 28, 28), dtype=torch.float32).to(
+                memory_format=memory_format
+            )
+            with torch.no_grad():
+                self.common(mod, (v,))
+
     def test_gather1(self):
         def fn(a, b):
             return (
diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
@@ -3417,6 +3417,8 @@ def _prepare_convolution_fusion_create(
     stride_: List[int],
     dilation_: List[int],
     groups: int,
+    transposed: bool = False,
+    output_padding_: List[int] = None,
 ):
     """
     This function is a helper function to prepare inputs, layout and constant args
@@ -3429,6 +3431,8 @@ def _prepare_convolution_fusion_create(
     padding = tuple(padding_)
     dilation = tuple(dilation_)
     assert isinstance(groups, int)
+    output_padding = tuple(output_padding_) if output_padding_ else (0, 0)
+
     with V.graph.fake_mode:
         x_fake = ir_node_to_tensor(x, guard_shape=True)
         weight_fake = ir_node_to_tensor(weight, guard_shape=True)
@@ -3442,8 +3446,8 @@ def _prepare_convolution_fusion_create(
             stride,
             padding,
             dilation,
-            False,
-            [0, 0],
+            transposed,
+            output_padding,
             groups,
         )
         output_size = output.size()
@@ -3462,6 +3466,8 @@ def _prepare_convolution_fusion_create(
         output_stride,
     )
     constant_args = [padding, stride, dilation, groups]
+    if transposed:
+        constant_args.insert(1, output_padding)
 
     if bias is not None:
         inputs.append(bias)
@@ -3796,6 +3802,62 @@ def apply_constraint(self):
         pass
 
 
+class ConvolutionTransposeUnary(ExternKernelAlloc):
+    """Extern-kernel IR node for a transposed convolution fused with a unary op.
+
+    Code generation emits a single call to ``self.kernel`` whose arguments
+    come from ``self.codegen_args()``.
+    """
+
+    kernel = "torch.ops.mkldnn._convolution_transpose_pointwise"
+
+    def __init__(
+        self,
+        layout,
+        inputs,
+        constant_args=(),
+        kernel="torch.ops.mkldnn._convolution_transpose_pointwise",
+    ):
+        """Store the layout, inputs and constant args, and the kernel name to call."""
+        super().__init__(layout, inputs, constant_args)
+        self.kernel = kernel
+
+    def codegen(self, wrapper):
+        """Write ``<name> = <kernel>(<args>)`` into the wrapper code."""
+        call_args = ", ".join(self.codegen_args())
+        wrapper.writeline(f"{self.get_name()} = {self.kernel}({call_args})")
+
+    @classmethod
+    def create(
+        cls,
+        x: "TensorBox",
+        weight: "TensorBox",
+        bias: "TensorBox",
+        padding_: List[int],
+        output_padding_: List[int],
+        stride_: List[int],
+        dilation_: List[int],
+        groups_: int,
+        attr,
+        scalars,
+        algorithm,
+    ):
+        """Build the IR node for a fused transposed convolution.
+
+        Inputs, layout and the leading constant args are prepared by
+        ``_prepare_convolution_fusion_create`` with ``transposed=True``;
+        ``bias`` may be ``None``, in which case the helper does not add it
+        to the inputs. ``attr``, ``scalars`` and ``algorithm`` are appended
+        to the constant args in that order.
+
+        The node is built through ``cls`` and uses ``cls.kernel`` so that a
+        subclass calling ``create`` gets an instance of itself rather than
+        of this base class.
+        """
+        (
+            inputs,
+            constant_args,
+            kernel_layout,
+            _,
+        ) = _prepare_convolution_fusion_create(
+            cls,
+            x,
+            weight,
+            bias,
+            padding_,
+            stride_,
+            dilation_,
+            groups_,
+            transposed=True,
+            output_padding_=output_padding_,
+        )
+        return cls(
+            layout=kernel_layout,
+            inputs=inputs,
+            constant_args=constant_args + [attr, scalars, algorithm],
+            kernel=cls.kernel,
+        )
+
+
 @dataclasses.dataclass
 class MutableBox(IRNode):
     """
diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py
@@ -962,6 +962,36 @@ def linear_unary(
         def linear_binary(x: TensorBox, y: TensorBox, w: TensorBox, b: TensorBox, attr):
             return TensorBox.create(ir.LinearBinary.create(x, y, w, b, attr))
 
+        @register_lowering(torch.ops.mkldnn._convolution_transpose_pointwise)
+        def convolution_transpose_unary(
+            x: TensorBox,
+            weight: TensorBox,
+            bias: TensorBox,
+            padding,
+            output_padding,
+            stride,
+            dilation,
+            groups,
+            attr,
+            scalars,
+            algorithm,
+        ):
+            """Lower the fused transposed-convolution op to its IR node.
+
+            All arguments are forwarded unchanged, in the same order, to
+            ``ir.ConvolutionTransposeUnary.create``; the resulting node is
+            wrapped in a ``TensorBox``.
+            """
+            fused_node = ir.ConvolutionTransposeUnary.create(
+                x,
+                weight,
+                bias,
+                padding,
+                output_padding,
+                stride,
+                dilation,
+                groups,
+                attr,
+                scalars,
+                algorithm,
+            )
+            return TensorBox.create(fused_node)
+
         if torch._C.has_mkl:
 
             @register_lowering(torch.ops.mkl._mkl_linear)
diff --git a/torch/_inductor/overrides.py b/torch/_inductor/overrides.py
@@ -406,6 +406,69 @@ def forward(self, input, other):
         return y
 
 
+class ConvTransposeUnary2d(nn.ConvTranspose2d):
+    """ConvTranspose2d whose forward calls the fused mkldnn pointwise op.
+
+    The instance is built from an existing ``conv_transpose`` module and a
+    ``unary`` module; the unary module is turned into the ``attr``,
+    ``scalars`` and ``algorithm`` values that are passed to the fused op.
+    """
+
+    def __init__(
+        self,
+        conv_transpose: nn.Module,
+        unary: nn.Module,
+    ):
+        # Initialise the base class with the same hyper-parameters as the
+        # source module before its state is copied over.
+        super().__init__(
+            in_channels=conv_transpose.in_channels,
+            out_channels=conv_transpose.out_channels,
+            kernel_size=conv_transpose.kernel_size,
+            stride=conv_transpose.stride,
+            padding=conv_transpose.padding,
+            output_padding=conv_transpose.output_padding,
+            groups=conv_transpose.groups,
+            bias=conv_transpose.bias is not None,
+            dilation=conv_transpose.dilation,
+            padding_mode=conv_transpose.padding_mode,
+            device=conv_transpose.weight.device,
+            dtype=conv_transpose.weight.dtype,
+        )
+        self._update_module_params(conv_transpose, unary)
+
+    def _update_module_params(self, conv_transpose, unary):
+        """Copy the source module's state and record the fusion parameters."""
+        # Deep copy so this module does not share state with the original.
+        self.__dict__ = copy.deepcopy(conv_transpose.__dict__)
+        # The map is keyed by the unary module's class; the looked-up
+        # callable returns the (attr, scalars, algorithm) triple.
+        fusion_params = unary_modules_map[unary.__class__](unary)
+        self.attr, self.scalars, self.algorithm = fusion_params
+
+    def _conv_transpose_forward(self, input, weight, bias):
+        """Run the fused op on ``input`` with the given weight and bias."""
+        if self.padding_mode == "zeros":
+            conv_input = input
+            conv_padding = self.padding
+        else:
+            # Any other padding mode: pad the input explicitly and tell the
+            # op to apply no padding of its own.
+            conv_input = F.pad(
+                input, self._reversed_padding_repeated_twice, mode=self.padding_mode
+            )
+            conv_padding = _pair(0)
+
+        return torch.ops.mkldnn._convolution_transpose_pointwise(
+            conv_input,
+            weight,
+            bias,
+            conv_padding,
+            self.output_padding,
+            self.stride,
+            self.dilation,
+            self.groups,
+            self.attr,
+            self.scalars,
+            self.algorithm,
+        )
+
+    def forward(self, input):
+        """Apply the fused transposed convolution using this module's parameters."""
+        return self._conv_transpose_forward(input, self.weight, self.bias)
+
+
 def packed_conv_eval(conv: nn.Module, input_size: list):
     assert not (conv.training), "Fusion only for eval!"
     return ConvUnary2d(
@@ -481,6 +544,16 @@ def fused_linear_binary_eval(linear: nn.Module, attr: str, input_size: list):
     return linear_binary
 
 
+def fused_conv_transpose_unary_eval(
+    conv_transpose: nn.Module, unary: nn.Module, input_size: list
+):
+    """Return a ``ConvTransposeUnary2d`` fusing ``conv_transpose`` with ``unary``.
+
+    The source module must be in eval mode. ``input_size`` is accepted but
+    not used by this function.
+    """
+    assert not conv_transpose.training, "Fusion only for eval!"
+    return ConvTransposeUnary2d(conv_transpose=conv_transpose, unary=unary)
+
+
 def check_node_kind(current_node, modules, node_kind):
     if not isinstance(current_node, torch.fx.Node):
         return False
@@ -1262,6 +1335,7 @@ def rand_like(x, **kwargs):
     nn.Linear: fused_linear_unary_eval,
     ConvBinary2d: fused_conv_binary_unary_eval,
     ConvBinaryInplace2d: fused_conv_binary_unary_eval,
+    nn.ConvTranspose2d: fused_conv_transpose_unary_eval,
 }