pytorch · mcremon-meta · Jun 7, 2024
diff --git a/backends/cadence/aot/TARGETS b/backends/cadence/aot/TARGETS
@@ -29,6 +29,7 @@ python_library(
     ],
     deps = [
         ":passes",
+        ":utils",
         "//caffe2:torch",
         "//executorch/exir:lib",
     ],

diff --git a/backends/cadence/aot/compiler.py b/backends/cadence/aot/compiler.py
@@ -16,7 +16,9 @@
     ReplaceScalarTensorWithFullPass,
     ReplaceSqueezeAndUnsqueezeWithViewPass,
 )
+from executorch.backends.cadence.aot.utils import model_is_quantized
 from executorch.exir import EdgeCompileConfig, EdgeProgramManager, to_edge
+from torch.ao.quantization.pt2e.export_utils import model_is_exported
 
 from torch.export import export
 from torch.export.exported_program import ExportedProgram
@@ -29,14 +31,21 @@ def export_program(
 ) -> ExportedProgram:
     assert isinstance(model, torch.nn.Module), "model should be an nn.Module"
 
-    # If the model is already a GraphModule (most likely from quantization), call the
-    # suggested torch.ao.quantization API instead, which only does dropout and batchnorm.
-    if isinstance(model, torch.fx.GraphModule):
-        torch.ao.quantization.move_exported_model_to_eval(model)
-    else:
-        # We don't support training mode. Make it eval
+    # We don't support training mode. Make the model inference mode by
+    # calling model.eval() or an equivalent call for quantized models.
+    # GraphModules cannot call eval(), so we skip them.
+    if not isinstance(model, torch.fx.GraphModule):
         if hasattr(model, "eval"):
             model.eval()
+    else:
+        # If the model is quantized, call the suggested torch.ao.quantization API
+        # which only does dropout and batchnorm.
+        if model_is_quantized(model):
+            torch.ao.quantization.move_exported_model_to_eval(model)
+        else:
+            # If we get a GraphModule which is _not_ quantized, then it should already
+            # have been exported.
+            assert model_is_exported(model), "model should be from an ExportedProgram"
 
     # Prevent mkldnn decompositions
     torch._C._set_mkldnn_enabled(False)

diff --git a/backends/cadence/aot/utils.py b/backends/cadence/aot/utils.py
@@ -14,6 +14,27 @@
 from executorch.exir.dialects.edge._ops import EdgeOpOverload, EdgeOpOverloadPacket
 from tabulate import tabulate
 
+quant_ops = {  # call_function targets that model_is_quantized searches the graph for
+    torch.ops.quantized_decomposed.quantize_per_tensor.default,
+    torch.ops.quantized_decomposed.dequantize_per_tensor.default,
+    torch.ops.quantized_decomposed.quantize_per_channel.default,
+    torch.ops.quantized_decomposed.dequantize_per_channel.default,
+}
+
+
+# Report whether the model is quantized, i.e. its graph holds at least one quant/dequant op
+def model_is_quantized(model: torch.nn.Module) -> bool:
+    # Only GraphModules (what the prepare/convert calls produce) can be quantized,
+    # and only they expose a graph that can be searched.
+    if isinstance(model, torch.fx.GraphModule):
+        # A single call_function node targeting any quant/dequant op is enough.
+        return any(
+            model.graph.find_nodes(op="call_function", target=quant_op)
+            for quant_op in quant_ops
+        )
+    # Anything that is not a GraphModule is reported as not quantized.
+    return False
+
 
 # Get the output size of a 1D convolution given the input size and parameters
 def get_conv1d_output_size(