Make the quantized path the main testing path, and introduce a nop quantizer for fp32 cases

mcremon-meta · facebook-github-bot · commit b72f4b57a1a0 · 2025-01-23T14:57:49.000-08:00
Summary:
For a while, the testing APIs were `quantize_and_run` and `run_and_verify`, with the former calling the latter. That flow is inconvenient because the quantized and fp32 cases are not handled consistently, and the two names are inconsistent with each other.
This diff changes the two main APIs to become `export_run_and_verify` and `quantize_export_run_and_verify` to be more descriptive.
It also changes the calling order; we now use a nop quantizer for the fp32 case, allowing us to use the exact same flow as the quantized cases.
The existing `run_and_verify` function is made "private" (as far as Python goes, at least) and now takes an `ExportedProgram` instead of a `torch.nn.Module` as before.
Finally, it removes the `eval()` handling from `export_program`, since everything should now go through the quantizer (the fp32 case goes through the nop quantizer).

Reviewed By: zonglinpeng, hsharma35

Differential Revision: D67561806
diff --git a/backends/cadence/aot/compiler.py b/backends/cadence/aot/compiler.py
@@ -24,7 +24,6 @@
 from executorch.backends.cadence.aot.utils import (
     get_default_memory_config,
     MemoryConfig,
-    model_is_quantized,
 )
 from executorch.devtools import generate_etrecord
 from executorch.exir import (
@@ -38,7 +37,6 @@
 from executorch.exir.passes import ToOutVarPass
 from executorch.exir.passes.sym_shape_eval_pass import HintBasedSymShapeEvalPass
 from torch._inductor.decomposition import remove_decompositions
-from torch.ao.quantization.pt2e.export_utils import model_is_exported
 from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e
 
 from torch.export import export
@@ -158,26 +156,10 @@ def export_program(
 ) -> ExportedProgram:
     assert isinstance(model, torch.nn.Module), "model should be an nn.Module"
 
-    # We don't support training mode. Make the model inference mode by
-    # calling model.eval() or an equivalent call for quantized models.
-    # GraphModules cannot call eval(), so we skip them.
-    if not isinstance(model, torch.fx.GraphModule):
-        if hasattr(model, "eval"):
-            model.eval()
-    else:
-        # If the model is quantized, call the suggested torch.ao.quantization API
-        # which only does dropout and batchnorm.
-        if model_is_quantized(model):
-            torch.ao.quantization.move_exported_model_to_eval(model)
-        else:
-            # If we get a GraphModule which is _not_ quantized, then it should already
-            # have been exported.
-            assert model_is_exported(model), "model should be from an ExportedProgram"
-
     # Prevent mkldnn decompositions
     torch._C._set_mkldnn_enabled(False)
 
-    # else: capture the model and return it.
+    # Export the model and return it.
     expo_program = export(model, inputs, strict=True)
 
     if dump_graphs:
@@ -206,8 +188,8 @@ def export_to_edge(
             _skip_dim_order=True,
             # Allow specific non-core aten ops in the IR.
             _core_aten_ops_exception_list=[
+                torch.ops.aten._native_batch_norm_legit_functional.default,
                 torch.ops.aten.linear.default,
-                torch.ops.aten.native_batch_norm.default,
                 torch.ops.aten.linalg_vector_norm.default,
                 torch.ops.aten.unfold.default,
                 torch.ops.aten.angle.default,
@@ -226,10 +208,9 @@ def export_to_cadence(
     model: torch.nn.Module,
     inputs: tuple[object, ...],
     dump_graphs: bool = False,
-    output_dir: Optional[str] = None,
     opt_level: int = 1,
 ) -> EdgeProgramManager:
-    edge_prog_manager = export_to_edge(model, inputs)
+    edge_prog_manager = export_to_edge(model, inputs, dump_graphs=dump_graphs)
     cadence_passes = get_cadence_passes(opt_level)
 
     # Run a couple required passes for quant/dequant ops
diff --git a/backends/cadence/aot/quantizer/quantizer.py b/backends/cadence/aot/quantizer/quantizer.py
@@ -183,3 +183,15 @@ def __init__(self, qconfig: Optional[QuantizationConfig] = None) -> None:
             qconfig = _default_qconfig
         quantizers = get_cadence_default_quantizer_list_with_config(qconfig)
         super().__init__(quantizers)
+
+
+# Nop quantizer, used to run fp32 cases
+# Passes an empty list of quantizers (no quantization). Note
+# that we do not strictly need that class since we could call
+# CadenceQuantizer([]), but this is more explicit and
+# does not require knowledge of the internals of the base class.
+class CadenceNopQuantizer(CadenceQuantizer):
+    def __init__(
+        self,
+    ) -> None:
+        super().__init__([])
diff --git a/backends/cadence/aot/utils.py b/backends/cadence/aot/utils.py
@@ -20,24 +20,9 @@
 from executorch.exir.dialects.edge._ops import EdgeOpOverload, EdgeOpOverloadPacket
 from tabulate import tabulate
 
-from torch.ao.quantization.quantize_pt2e import _QUANT_OPS as quant_ops
 from torch.utils._pytree import tree_flatten
 
 
-# Check if the model is quantized, by looking at the graph and finding quant/dequant ops
-def model_is_quantized(model: torch.nn.Module) -> bool:
-    # Quantized models have to be GraphModules already, from prepare/convert calls.
-    # Return false if the model is not a GraphModule.
-    if not isinstance(model, torch.fx.GraphModule):
-        return False
-
-    # Walk through the graph and look for quant/dequant ops
-    for op in quant_ops:
-        if model.graph.find_nodes(op="call_function", target=op):
-            return True
-    return False
-
-
 # Get the output size of a 1D convolution given the input size and parameters
 def get_conv1d_output_size(
     in_size: torch.Size,