Add a verbose parameter to output the delegation summary in Qualcomm llama export (#8174)

limintang · facebook-github-bot · commit c9513905951e · 2025-02-04T16:36:23.000-08:00
Summary:

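Print the delegation summary and the per-operator delegation table when the `-v`/`--verbose` flag is set in the Qualcomm llama script. To support this, `print_delegation_info` moves out of `_export_llama` into `executorch.devtools.backend_debug`, where it is exported alongside `get_delegation_info` so both export paths can share it. Also reorders the `SmartMaskIoMgr` member initializers and passes `logit_index` instead of a literal `0` when fetching the prefill logits metadata.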

Differential Revision: D68991594
diff --git a/devtools/backend_debug/__init__.py b/devtools/backend_debug/__init__.py
@@ -7,6 +7,7 @@
 from executorch.devtools.backend_debug.delegation_info import (
     DelegationBreakdown,
     get_delegation_info,
+    print_delegation_info,
 )
 
-__all__ = ["DelegationBreakdown", "get_delegation_info"]
+__all__ = ["DelegationBreakdown", "get_delegation_info", "print_delegation_info"]
diff --git a/devtools/backend_debug/delegation_info.py b/devtools/backend_debug/delegation_info.py
@@ -7,6 +7,7 @@
 import re
 from collections import defaultdict
 from dataclasses import asdict, dataclass
+from tabulate import tabulate
 from typing import Dict
 
 import pandas as pd
@@ -174,3 +175,10 @@ def _insert_op_occurrences_dict(node_name: str, delegated: bool) -> None:
         num_delegated_subgraphs=delegated_subgraph_counter,
         delegation_by_operator=op_occurrences_dict,
     )
+
+
+def print_delegation_info(graph_module: torch.fx.GraphModule):
+    delegation_info = get_delegation_info(graph_module)
+    print(delegation_info.get_summary())
+    df = delegation_info.get_operator_delegation_dataframe()
+    print(tabulate(df, headers="keys", tablefmt="fancy_grid"))
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
@@ -23,7 +23,7 @@
 import torch
 
 from executorch.backends.vulkan._passes.remove_asserts import remove_asserts
-from executorch.devtools.backend_debug import get_delegation_info
+from executorch.devtools.backend_debug import print_delegation_info
 
 from executorch.devtools.etrecord import generate_etrecord
 from executorch.exir.passes.init_mutable_pass import InitializedMutableBufferPass
@@ -46,7 +46,6 @@
     get_vulkan_quantizer,
 )
 from executorch.util.activation_memory_profiler import generate_memory_trace
-from tabulate import tabulate
 
 from ..model_factory import EagerModelFactory
 from .source_transformation.apply_spin_quant_r1_r2 import (
@@ -801,12 +800,6 @@ def _export_llama(args) -> LLMEdgeManager:  # noqa: C901
     for partitioner in partitioners:
         logging.info(f"--> {partitioner.__class__.__name__}")
 
-    def print_delegation_info(graph_module: torch.fx.GraphModule):
-        delegation_info = get_delegation_info(graph_module)
-        print(delegation_info.get_summary())
-        df = delegation_info.get_operator_delegation_dataframe()
-        print(tabulate(df, headers="keys", tablefmt="fancy_grid"))
-
     additional_passes = []
     if args.model in TORCHTUNE_DEFINED_MODELS:
         additional_passes = [InitializedMutableBufferPass(["kv_cache_pos"])]
diff --git a/examples/qualcomm/oss_scripts/llama/llama.py b/examples/qualcomm/oss_scripts/llama/llama.py
@@ -51,6 +51,8 @@
     get_soc_to_chipset_map,
     update_spill_fill_size,
 )
+
+from executorch.devtools.backend_debug import print_delegation_info
 from executorch.examples.models.llama.source_transformation.quantize import (
     get_quant_embedding_transform,
 )
@@ -389,6 +391,7 @@ def lowering_modules(
         num_sharding=1,
         passes_job=OrderedDict(),
         shared_buffer=False,
+        verbose=False,
     ):
         executorch_config = ExecutorchBackendConfig(
             # For shared buffer, user must pass the memory address
@@ -440,6 +443,10 @@ def lowering_modules(
             edge_prog_mgr = edge_prog_mgr.to_backend(partitioner)
             if num_sharding > 1:
                 update_spill_fill_size(edge_prog_mgr.exported_program())
+
+            if verbose:
+                print_delegation_info(edge_prog_mgr.exported_program().graph_module)
+
             exec_prog_mgr = edge_prog_mgr.to_executorch(config=executorch_config)
             with open(f"{work_space}/{self.pte_filename}.pte", "wb") as file:
                 exec_prog_mgr.write_to_file(file)
@@ -667,6 +674,10 @@ def compile(args, pte_filename, tokenizer):
             )
             compiler_specs[0][0].value = option_to_flatbuffer(qnn_executorch_options)
 
+        if args.verbose:
+            for exported_program in exported_programs:
+                print_delegation_info(exported_program.graph_module)
+
         executorch_config = ExecutorchBackendConfig(
             # For shared buffer, user must pass the memory address
             # which is allocated by RPC memory to executor runner.
@@ -980,6 +991,8 @@ def _build_parser():
         help="Fallback to cpu embedding operator and type of embedding quantization, '<bitwidth>,<groupsize>', e.g., '4,32'.",
     )
 
+    parser.add_argument("-v", "--verbose", action="store_true")
+
     return parser
 
 
diff --git a/examples/qualcomm/oss_scripts/llama/runner/io_manager.cpp b/examples/qualcomm/oss_scripts/llama/runner/io_manager.cpp
@@ -557,8 +557,8 @@ SmartMaskIoMgr::SmartMaskIoMgr(
     const bool use_int64_token)
     : IoMgrBase(modules),
       shard_layers_({num_layers}),
-      prefill_cache_len_(prefill_cache_len),
       kv_cache_len_(kv_cache_len),
+      prefill_cache_len_(prefill_cache_len),
       vocab_size_(vocab_size),
       num_layers_(num_layers),
       head_dim_(head_dim),
@@ -1002,7 +1002,7 @@ void SmartMaskIoMgr::prepare_prefill_io(
 
   // [O]: logits
   int logit_index = 0;
-  Result<TensorInfo> logits = methods_meta[0]->output_tensor_meta(0);
+  Result<TensorInfo> logits = methods_meta[0]->output_tensor_meta(logit_index);
   prefill_logits_ = std::make_unique<TensorImpl>(
       logits->scalar_type(),
       logits->sizes().size(),

Original file line number	Diff line number	Diff line change
`@@ -7,6 +7,7 @@`
`7`	`7`	`from executorch.devtools.backend_debug.delegation_info import (`
`8`	`8`	`DelegationBreakdown,`
`9`	`9`	`get_delegation_info,`
	`10`	`+ print_delegation_info,`
`10`	`11`	`)`
`11`	`12`
`12`		`-__all__ = ["DelegationBreakdown", "get_delegation_info"]`
	`13`	`+__all__ = ["DelegationBreakdown", "get_delegation_info", "print_delegation_info"]`