Commit 82948d4

Author: Chun-I Tsai

Add suffix to multi graph adaptor
- Seems that we need the information about backend delegate/chain/instructions

1 parent a5c7609 commit 82948d4

6 files changed: +52 −32 lines changed


backends/qualcomm/aot/python/PyQnnManagerAdaptor.h

Lines changed: 3 additions & 1 deletion

@@ -63,6 +63,7 @@ class PyQnnManager {
     std::vector<uint8_t> tensor_data;
     std::vector<uint8_t*> tensor_ptr;
     std::vector<uint64_t> tensor_size;
+    std::unordered_map<std::string, int> partition_num;
     uint64_t total_tensor_size = 0;
     for (size_t i = 0; i < qcirs.size(); ++i) {
       py::buffer_info info(py::buffer(qcirs[i].cast<py::bytes>()).request());
@@ -147,7 +148,8 @@ class PyQnnManager {
             &params));
       }
       graphs.emplace_back(qcir::CreateGraphDirect(
-          builder_, graph->name()->str().c_str(), &nodes, &tensors));
+          builder_, (graph->name()->str() + "_" + std::to_string(partition_num[graph->name()->str()])).c_str(), &nodes, &tensors));
+      partition_num[graph->name()->str()] = partition_num[graph->name()->str()] + 1;
     }
   }
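Note on the naming scheme: partition_num keeps a per-graph counter, so when the same graph name shows up in several QCIR partitions, each copy is serialized as "<name>_<index>". A minimal Python sketch of that idea (the helper name and example graph names are illustrative, not part of the commit):

from collections import defaultdict

def suffix_graph_names(graph_names):
    # Append a running per-name index: ["forward", "forward"] -> ["forward_0", "forward_1"].
    partition_num = defaultdict(int)
    suffixed = []
    for name in graph_names:
        suffixed.append(f"{name}_{partition_num[name]}")
        partition_num[name] += 1
    return suffixed

# Hypothetical usage: repeated shards of the same graph receive unique names.
assert suffix_graph_names(["forward", "forward"]) == ["forward_0", "forward_1"]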

backends/qualcomm/qnn_preprocess.py

Lines changed: 4 additions & 0 deletions

@@ -20,6 +20,10 @@
 from executorch.backends.qualcomm.builders.node_visitor import get_node_visitors
 from executorch.backends.qualcomm.builders.qnn_constants import OpContextLoader
 from executorch.backends.qualcomm.partition.utils import generate_qnn_executorch_option
+from executorch.backends.qualcomm.serialization.qc_schema_serialize import (
+    flatbuffer_to_option,
+    option_to_flatbuffer,
+)
 from executorch.exir.backend.backend_details import (
     BackendDetails,
     CompileSpec,

backends/qualcomm/utils/utils.py

Lines changed: 13 additions & 7 deletions

@@ -818,6 +818,7 @@ def generate_multi_graph_program(
         executorch_in_order,
         executorch_out_order,
     ) = ({}, {}, {}, {}, {})
+    # graph name will be suffixed with _{num}
     qnn_mgr = PyQnnManagerAdaptor.QnnManager(
         generate_qnn_executorch_option(compiler_specs), processed_bytes
     )
@@ -831,15 +832,16 @@ def generate_multi_graph_program(

     # We need to obtain the order of the IOs to correctly map QNN with nn.module
     for graph_name in graph_names:
+        ori_graph_name, cur_idx = "_".join(graph_name.split("_")[:-1]), int(graph_name.split("_")[-1])
         if input_nodes_dict:
             # input
-            input_names = [node.name for node in input_nodes_dict[graph_name]]
+            input_names = [node.name for node in input_nodes_dict[ori_graph_name][cur_idx]]
             qnn_input_names = [
                 wrapper.GetName() for wrapper in graph_inputs[graph_name]
             ]
             # The input of intermideate module including call_function node
             # could not be reorder by node name
-            if len(input_names) == len(qnn_input_names):
+            if len(input_names) == len(qnn_input_names) and cur_idx == 0:
                 input_order_list = []
                 for input_name in input_names:
                     # e.g., input_0_tokens_0
@@ -868,7 +870,7 @@ def generate_multi_graph_program(
     bundle_progs = [
         from_context_binary(
             ctx_path=binary_info,
-            op_name=f"loader_{graph_name}_{int(time.time())}",
+            op_name=graph_name,
             soc_model=compiler_options.soc_info.soc_model,
             custom_info={
                 "graph_inputs": graph_inputs[graph_name],
@@ -877,10 +879,10 @@ def generate_multi_graph_program(
                 "qnn_in_order": qnn_in_order.get(graph_name, None),
                 "executorch_in_order": executorch_in_order.get(graph_name, None),
                 "executorch_out_order": executorch_out_order.get(graph_name, None),
-                },
-            )
+            },
+        )
         for graph_name in graph_names
-        ]
+    ]
     # leverage ExecutorchProgramManager for generating pte with multi-methods
     edge_prog_mgr = to_edge(
         {
@@ -898,11 +900,15 @@ def generate_multi_graph_program(
                 n.meta[OpContextLoader.meta_ctx_bin] = binary_info
                 break

+    opt = flatbuffer_to_option(compiler_specs[0].value)
+    opt.graph_name = "multi_graph"
+    new_opt = option_to_flatbuffer(opt)
+    compiler_specs[0].value = new_opt
     edge_prog_mgr = edge_prog_mgr.to_backend(QnnPartitioner(compiler_specs))
     exec_prog = edge_prog_mgr.to_executorch(
         config=backend_config or ExecutorchBackendConfig()
     )
-    return exec_prog, bundle_progs
+    return exec_prog, bundle_progs, graph_names


 def generate_composite_llama_program(
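Aside on the suffix round trip: generate_multi_graph_program splits each suffixed name on its last underscore to recover the original graph name and the partition index, which is why the "_{num}" suffix must always be the final component. A small self-contained sketch of that split (example names are made up for illustration):

def split_suffixed_name(graph_name):
    # "prefill_forward_1" -> ("prefill_forward", 1); only the trailing "_<num>" is treated as the suffix.
    ori_graph_name = "_".join(graph_name.split("_")[:-1])
    cur_idx = int(graph_name.split("_")[-1])
    return ori_graph_name, cur_idx

# Hypothetical examples mirroring the lookup input_nodes_dict[ori_graph_name][cur_idx]:
assert split_suffixed_name("prefill_forward_0") == ("prefill_forward", 0)
assert split_suffixed_name("kv_forward_3") == ("kv_forward", 3)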

examples/qualcomm/oss_scripts/llama/llama.py

Lines changed: 26 additions & 22 deletions

@@ -14,6 +14,7 @@
 import os
 import sys
 import time
+from collections import defaultdict
 from functools import partial
 from multiprocessing.connection import Client
@@ -626,7 +627,7 @@ def compile(args, pte_filename, tokenizer):
     call_delegate_inputs_dict = {name: [] for name in graph_names}
     call_delegate_node_name_dict = {name: [] for name in graph_names}
     outputs_dict = {name: [] for name in graph_names}
-    input_nodes_dict = {name: [] for name in graph_names}
+    input_nodes_dict = defaultdict(list)
     for prog, graph_name in zip(exported_programs, graph_names):
         for node in prog.graph_module.graph.nodes:
             if (
@@ -654,8 +655,11 @@ def compile(args, pte_filename, tokenizer):

     if args.num_sharding > 0:
         bundle_progs_list = []
+        processed_bytes = []
+        call_delegate_node = []
+
         for num in range(args.num_sharding - 1, -1, -1):
-            processed_bytes = []
+            cur_inputs = []
             for prog, graph_name in zip(exported_programs, graph_names):
                 processed_bytes.append(
                     getattr(
@@ -669,28 +673,28 @@ def compile(args, pte_filename, tokenizer):
                     if node.op == "get_attr"
                     and node.name == f"lowered_module_{num}"
                 ]
-                input_nodes_dict[graph_name] = [
-                    node
-                    for node in call_delegate_node[0].args
-                    if node.op == "placeholder"
+                cur_inputs = [
+                    node for node in call_delegate_node[0].args if node.op == "placeholder"
                 ]
+                input_nodes_dict[graph_name].append(cur_inputs)
+        prog_mgr, bundle_progs, partitioned_graph_names = generate_multi_graph_program(
+            compiler_specs=compiler_specs[0],
+            processed_bytes=processed_bytes,
+            input_nodes_dict=input_nodes_dict,
+            backend_config=executorch_config,
+            constant_methods=llama_instance_list[
+                1
+            ].llama_meta,  # kv method meta
+        )

-            prog_mgr, bundle_progs = generate_multi_graph_program(
-                compiler_specs=compiler_specs[0],
-                processed_bytes=processed_bytes,
-                input_nodes_dict=input_nodes_dict,
-                backend_config=executorch_config,
-                constant_methods=llama_instance_list[
-                    1
-                ].llama_meta,  # kv method meta
-            )
-            bundle_progs_list.append(bundle_progs)
-            for graph_name in graph_names:
-                lower_module_dict[graph_name].append(
-                    prog_mgr.exported_program(graph_name).graph_module._modules.get(
-                        "lowered_module_0"
-                    )
+        bundle_progs_list.append(bundle_progs)
+        for graph_name in partitioned_graph_names:
+            ori_graph_name, cur_idx = "_".join(graph_name.split("_")[:-1]), int(graph_name.split("_")[-1])
+            lower_module_dict[ori_graph_name].append(
+                prog_mgr.exported_program(f"{graph_name}").graph_module._modules.get(
+                    "lowered_module_0"
                )
+            )

         exec_prog = generate_composite_llama_program(
             graph_names=graph_names,
@@ -723,7 +727,7 @@ def compile(args, pte_filename, tokenizer):
             if node.op == "output"
         ]

-        prog_mgr, _ = generate_multi_graph_program(
+        prog_mgr, _, _ = generate_multi_graph_program(
             compiler_specs=compiler_specs[0],
             processed_bytes=processed_bytes,
             input_nodes_dict=input_nodes_dict,
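For clarity, here is a self-contained sketch of the bookkeeping this change sets up under sharding: input_nodes_dict becomes a defaultdict(list) keyed by the original graph name, and one list of placeholder inputs is appended per shard, so the partition index recovered from the "_{num}" suffix can select the matching entry later. Shard counts, graph names, and the placeholder strings below are invented for illustration only:

from collections import defaultdict

num_sharding = 2
graph_names = ["prefill_forward", "kv_forward"]  # illustrative names

input_nodes_dict = defaultdict(list)
for num in range(num_sharding - 1, -1, -1):
    for graph_name in graph_names:
        # In the real script these are the placeholder nodes feeding lowered_module_{num}.
        cur_inputs = [f"{graph_name}_shard{num}_input"]
        input_nodes_dict[graph_name].append(cur_inputs)

# Each original graph name now holds one entry per shard;
# input_nodes_dict[name][cur_idx] selects the inputs recorded for partition index cur_idx.
assert len(input_nodes_dict["prefill_forward"]) == num_sharding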

examples/qualcomm/oss_scripts/llama/model/static_llama.py

Lines changed: 2 additions & 2 deletions

@@ -13,9 +13,9 @@
 import torch.nn as nn
 import torch.nn.functional as F
 from executorch.examples.models.llama.llama_transformer import (
-    ModelArgs,
-    precompute_freqs_cis,
+    ModelArgs
 )
+from executorch.examples.models.llama.rope import precompute_freqs_cis


 def apply_rotary_emb_single(

runtime/executor/method.cpp

Lines changed: 4 additions & 0 deletions

@@ -1179,6 +1179,10 @@ Error Method::execute_instruction() {
       }
     } break;
     case executorch_flatbuffer::InstructionArguments::DelegateCall: {
+      ET_LOG(Info, "CHECK n_delegate_: %zu", n_delegate_);
+      ET_LOG(Info, "CHECK n_chains_: %zu", n_chains_);
+      ET_LOG(Info, "CHECK num instructions of cur_chain: %zu", instructions->size());
+
       EXECUTORCH_SCOPE_PROF("DELEGATE_CALL");
       internal::EventTracerProfileOpScope event_tracer_op_scope =
           internal::EventTracerProfileOpScope(event_tracer_, "DELEGATE_CALL");
