Commit 911094e

shewu-quic authored and facebook-github-bot committed
Qualcomm AI Engine Direct - enable per-tensor dump mechanism (#1294)
Summary:
- Add a "tensor_dump_output_path" option to the compiler spec.
- Add an "output_" prefix to output tensors in the AOT phase. At runtime, output tensors are filled in based on their order in the context cache.

If tensor_dump_output_path is given, the delegate writes the output of each op there at runtime. Setting this option is not recommended in any case; it exists only for debugging accuracy issues.

Pull Request resolved: #1294

Reviewed By: kirklandsign

Differential Revision: D53947214

Pulled By: cccclai

fbshipit-source-id: 64cb2a0e998da87c66cc16249a54264ae3fcf046
1 parent 65f9701 commit 911094e

17 files changed: +186 -13 lines
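
For context, the new option surfaces as a field on the Python-side QnnExecuTorchOptions dataclass (see serialization/qnn_compile_spec_schema.py below). A minimal sketch of enabling it, assuming the module path implied by the file location, that the remaining fields have usable defaults, and that the options object is serialized into the compile spec by the backend's existing flow (not shown in this diff):

from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import (
    QnnExecuTorchOptions,
)

# Debug-only knob added by this commit: when non-empty, the delegate dumps
# every op's output under "<path>/Result/" at execution time.
options = QnnExecuTorchOptions()
options.tensor_dump_output_path = "/data/local/tmp/qnn_debug"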

backends/qualcomm/CMakeLists.txt

Lines changed: 6 additions & 0 deletions
@@ -139,6 +139,7 @@ add_library(qnn_backend STATIC)
 add_library(qnn_factory STATIC)
 add_library(qnn_header INTERFACE)
 add_library(wrappers STATIC)
+add_library(utils STATIC)
 
 #
 # declare dependency
@@ -228,6 +229,7 @@ target_link_libraries(qnn_manager
   qnn_factory
   wrappers
   qnn_schema
+  utils
 )
 target_link_libraries(qnn_executorch_backend
   PRIVATE
@@ -237,6 +239,10 @@ target_link_libraries(qnn_executorch_backend
   executorch
   qcir_utils
 )
+target_link_libraries(utils
+  PRIVATE
+  qnn_executorch_logging
+)
 
 #
 # add linker option

backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp

Lines changed: 2 additions & 1 deletion
@@ -30,7 +30,8 @@ PYBIND11_MODULE(PyQnnManagerAdaptor, m) {
       .def("IsNodeSupportedByBackend", &PyQnnManager::IsNodeSupportedByBackend)
       .def("Compile", &PyQnnManager::Compile)
       .def("Destroy", &PyQnnManager::Destroy)
-      .def("IsAvailable", &PyQnnManager::IsAvailable);
+      .def("IsAvailable", &PyQnnManager::IsAvailable)
+      .def("IsTensorDump", &PyQnnManager::IsTensorDump);
 }
 } // namespace qnn
 } // namespace executor

backends/qualcomm/aot/python/PyQnnManagerAdaptor.h

Lines changed: 4 additions & 0 deletions
@@ -137,6 +137,10 @@ class PyQnnManager {
     return qnn_manager_->IsAvailable();
   }
 
+  bool IsTensorDump() {
+    return qnn_manager_->IsTensorDump();
+  }
+
  private:
   // Store the bytes object instead of a raw pointer so that this module will
   // keep the bytes alive.

backends/qualcomm/aot/wrappers/TensorWrapper.cpp

Lines changed: 13 additions & 0 deletions
@@ -121,6 +121,19 @@ Error TensorWrapper::FillDataBuffer(const void* data, bool copy_data) {
   return Error::Ok;
 }
 
+Error TensorWrapper::AllocateDataBuffer() {
+  char* static_data_buffer = new (std::nothrow) char[bytes_]; // NOLINT
+  if (static_data_buffer == nullptr) {
+    return Error::Internal;
+  }
+  owned_data_ = std::unique_ptr<char[]>(static_data_buffer);
+  QNN_VER_PTR(tensor_)->memType = QNN_TENSORMEMTYPE_RAW;
+  QNN_VER_PTR(tensor_)->clientBuf.dataSize = bytes_;
+  QNN_VER_PTR(tensor_)->clientBuf.data = owned_data_.get();
+
+  return Error::Ok;
+}
+
 void TensorWrapper::UpdateQnnTensorMeta(const Qnn_Tensor_t& tensor_src) {
   QNN_VER_PTR(tensor_)->id = QNN_VER_PTR(tensor_src)->id;
 }

backends/qualcomm/aot/wrappers/TensorWrapper.h

Lines changed: 2 additions & 0 deletions
@@ -35,6 +35,8 @@ class TensorWrapper {
 
   Error FillDataBuffer(const void* data, bool copy_data = false);
 
+  Error AllocateDataBuffer();
+
   // update qnn tensor meta
   // this function is used to recover metadata from QNN context binary.
   void UpdateQnnTensorMeta(const Qnn_Tensor_t& tensor_src);

backends/qualcomm/builders/node_visitor.py

Lines changed: 17 additions & 4 deletions
@@ -57,10 +57,14 @@ class NodeVisitor:
     """
 
     def __init__(
-        self, external_ids, edge_program: torch.export.ExportedProgram
+        self,
+        external_ids,
+        edge_program: torch.export.ExportedProgram,
+        enable_tensor_dump,
     ) -> None:
         self.external_ids = external_ids or {}
         self.edge_program = edge_program
+        self.enable_tensor_dump = enable_tensor_dump
 
     def get_tensor(self, input_node, op_node, idx=None):
         """
@@ -176,6 +180,9 @@ def get_tensor_type(
         if is_parameter(node, self.edge_program):
            return PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC
 
+        # dump all tensors, setting them to app-read
+        if self.enable_tensor_dump:
+            return PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_APP_READ
         return tensor_type
 
     def get_data_type(
@@ -250,13 +257,16 @@ def define_value(
 
         if node_name in nodes_to_wrappers:
             return nodes_to_wrappers[node_name]
+        tensor_name = node.name
+        if is_graph_output(node):
+            tensor_name = "output_" + tensor_name
         dims = [1] if len(tensor.size()) == 0 else tensor.size()
         tensor_type = self.get_tensor_type(node, tensor_type)
         quant_encoding, quant_configs = self.get_quant_encoding_conf(node)
         dtype = self.get_data_type(tensor, quant_configs, is_tensor)
         if isinstance(tensor, torch._subclasses.fake_tensor.FakeTensor):
             tensor_wrapper = PyQnnWrapper.TensorWrapper(
-                node_name,
+                tensor_name,
                 tensor_type,
                 dtype,
                 quant_encoding,
@@ -270,7 +280,7 @@ def define_value(
             if quant_configs:
                 tensor = self.get_quant_tensor_value(node, tensor, dtype)
             tensor_wrapper = PyQnnWrapper.TensorWrapper(
-                node_name,
+                tensor_name,
                 tensor_type,
                 dtype,
                 quant_encoding,
@@ -372,6 +382,7 @@ def generate_node_to_external_map(
 
 def get_node_visitors(
     edge_program: torch.export.ExportedProgram,
+    enable_tensor_dump=False,
 ) -> Dict[str, NodeVisitor]:
     """Create a new class instance at runtime, and put them in a dict"""
     node_to_external_map = generate_node_to_external_map(edge_program)
@@ -380,5 +391,7 @@ def get_node_visitors(
         assert callable(
             visitor
         ), f"Expecting a callable class, but got {visitor} of type {type(visitor)}"
-        node_visitors[target] = visitor(node_to_external_map, edge_program)
+        node_visitors[target] = visitor(
+            node_to_external_map, edge_program, enable_tensor_dump
+        )
     return node_visitors
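
A note on the mechanism above: QNN_TENSOR_TYPE_APP_READ is QNN's tensor type for data the backend writes and the application reads, so forcing non-parameter tensors to that type is what makes the backend materialize every op's output; the "output_" prefix then lets the runtime distinguish real graph outputs from these dump-only intermediates.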

backends/qualcomm/qnn_preprocess.py

Lines changed: 4 additions & 1 deletion
@@ -53,8 +53,11 @@ def preprocess(
         pass_result = qnn_compiler_passes(edge_program.graph_module)
         assert pass_result is not None
 
+        enable_tensor_dump = qnn_manager.IsTensorDump()
         nodes_to_wrappers = {}
-        node_visitors = get_node_visitors(edge_program)
+        node_visitors = get_node_visitors(
+            edge_program, enable_tensor_dump=enable_tensor_dump
+        )
         py_op_wrapper_list = []
         for node in pass_result.graph_module.graph.nodes:
             if node.op == "call_function":

backends/qualcomm/runtime/CMakeLists.txt

Lines changed: 8 additions & 0 deletions
@@ -39,3 +39,11 @@ target_sources(qnn_executorch_logging
   PRIVATE
   ${CMAKE_CURRENT_LIST_DIR}/Logging.cpp
 )
+
+# utils
+target_sources(utils
+  PUBLIC
+  ${CMAKE_CURRENT_LIST_DIR}/Utils.h
+  PRIVATE
+  ${CMAKE_CURRENT_LIST_DIR}/Utils.cpp
+)

backends/qualcomm/runtime/QnnExecuTorchBackend.cpp

Lines changed: 10 additions & 7 deletions
@@ -194,13 +194,16 @@ Error QnnExecuTorchBackend::execute(
     input_tensor_structs.push_back(input_tensors[i]->CloneTensorStruct());
   }
 
-  for (int i = input_tensors.size();
-       i < input_tensors.size() + output_tensors.size();
-       ++i) {
-    output_tensors[i - input_tensors.size()]->FillDataBuffer(
-        args[i]->toTensor().mutable_data_ptr(), false /* copy_data */);
-    output_tensor_structs.push_back(
-        output_tensors[i - input_tensors.size()]->CloneTensorStruct());
+  int output_index = input_tensors.size();
+  for (const auto& output_tensor : output_tensors) {
+    // pos=0 limits the search to the prefix
+    if (output_tensor->GetName().rfind("output_", 0) == 0) {
+      output_tensor->FillDataBuffer(
+          args[output_index]->toTensor().mutable_data_ptr(),
+          false /* copy_data */);
+      output_index++;
+    }
+    output_tensor_structs.push_back(output_tensor->CloneTensorStruct());
   }
 
   ET_CHECK_OR_RETURN_ERROR(
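
The rewritten loop depends only on the naming convention from the AOT phase: tensors whose names start with "output_" are real graph outputs and get bound to the caller's buffers in order, while the remaining intermediates keep the buffers allocated for dumping. A hypothetical host-side helper (illustrative names, not part of this commit) can reuse the same convention to sort the dumped files:

import os

def split_dumped_tensors(result_dir):
    # Partition dumped .raw files into graph outputs and intermediates,
    # using the "output_" prefix convention introduced by this commit.
    outputs, intermediates = [], []
    for name in sorted(os.listdir(result_dir)):
        (outputs if name.startswith("output_") else intermediates).append(name)
    return outputs, intermediates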

backends/qualcomm/runtime/QnnManager.cpp

Lines changed: 40 additions & 0 deletions
@@ -6,10 +6,12 @@
  * LICENSE file in the root directory of this source tree.
  */
 #include <executorch/backends/qualcomm/runtime/QnnManager.h>
+#include <executorch/backends/qualcomm/runtime/Utils.h>
 #include <executorch/backends/qualcomm/runtime/backends/QnnImplementation.h>
 
 #include <cstdlib>
 #include <cstring>
+#include <fstream>
 namespace torch {
 namespace executor {
 namespace qnn {
@@ -25,6 +27,7 @@ QnnManager::QnnManager(
     : backend_type_(options->backend_type()),
       library_path_(options->library_path()->c_str()),
       skel_library_dir_(options->skel_library_dir()->c_str()),
+      tensor_dump_output_path_(options->tensor_dump_output_path()->c_str()),
       graph_name_(options->graph_name()->c_str()),
       soc_info_(options->soc_info()),
       htp_options_(options->htp_options()),
@@ -41,6 +44,9 @@ QnnManager::QnnManager(
       "library_path: %s", options->library_path()->c_str());
   QNN_EXECUTORCH_LOG_INFO(
       "skel_library_dir: %s", options->skel_library_dir()->c_str());
+  QNN_EXECUTORCH_LOG_INFO(
+      "tensor_dump_output_path: %s",
+      options->tensor_dump_output_path()->c_str());
   QNN_EXECUTORCH_LOG_INFO(
       "log_level: %s", EnumNameQnnExecuTorchLogLevel(options->log_level()));
   QNN_EXECUTORCH_LOG_INFO(
@@ -144,6 +150,9 @@ Error QnnManager::AllocateTensor() {
   for (auto& tensor : output_tensors) {
     std::shared_ptr<TensorWrapper> tensor_wrapper = CreateTensorWrapper(tensor);
     tensor_wrapper->UpdateQnnTensorMeta(tensor);
+    if (!tensor_dump_output_path_.empty()) {
+      tensor_wrapper->AllocateDataBuffer();
+    }
     output_tensors_.emplace_back(std::move(tensor_wrapper));
   }
   return Error::Ok;
@@ -153,6 +162,11 @@ Error QnnManager::AllocateTensor(
     std::vector<std::shared_ptr<TensorWrapper>>& inputs,
     std::vector<std::shared_ptr<TensorWrapper>>& outputs) {
   input_tensors_ = std::move(inputs);
+  for (auto& output_tensor : outputs) {
+    if (!tensor_dump_output_path_.empty()) {
+      output_tensor->AllocateDataBuffer();
+    }
+  }
   output_tensors_ = std::move(outputs);
   return Error::Ok;
 }
@@ -171,6 +185,32 @@ Error QnnManager::Execute(
     return Error::Internal;
   }
 
+  if (!tensor_dump_output_path_.empty()) {
+    // TODO: Need to handle the graph which is partitioned.
+    // Maybe we could use graph name.
+    std::string dir = tensor_dump_output_path_ + "/Result/";
+    CreateDirectory(dir);
+    QNN_EXECUTORCH_LOG_INFO("Dump tensor to the path: %s", dir.c_str());
+    for (std::size_t out_idx = 0; out_idx < output_tensor_structs.size();
+         ++out_idx) {
+      const Qnn_Tensor_t& output_tensor = output_tensor_structs[out_idx];
+
+      std::string output_path =
+          dir + QNN_VER_PTR(output_tensor)->name + "_tensor.raw";
+
+      std::ofstream fout(output_path, std::ios::binary);
+      if (fout.fail()) {
+        QNN_EXECUTORCH_LOG_ERROR(
+            "Dump tensor name: %s Failed.", QNN_VER_PTR(output_tensor)->name);
+        return Error::Internal;
+      }
+
+      fout.write(
+          static_cast<const char*>(QNN_VER_PTR(output_tensor)->clientBuf.data),
+          QNN_VER_PTR(output_tensor)->clientBuf.dataSize);
+    }
+  }
+
   return Error::Ok;
 }
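
Each .raw file written above is a bare byte dump of the tensor's clientBuf; neither dtype nor shape is stored in the file. A sketch for inspecting a dump offline, where the path, dtype, and shape are placeholders the reader must supply from the model:

import numpy as np

# The file holds raw bytes only, so element type and shape must come from
# the model definition, not from the dump itself.
raw = np.fromfile(
    "/data/local/tmp/qnn_debug/Result/output_x_tensor.raw", dtype=np.uint8
)
# Example reinterpretation once the real dtype/shape are known:
# tensor = raw.view(np.float32).reshape(1, 3, 224, 224)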

backends/qualcomm/runtime/QnnManager.h

Lines changed: 5 additions & 0 deletions
@@ -42,6 +42,10 @@ class QnnManager {
 
   bool IsAvailable();
 
+  bool IsTensorDump() {
+    return !tensor_dump_output_path_.empty();
+  }
+
   bool IsOnlinePrepare();
 
   bool IsNodeSupportedByBackend(
@@ -68,6 +72,7 @@ class QnnManager {
   QnnExecuTorchBackendType backend_type_;
   std::string library_path_;
   std::string skel_library_dir_;
+  std::string tensor_dump_output_path_;
   std::string graph_name_;
   const SocInfo* soc_info_;
   const QnnExecuTorchHtpBackendOptions* htp_options_;

backends/qualcomm/runtime/Utils.cpp

Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) Qualcomm Innovation Center, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <executorch/backends/qualcomm/runtime/Logging.h>
+#include <executorch/backends/qualcomm/runtime/Utils.h>
+#include <sys/stat.h>
+namespace torch {
+namespace executor {
+namespace qnn {
+
+void CreateDirectory(const std::string& path) {
+  // Create any recursive directory
+  if (path.empty()) {
+    QNN_EXECUTORCH_LOG_ERROR("Create folder shouldn't be empty");
+    return;
+  }
+  std::size_t pos = path.find_last_of('/');
+  std::string subdir = (std::string::npos == pos) ? "" : path.substr(0, pos);
+  if (subdir.empty() || subdir == "." || subdir == "..") {
+    return;
+  }
+  CreateDirectory(subdir);
+  int mkdir_err = mkdir(subdir.c_str(), S_IRWXU | S_IRWXG | S_IRWXO);
+  if (mkdir_err != 0 && errno != EEXIST) {
+    std::string err_msg = "Failed to create " + subdir + " folder\n";
+    QNN_EXECUTORCH_LOG_ERROR(err_msg.c_str());
+  }
+}
+
+} // namespace qnn
+} // namespace executor
+} // namespace torch
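
Worth noting: CreateDirectory strips everything after the last '/' and recursively creates the remaining prefix, so the leaf directory itself is created only because QnnManager passes a path with a trailing slash (tensor_dump_output_path_ + "/Result/"). For intuition, the net effect is roughly this Python equivalent:

import os

# Rough analogue of CreateDirectory("/data/local/tmp/qnn_debug/Result/"):
# create the directory plus any missing parents, tolerating EEXIST.
os.makedirs("/data/local/tmp/qnn_debug/Result/", exist_ok=True)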

backends/qualcomm/runtime/Utils.h

Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) Qualcomm Innovation Center, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#pragma once
+
+#include <string>
+
+namespace torch {
+namespace executor {
+namespace qnn {
+// Create Directory
+void CreateDirectory(const std::string& path);
+
+} // namespace qnn
+} // namespace executor
+} // namespace torch

backends/qualcomm/serialization/qnn_compile_spec_schema.py

Lines changed: 1 addition & 0 deletions
@@ -114,3 +114,4 @@ class QnnExecuTorchOptions:
     htp_options: QnnExecuTorchHtpBackendOptions = QnnExecuTorchHtpBackendOptions()
     soc_info: SocInfo = SocInfo()
     online_prepare: bool = False
+    tensor_dump_output_path: str = ""

backends/qualcomm/serialization/schema.fbs

Lines changed: 6 additions & 0 deletions
@@ -140,6 +140,12 @@ table QnnExecuTorchOptions {
 
   /// Check if on-device graph construction. Default is false.
   online_prepare:bool;
+
+  /// Tensor dump output path. If a path is given, the delegate writes
+  /// the output of each op there at runtime.
+  /// Setting this option is not recommended in any case; it exists
+  /// only for debugging accuracy issues.
+  tensor_dump_output_path:string;
 }
 
 root_type QnnExecuTorchOptions;
