Provide kernels with true reference implementations for quantized ops #4108

Closed
wants to merge 1 commit
4 changes: 2 additions & 2 deletions backends/cadence/CMakeLists.txt
@@ -25,5 +25,5 @@ include(${EXECUTORCH_ROOT}/build/Utils.cmake)
set(_common_include_directories ${EXECUTORCH_ROOT}/..)


-add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/hifi/operators)
-add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/hifi/kernels)
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/reference/operators)
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/reference/kernels)
12 changes: 6 additions & 6 deletions backends/cadence/aot/functions.yaml
@@ -107,30 +107,30 @@
  variants: function
  kernels:
    - arg_meta: null
-     kernel_name: impl::HiFi::quantize_per_tensor_out
+     kernel_name: impl::reference::quantize_per_tensor_out

- func: cadence::dequantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)
  variants: function
  kernels:
    - arg_meta: null
-     kernel_name: impl::HiFi::dequantize_per_tensor_out
+     kernel_name: impl::reference::dequantize_per_tensor_out

- func: cadence::quantized_conv.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)
  kernels:
    - arg_meta: null
-     kernel_name: impl::HiFi::quantized_conv_out
+     kernel_name: impl::reference::quantized_conv_out

- func: cadence::quantized_layer_norm.out(Tensor input, Tensor in_scale, Tensor in_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor(a!)
  kernels:
    - arg_meta: null
-     kernel_name: impl::HiFi::quantized_layer_norm_out
+     kernel_name: impl::reference::quantized_layer_norm_out

- func: cadence::quantized_linear.out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)
  kernels:
    - arg_meta: null
-     kernel_name: impl::HiFi::quantized_linear_out
+     kernel_name: impl::reference::quantized_linear_out

- func: cadence::quantized_relu.out(Tensor X, Tensor X_zero_point, *, Tensor(a!) out) -> Tensor(a!)
  kernels:
    - arg_meta: null
-     kernel_name: impl::HiFi::quantized_relu_out
+     kernel_name: impl::reference::quantized_relu_out
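Each kernel_name above must resolve to a C++ out-variant function with the matching codegen signature. As a sketch of that contract, here is what the binding for quantize_per_tensor_out is expected to look like; the signature is inferred from the dequantize_per_tensor_out implementation shown later in this diff, and the actual quantize_per_tensor.cpp is part of the PR but not visible in these hunks:

```cpp
// Sketch only: the shape of the function that functions.yaml binds
// quantize_per_tensor_out to. The body is a comment outline, not the
// PR's actual implementation.
#include <executorch/runtime/kernel/kernel_includes.h>

namespace impl {
namespace reference {
namespace native {

void quantize_per_tensor_out(
    torch::executor::RuntimeContext& context,
    const exec_aten::Tensor& input,
    double scale,
    int64_t zero_point,
    int64_t quant_min,
    int64_t quant_max,
    exec_aten::ScalarType dtype,
    exec_aten::Tensor& out) {
  // Dispatch on out.scalar_type() (Byte/Char/Int) and call
  // impl::reference::kernels::quantize<T> with inv_scale = 1 / scale,
  // since the kernel multiplies by the inverse scale (see kernels.cpp).
}

}; // namespace native
}; // namespace reference
}; // namespace impl
```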
16 changes: 16 additions & 0 deletions backends/cadence/reference/kernels/CMakeLists.txt
@@ -0,0 +1,16 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# lint_cmake: -linelength
add_library(
  cadence_kernels
  kernels.cpp
)

target_include_directories(
  cadence_kernels
  PUBLIC .
)
109 changes: 109 additions & 0 deletions backends/cadence/reference/kernels/kernels.cpp
@@ -0,0 +1,109 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include "kernels.h"

#include <algorithm>
#include <cmath> // roundf
#include <limits>

namespace impl {
namespace reference {
namespace kernels {

// Quantize an fp32 value to an int8_t/uint8_t value
template <typename T>
__attribute__((always_inline)) T
quantize(const float x, float inv_scale, int32_t zero_point) {
  constexpr float min_val = std::numeric_limits<T>::min();
  constexpr float max_val = std::numeric_limits<T>::max();
  float tmp = roundf(x * inv_scale + zero_point);
  return std::max(std::min(tmp, max_val), min_val);
}

// Quantize an fp32 array to an int8_t/uint8_t array
template <typename T>
void quantize(
T* __restrict__ y,
const float* __restrict__ x,
float inv_scale,
int32_t zero_point,
size_t size) {
for (size_t i = 0; i < size; ++i) {
y[i] = quantize<T>(x[i], inv_scale, zero_point);
}
}

// Dequantize an int8_t/uint8_t value to an fp32 value
template <typename T>
__attribute__((always_inline)) float
dequantize(const T x, float scale, int32_t zero_point) {
  return scale * (x - zero_point);
}

// Dequantize an int8_t/uint8_t/int16_t array to an fp32 array
template <typename T>
void dequantize(
    float* __restrict__ y,
    const T* __restrict__ x,
    float scale,
    int32_t zero_point,
    size_t size) {
  for (size_t i = 0; i < size; ++i) {
    y[i] = dequantize<T>(x[i], scale, zero_point);
  }
}

// explicit template instantiation

#define typed_quantize_val(dtype)                         \
  template __attribute__((always_inline)) dtype quantize( \
      const float x, float inv_scale, int32_t zero_point);
typed_quantize_val(int8_t);
typed_quantize_val(uint8_t);
typed_quantize_val(int16_t);
typed_quantize_val(int32_t);
#undef typed_quantize_val

#define typed_quantize_vec(dtype)  \
  template void quantize(          \
      dtype* __restrict__ y,       \
      const float* __restrict__ x, \
      float inv_scale,             \
      int32_t zero_point,          \
      size_t size);
typed_quantize_vec(int8_t);
typed_quantize_vec(uint8_t);
typed_quantize_vec(int16_t);
typed_quantize_vec(int32_t);
#undef typed_quantize_vec

#define typed_dequantize_val(dtype)                         \
  template __attribute__((always_inline)) float dequantize( \
      const dtype x, float scale, int32_t zero_point);
typed_dequantize_val(int8_t);
typed_dequantize_val(uint8_t);
typed_dequantize_val(int16_t);
typed_dequantize_val(int32_t);
#undef typed_dequantize_val

#define typed_dequantize_vec(dtype) \
  template void dequantize(         \
      float* __restrict__ y,        \
      const dtype* __restrict__ x,  \
      float scale,                  \
      int32_t zero_point,           \
      size_t size);
typed_dequantize_vec(int8_t);
typed_dequantize_vec(uint8_t);
typed_dequantize_vec(int16_t);
typed_dequantize_vec(int32_t);
#undef typed_dequantize_vec

}; // namespace kernels
}; // namespace reference
}; // namespace impl
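Note the asymmetry in the contract above: quantize() takes the inverse scale (q = round(x * inv_scale + zero_point), clamped to the target type's range), while dequantize() takes the scale itself (x_hat = scale * (q - zero_point)). A self-contained round trip with illustrative values, copied from the math above rather than linked against the kernel library:

```cpp
// Round-trip demo of the affine quantization implemented in kernels.cpp.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <limits>

template <typename T>
T quantize(float x, float inv_scale, int32_t zero_point) {
  constexpr float min_val = std::numeric_limits<T>::min();
  constexpr float max_val = std::numeric_limits<T>::max();
  float tmp = roundf(x * inv_scale + zero_point);
  return std::max(std::min(tmp, max_val), min_val);
}

template <typename T>
float dequantize(T q, float scale, int32_t zero_point) {
  return scale * (q - zero_point);
}

int main() {
  const float scale = 0.05f;      // quantization step size
  const int32_t zero_point = 128; // typical uint8 zero point
  const float x = 3.6f;

  uint8_t q = quantize<uint8_t>(x, 1.0f / scale, zero_point);
  float x_hat = dequantize<uint8_t>(q, scale, zero_point);
  printf("q = %u, x_hat = %.2f\n", (unsigned)q, x_hat); // q = 200, x_hat = 3.60
  return 0;
}
```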
41 changes: 41 additions & 0 deletions backends/cadence/reference/kernels/kernels.h
@@ -0,0 +1,41 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include "inttypes.h"
#include "stddef.h"

namespace impl {
namespace reference {
namespace kernels {

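// Quantize an fp32 value to an int8_t/uint8_t/int16_t/int32_t value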
template <typename T>
T quantize(const float x, float inv_scale, int32_t zero_point);

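// Dequantize an int8_t/uint8_t/int16_t/int32_t value to an fp32 value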
template <typename T>
float dequantize(const T x, float scale, int32_t zero_point);

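// Quantize an fp32 array to an int8_t/uint8_t/int16_t/int32_t array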
template <typename T>
void quantize(
    T* __restrict__ y,
    const float* __restrict__ x,
    float inv_scale,
    int32_t zero_point,
    size_t size);

// Dequantize an int8_t/uint8_t/int16_t array to an fp32 array
template <typename T>
void dequantize(
    float* __restrict__ y,
    const T* __restrict__ x,
    float scale,
    int32_t zero_point,
    size_t size);

}; // namespace kernels
}; // namespace reference
}; // namespace impl
86 changes: 86 additions & 0 deletions backends/cadence/reference/operators/CMakeLists.txt
@@ -0,0 +1,86 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

cmake_minimum_required(VERSION 3.19)

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
if(NOT CMAKE_CXX_STANDARD)
  set(CMAKE_CXX_STANDARD 17)
endif()

include(${EXECUTORCH_ROOT}/build/Utils.cmake)
include(${EXECUTORCH_ROOT}/build/Codegen.cmake)

if(NOT PYTHON_EXECUTABLE)
  resolve_python_executable()
endif()

# ATen compliant ops that are needed to run this model.
set(_aten_ops__srcs
    "${CMAKE_CURRENT_SOURCE_DIR}/op_add.cpp"
    "${CMAKE_CURRENT_SOURCE_DIR}/op_embedding.cpp"
    "${CMAKE_CURRENT_SOURCE_DIR}/op_full.cpp"
    "${CMAKE_CURRENT_SOURCE_DIR}/op_view_copy.cpp"
    "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/activation_ops_util.cpp"
    "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/copy_ops_util.cpp"
    "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/broadcast_util.cpp"
    "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/index_util.cpp"
    "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/kernel_ops_util.cpp"
    "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/matmul_ops_util.cpp"
    "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/reduce_util.cpp"
    "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/repeat_util.cpp"
    "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_bmm.cpp"
    "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_cat.cpp"
    "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp"
    "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_div.cpp"
    "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_mul.cpp"
    "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_permute_copy.cpp"
    "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_sigmoid.cpp"
    "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_slice_copy.cpp"
    "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_softmax.cpp"
    "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_split_with_sizes_copy.cpp"
    "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_sub.cpp"
    "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_to_copy.cpp"
    "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_where.cpp")
add_library(aten_ops_cadence ${_aten_ops__srcs})
target_link_libraries(aten_ops_cadence PUBLIC executorch)
target_link_libraries(aten_ops_cadence PRIVATE cadence_kernels)

# Let files say "include <executorch/path/to/header.h>".
set(_common_include_directories ${EXECUTORCH_ROOT}/..)

target_include_directories(aten_ops_cadence PUBLIC ${ROOT_DIR}/..
                           ${CMAKE_BINARY_DIR}
                           ${_common_include_directories})

# Custom ops that are needed to run the test model.
add_library(
  custom_ops "quantized_linear_out.cpp" "quantized_conv_out.cpp"
  "quantized_relu_out.cpp" "quantized_layer_norm.cpp"
  "quantize_per_tensor.cpp" "dequantize_per_tensor.cpp")
target_include_directories(custom_ops PUBLIC ${ROOT_DIR}/..
                           ${CMAKE_BINARY_DIR}
                           ${_common_include_directories})

target_link_libraries(custom_ops PUBLIC executorch)
target_link_libraries(custom_ops PRIVATE cadence_kernels)

# Generate C++ bindings to register kernels into both PyTorch (for AOT) and
# ExecuTorch (for runtime). Here we select all ops listed in functions.yaml.
gen_selected_ops(
  LIB_NAME "cadence_ops_lib" OPS_SCHEMA_YAML
  "${CMAKE_CURRENT_LIST_DIR}/../../aot/functions.yaml" "" ""
)
generate_bindings_for_kernels(
  LIB_NAME "cadence_ops_lib"
  FUNCTIONS_YAML ${CMAKE_CURRENT_SOURCE_DIR}/../../aot/functions.yaml
)
message("Generated files ${gen_command_sources}")

gen_operators_lib(
  LIB_NAME "cadence_ops_lib"
  KERNEL_LIBS custom_ops
  DEPS aten_ops_cadence)
51 changes: 51 additions & 0 deletions backends/cadence/reference/operators/dequantize_per_tensor.cpp
@@ -0,0 +1,51 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/runtime/kernel/kernel_includes.h>
#include "kernels.h"

namespace impl {
namespace reference {
namespace native {

using Tensor = exec_aten::Tensor;
using RuntimeContext = torch::executor::RuntimeContext;
using ScalarType = exec_aten::ScalarType;

void dequantize_per_tensor_out(
    RuntimeContext& context,
    const Tensor& input,
    double scale,
    int64_t zero_point,
    int64_t quant_min,
    int64_t quant_max,
    ScalarType dtype,
    Tensor& out) {
  float* out_data = out.mutable_data_ptr<float>();
  size_t numel = out.numel();

  if (input.scalar_type() == ScalarType::Byte) {
    const uint8_t* input_data = input.const_data_ptr<uint8_t>();
    impl::reference::kernels::dequantize<uint8_t>(
        out_data, input_data, scale, zero_point, numel);
  } else if (input.scalar_type() == ScalarType::Char) {
    const int8_t* input_data = input.const_data_ptr<int8_t>();
    impl::reference::kernels::dequantize<int8_t>(
        out_data, input_data, scale, zero_point, numel);
  } else if (input.scalar_type() == ScalarType::Int) {
    const int32_t* input_data = input.const_data_ptr<int32_t>();
    impl::reference::kernels::dequantize<int32_t>(
        out_data, input_data, scale, zero_point, numel);
  } else {
    ET_CHECK_MSG(
        false,
        "Unhandled input dtype %hhd",
        static_cast<int8_t>(input.scalar_type()));
  }
}

}; // namespace native
}; // namespace reference
}; // namespace impl
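The if/else chain above is the kernel's dtype dispatch: ScalarType::Byte selects uint8_t, Char selects int8_t, and Int selects int32_t. A standalone sketch of the same pattern using a hypothetical generic helper (the ScalarType enum below is a stand-in for exec_aten::ScalarType and the helper is not part of the PR):

```cpp
// Hypothetical dispatch helper illustrating the dtype branching in
// dequantize_per_tensor_out; illustrative only.
#include <cstdint>
#include <cstdio>

enum class ScalarType { Byte, Char, Int }; // stand-in for exec_aten::ScalarType

template <typename Fn>
bool dispatch_quantized_dtype(ScalarType t, Fn&& fn) {
  switch (t) {
    case ScalarType::Byte: fn(uint8_t{}); return true; // uint8 path
    case ScalarType::Char: fn(int8_t{});  return true; // int8 path
    case ScalarType::Int:  fn(int32_t{}); return true; // int32 path
  }
  return false; // unhandled dtype; the real kernel fails ET_CHECK_MSG here
}

int main() {
  const uint8_t data[3] = {128, 148, 200};
  const float scale = 0.05f;
  const int32_t zero_point = 128;
  float out[3];

  dispatch_quantized_dtype(ScalarType::Byte, [&](auto tag) {
    using T = decltype(tag); // T = uint8_t on the Byte path
    for (int i = 0; i < 3; ++i) {
      out[i] = scale * (static_cast<T>(data[i]) - zero_point);
    }
  });
  printf("%.2f %.2f %.2f\n", out[0], out[1], out[2]); // 0.00 1.00 3.60
  return 0;
}
```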
@@ -7,7 +7,6 @@
 */

#include <executorch/runtime/kernel/kernel_includes.h>
-#include "kernels.h"

namespace torch {
namespace executor {
@@ -31,7 +30,7 @@ void embedding_out(

  for (int i = 0, e = indices.numel(); i < e; i++) {
    // memcpy(dest, src, nbytes);
-   impl::HiFi::kernels::memcpy(
+   memcpy(
        out_data, w_data + nbytes_per_entry * indices_ptr[i], nbytes_per_entry);
    out_data += nbytes_per_entry;
  }
@@ -7,7 +7,6 @@
 */

#include <executorch/runtime/kernel/kernel_includes.h>
-#include "kernels.h"

namespace torch {
namespace executor {
@@ -21,8 +20,7 @@ Tensor& view_copy_out(
    const Tensor& input,
    const IntArrayRef size,
    Tensor& out) {
-  impl::HiFi::kernels::memcpy(
-      out.mutable_data_ptr(), input.const_data_ptr(), input.nbytes());
+  memcpy(out.mutable_data_ptr(), input.const_data_ptr(), input.nbytes());
  return out;
}