tensorflow
diff --git a/src/tensorflow/lite/c/common.h b/src/tensorflow/lite/c/common.h
Lines changed: 0 additions & 6 deletions
diff --git a/src/tensorflow/lite/kernels/internal/common.h b/src/tensorflow/lite/kernels/internal/common.h
Lines changed: 7 additions & 6 deletions
diff --git a/src/tensorflow/lite/kernels/internal/reference/integer_ops/add.h b/src/tensorflow/lite/kernels/internal/reference/integer_ops/add.h
Lines changed: 12 additions & 11 deletions
diff --git a/src/tensorflow/lite/kernels/internal/reference/integer_ops/mean.h b/src/tensorflow/lite/kernels/internal/reference/integer_ops/mean.h
Lines changed: 2 additions & 63 deletions
diff --git a/src/tensorflow/lite/kernels/internal/reference/reduce.h b/src/tensorflow/lite/kernels/internal/reference/reduce.h
Lines changed: 42 additions & 93 deletions
diff --git a/src/tensorflow/lite/micro/kernels/cmsis_nn/conv.cpp b/src/tensorflow/lite/micro/kernels/cmsis_nn/conv.cpp
Lines changed: 1 addition & 2 deletions
diff --git a/src/tensorflow/lite/micro/kernels/cmsis_nn/svdf.cpp b/src/tensorflow/lite/micro/kernels/cmsis_nn/svdf.cpp
Lines changed: 1 addition & 2 deletions
@@ -38,10 +38,4 @@ limitations under the License.
 
 #include "tensorflow/lite/core/c/common.h"
 
-// TfLiteOpaqueDelegate: allows delegation of nodes to alternative backends.
-// TfLiteOpaqueDelegate is an abstract type that is intended to have the same
-// role as TfLiteDelegate, but without necessarily exposing the implementation
-// details of how delegates are implemented.
-typedef TfLiteDelegate TfLiteOpaqueDelegate;
-
 #endif  // TENSORFLOW_LITE_C_COMMON_H_
@@ -328,14 +328,16 @@ template <typename T>
 int CountLeadingZeros(T integer_input) {
   static_assert(std::is_unsigned<T>::value,
                 "Only unsigned integer types handled.");
-#if defined(__GNUC__)
-  return integer_input ? __builtin_clz(integer_input)
-                       : std::numeric_limits<T>::digits;
-#else
   if (integer_input == 0) {
     return std::numeric_limits<T>::digits;
   }
-
+#if defined(__GNUC__)
+  if (std::is_same<T, uint32_t>::value) {
+    return __builtin_clz(integer_input);
+  } else if (std::is_same<T, uint64_t>::value) {
+    return __builtin_clzll(integer_input);
+  }
+#endif
   const T one_in_leading_positive = static_cast<T>(1)
                                     << (std::numeric_limits<T>::digits - 1);
   int leading_zeros = 0;
@@ -344,7 +346,6 @@ int CountLeadingZeros(T integer_input) {
     ++leading_zeros;
   }
   return leading_zeros;
-#endif
 }
 
 template <typename T>
 
@@ -35,24 +35,25 @@ inline void CheckArithmeticParams(const ArithmeticParams& params) {
   TFLITE_DCHECK_LE(-params.input2_offset, std::numeric_limits<int8_t>::max());
 }
 
-inline void ElementWise(
-    int size, const ArithmeticParams& params, const int8_t* input1_data,
-    const int8_t* input2_data, int8_t* output_data,
-    void (*check_arithmetic_params)(const ArithmeticParams&),
-    int8_t (*binary_func)(int8_t, int8_t, const ArithmeticParams&)) {
+// TODO(b/270589088): move to a more appropriate file (b/270589088#comment2)
+template <typename T>
+void ElementWise(int size, const ArithmeticParams& params, const T* input1_data,
+                 const T* input2_data, T* output_data,
+                 void (*check_arithmetic_params)(const ArithmeticParams&),
+                 T (*binary_func)(T, T, const ArithmeticParams&)) {
   CheckArithmeticParams(params);
   for (int i = 0; i < size; ++i) {
     output_data[i] = binary_func(input1_data[i], input2_data[i], params);
   }
 }
-
-inline void BroadcastBinaryFunction4DSlow(
+// TODO(b/270589088): move to a more appropriate file. (b/270589088#comment2)
+template <typename T>
+void BroadcastBinaryFunction4DSlow(
     const ArithmeticParams& params, const RuntimeShape& input1_shape,
-    const int8_t* input1_data, const RuntimeShape& input2_shape,
-    const int8_t* input2_data, const RuntimeShape& output_shape,
-    int8_t* output_data,
+    const T* input1_data, const RuntimeShape& input2_shape,
+    const T* input2_data, const RuntimeShape& output_shape, T* output_data,
     void (*check_arithmetic_params)(const ArithmeticParams&),
-    int8_t (*binary_func)(int8_t, int8_t, const ArithmeticParams&)) {
+    T (*binary_func)(T, T, const ArithmeticParams&)) {
   NdArrayDesc<4> desc1;
   NdArrayDesc<4> desc2;
   NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
 
@@ -1,10 +1,10 @@
-/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 
-    http://www.apache.org/licenses/LICENSE-2.0
+http://www.apache.org/licenses/LICENSE-2.0
 
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
@@ -15,65 +15,4 @@ limitations under the License.
 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MEAN_H_
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MEAN_H_
 
-#include <algorithm>
-
-#include "tensorflow/lite/kernels/internal/common.h"
-
-namespace tflite {
-namespace reference_integer_ops {
-
-template <typename integer_type>
-inline void Mean(const tflite::MeanParams& op_params, int32_t multiplier,
-                 int32_t shift, const RuntimeShape& unextended_input_shape,
-                 const integer_type* input_data, int32_t input_zero_point,
-                 const RuntimeShape& unextended_output_shape,
-                 integer_type* output_data, int32_t output_zero_point) {
-  // Current implementation only supports dimension equals 4 and simultaneous
-  // reduction over width and height.
-  TFLITE_CHECK_EQ(unextended_input_shape.DimensionsCount(), 4);
-  TFLITE_CHECK_LE(unextended_output_shape.DimensionsCount(), 4);
-  const RuntimeShape input_shape =
-      RuntimeShape::ExtendedShape(4, unextended_input_shape);
-  const RuntimeShape output_shape =
-      RuntimeShape::ExtendedShape(4, unextended_output_shape);
-  const int output_batch = output_shape.Dims(0);
-  const int output_height = output_shape.Dims(1);
-  const int output_width = output_shape.Dims(2);
-  const int output_depth = output_shape.Dims(3);
-  const int input_height = input_shape.Dims(1);
-  const int input_width = input_shape.Dims(2);
-  const int num_elements_in_axis = input_width * input_height;
-
-  TFLITE_CHECK_EQ(op_params.axis_count, 2);
-  TFLITE_CHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
-               (op_params.axis[0] == 2 && op_params.axis[1] == 1));
-  TFLITE_CHECK_EQ(output_height, 1);
-  TFLITE_CHECK_EQ(output_width, 1);
-
-  static constexpr int32_t kMinInt = std::numeric_limits<integer_type>::min();
-  static constexpr int32_t kMaxInt = std::numeric_limits<integer_type>::max();
-
-  for (int out_b = 0; out_b < output_batch; ++out_b) {
-    for (int out_d = 0; out_d < output_depth; ++out_d) {
-      int32_t acc = 0;
-      for (int in_h = 0; in_h < input_height; ++in_h) {
-        for (int in_w = 0; in_w < input_width; ++in_w) {
-          acc += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)] -
-                 input_zero_point;
-        }
-      }
-      acc = MultiplyByQuantizedMultiplier(acc, multiplier, shift);
-      acc = acc > 0 ? (acc + num_elements_in_axis / 2) / num_elements_in_axis
-                    : (acc - num_elements_in_axis / 2) / num_elements_in_axis;
-      acc += output_zero_point;
-      acc = std::min(std::max(acc, kMinInt), kMaxInt);
-      output_data[Offset(output_shape, out_b, 0, 0, out_d)] =
-          static_cast<integer_type>(acc);
-    }
-  }
-}
-
-}  // namespace reference_integer_ops
-}  // namespace tflite
-
 #endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MEAN_H_
@@ -268,11 +268,11 @@ inline bool Mean(const T* input_data, const int* input_dims,
   return true;
 }
 
-template <typename T>
 inline void Mean(const tflite::MeanParams& op_params,
                  const RuntimeShape& unextended_input_shape,
-                 const T* input_data,
-                 const RuntimeShape& unextended_output_shape, T* output_data) {
+                 const float* input_data,
+                 const RuntimeShape& unextended_output_shape,
+                 float* output_data) {
   ruy::profiler::ScopeLabel label("Mean4D");
 
   // Current implementation only supports dimension equals 4 and simultaneous
@@ -312,78 +312,21 @@ inline void Mean(const tflite::MeanParams& op_params,
   }
 }
 
-inline void Mean(const tflite::MeanParams& op_params,
-                 const RuntimeShape& unextended_input_shape,
-                 const uint8_t* input_data, int32_t input_zero_point,
-                 float input_scale, const RuntimeShape& unextended_output_shape,
-                 uint8_t* output_data, int32_t output_zero_point,
-                 float output_scale) {
-  ruy::profiler::ScopeLabel label("Mean4D/Uint8");
-
-  // Current implementation only supports dimension equals 4 and simultaneous
-  // reduction over width and height.
-  TFLITE_CHECK_EQ(unextended_input_shape.DimensionsCount(), 4);
-  TFLITE_CHECK_LE(unextended_output_shape.DimensionsCount(), 4);
-  const RuntimeShape input_shape =
-      RuntimeShape::ExtendedShape(4, unextended_input_shape);
-  const RuntimeShape output_shape =
-      RuntimeShape::ExtendedShape(4, unextended_output_shape);
-  const int output_batch = output_shape.Dims(0);
-  const int output_height = output_shape.Dims(1);
-  const int output_width = output_shape.Dims(2);
-  const int output_depth = output_shape.Dims(3);
-  const int input_height = input_shape.Dims(1);
-  const int input_width = input_shape.Dims(2);
-  const float num_elements_in_axis = input_width * input_height;
-
-  TFLITE_CHECK_EQ(op_params.axis_count, 2);
-  TFLITE_CHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
-               (op_params.axis[0] == 2 && op_params.axis[1] == 1));
-  TFLITE_CHECK_EQ(output_height, 1);
-  TFLITE_CHECK_EQ(output_width, 1);
-
-  constexpr int32_t kMinValue = std::numeric_limits<uint8_t>::min();
-  constexpr int32_t kMaxValue = std::numeric_limits<uint8_t>::max();
-
-  float temp = input_zero_point * input_scale / output_scale;
-  temp = temp > 0 ? temp + 0.5f : temp - 0.5f;
-  int32_t bias = output_zero_point - static_cast<int32_t>(temp);
-  double real_scale =
-      static_cast<double>(input_scale / (num_elements_in_axis * output_scale));
-
-  int32_t multiplier;
-  int shift;
-  QuantizeMultiplier(real_scale, &multiplier, &shift);
-  for (int out_b = 0; out_b < output_batch; ++out_b) {
-    for (int out_d = 0; out_d < output_depth; ++out_d) {
-      int32_t acc = 0;
-      for (int in_h = 0; in_h < input_height; ++in_h) {
-        for (int in_w = 0; in_w < input_width; ++in_w) {
-          acc += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)];
-        }
-      }
-      acc = MultiplyByQuantizedMultiplier(acc, multiplier, shift);
-      acc += bias;
-      acc = std::min(std::max(acc, kMinValue), kMaxValue);
-      output_data[Offset(output_shape, out_b, 0, 0, out_d)] =
-          static_cast<uint8_t>(acc);
-    }
-  }
-}
-
 // Computes the mean of elements across dimensions given in axis.
 // It does so in two stages, first calculates the sum of elements along the axis
 // then divides it by the number of element in axis for quantized values.
 template <typename T, typename U>
 inline bool QuantizedMeanOrSum(const T* input_data, int32_t input_zero_point,
-                               float input_scale, const int* input_dims,
-                               const int input_num_dims, T* output_data,
-                               int32_t output_zero_point, float output_scale,
+                               const int* input_dims, const int input_num_dims,
+                               T* output_data, int32_t output_multiplier,
+                               int output_shift, int32_t output_zero_point,
                                const int* output_dims,
                                const int output_num_dims, const int* axis,
                                const int num_axis_dimensions, bool keep_dims,
                                int* temp_index, int* resolved_axis, U* temp_sum,
                                bool compute_sum) {
+  const int32_t kMinValue = std::numeric_limits<T>::min();
+  const int32_t kMaxValue = std::numeric_limits<T>::max();
   const bool uint8_case = std::is_same<T, uint8_t>::value;
   const bool int16_case = std::is_same<T, int16_t>::value;
   if (uint8_case) {
@@ -430,40 +373,46 @@ inline bool QuantizedMeanOrSum(const T* input_data, int32_t input_zero_point,
   }
 
   // Calculate mean by dividing output_data by num of aggregated element.
-  size_t num_elements_in_axis = 1;
+  int64_t num_elements_in_axis = 1;
   for (int idx = 0; idx < num_resolved_axis; ++idx) {
     size_t current = static_cast<size_t>(input_dims[resolved_axis[idx]]);
     // Overflow prevention.
-    if (current > (std::numeric_limits<size_t>::max() / num_elements_in_axis)) {
+    if (current > static_cast<size_t>(std::numeric_limits<int64_t>::max() /
+                                      num_elements_in_axis)) {
       return false;
     }
     num_elements_in_axis *= current;
   }
 
-  if (num_elements_in_axis > 0) {
-    const float scale = input_scale / output_scale;
-    if (compute_sum) {
-      // TODO(b/116341117): Eliminate float and do this completely in 8bit.
-      const float bias = -input_zero_point * scale * num_elements_in_axis;
-      for (size_t idx = 0; idx < num_outputs; ++idx) {
-        const U value =
-            static_cast<U>(TfLiteRound(temp_sum[idx] * scale + bias)) +
-            output_zero_point;
-        output_data[idx] = static_cast<T>(value);
-      }
-    } else {
-      const float bias = -input_zero_point * scale;
-      for (size_t idx = 0; idx < num_outputs; ++idx) {
-        float float_mean = static_cast<float>(temp_sum[idx]) /
-                           static_cast<float>(num_elements_in_axis);
-        float result = TfLiteMin(
-            TfLiteRound(float_mean * scale + bias) + output_zero_point,
-            static_cast<float>(std::numeric_limits<T>::max()));
-        result = TfLiteMax(result,
-                           static_cast<float>(std::numeric_limits<T>::min()));
-        output_data[idx] = static_cast<T>(result);
-      }
-    }
+  if (num_elements_in_axis == 0) {
+    return true;
+  }
+
+  // Readapt output rescaling when calculating the mean to integrate a
+  // 1/num_elements_in_axis multiplier.
+  if (!compute_sum) {
+    TFLITE_DCHECK_GE(num_elements_in_axis, 0);
+    int shift =
+        63 - CountLeadingZeros(static_cast<uint64_t>(num_elements_in_axis));
+    // To avoid any overflow risk 'shift' should be <= 32 and to satisfy
+    // 'MultiplyByQuantizedMultiplier' pre-conditions 'output_shift - shift'
+    // should be >= -31. Clamp the value at the price of some precision loss.
+    shift = std::min(shift, 32);
+    shift = std::min(shift, 31 + output_shift);
+    output_multiplier = static_cast<int32_t>(
+        (static_cast<int64_t>(output_multiplier) << shift) /
+        num_elements_in_axis);
+    output_shift = output_shift - shift;
+  }
+
+  for (size_t idx = 0; idx < num_outputs; ++idx) {
+    const U shifted_sum =
+        static_cast<U>(temp_sum[idx] - input_zero_point * num_elements_in_axis);
+    int32_t output = MultiplyByQuantizedMultiplier(
+                         shifted_sum, output_multiplier, output_shift) +
+                     output_zero_point;
+    output = std::min(std::max(output, kMinValue), kMaxValue);
+    output_data[idx] = static_cast<T>(output);
   }
   return true;
 }
@@ -478,8 +427,8 @@ inline bool QuantizedMeanOrSumExtraArgs(
     bool keep_dims, int* temp_index, int* resolved_axis, U* temp_sum,
     bool compute_sum) {
   return QuantizedMeanOrSum<T, U>(
-      input_data, input_zero_point, input_scale, input_dims, input_num_dims,
-      output_data, output_zero_point, output_scale, output_dims,
+      input_data, input_zero_point, input_dims, input_num_dims, output_data,
+      output_multiplier, output_shift, output_zero_point, output_dims,
       output_num_dims, axis, num_axis_dimensions, keep_dims, temp_index,
       resolved_axis, temp_sum, compute_sum);
 }
 
@@ -1,4 +1,4 @@
-/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -15,7 +15,6 @@ limitations under the License.
 
 #include "tensorflow/lite/micro/kernels/conv.h"
 
-#include "third_party/cmsis_nn/Include/arm_nn_types.h"
 #include "third_party/cmsis_nn/Include/arm_nnfunctions.h"
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/common.h"
 
@@ -1,4 +1,4 @@
-/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -15,7 +15,6 @@ limitations under the License.
 
 #include "tensorflow/lite/micro/kernels/svdf.h"
 
-#include "third_party/cmsis_nn/Include/arm_nn_types.h"
 #include "third_party/cmsis_nn/Include/arm_nnfunctions.h"
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/common.h"