
Commit b94cb47

Added 16-bit version of ADD/SUB operators. Broadcasting is included.

1 parent a0c6417, commit b94cb47

File tree: 5 files changed, 147 additions and 44 deletions

  tensorflow/lite/kernels/add.cc
  tensorflow/lite/kernels/add_test.cc
  tensorflow/lite/kernels/internal/reference/add.h
  tensorflow/lite/kernels/sub.cc
  tensorflow/lite/kernels/sub_test.cc

tensorflow/lite/kernels/add.cc
Lines changed: 27 additions & 4 deletions

@@ -93,12 +93,24 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     output_size = TfLiteIntArrayCopy(input1->dims);
   }

-  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) {
+  // 8bit -> 8bit general quantized path, with general rescalings
+  // as well as, 16bit -> 16bit with general rescalings
+  bool general_16bit = input1->type == kTfLiteInt16 &&
+                       input2->type == kTfLiteInt16 &&
+                       output->type == kTfLiteInt16;
+
+  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
+      general_16bit) {
     // 8bit -> 8bit general quantized path, with general rescalings
+    // as well as, 16bit -> 16bit with general rescalings
     data->input1_offset = -input1->params.zero_point;
     data->input2_offset = -input2->params.zero_point;
     data->output_offset = output->params.zero_point;
-    data->left_shift = 20;
+
+    // The shift is set to 15 for 16-bit and 20 in case of 8-bit, accordingly.
+    // In case of 16-bit we have 65535 << 15 which is less than 1 << 31,
+    // therefore the addition will still fit in a 32 bit accumulator.
+    data->left_shift = general_16bit ? 15 : 20;
     const double twice_max_input_scale =
         2 * std::max(input1->params.scale, input2->params.scale);
     const double real_input1_multiplier =

@@ -221,7 +233,12 @@ TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
                               const TfLiteTensor* input1,
                               const TfLiteTensor* input2,
                               TfLiteTensor* output) {
-  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) {
+  bool general_16bit = input1->type == kTfLiteInt16 &&
+                       input2->type == kTfLiteInt16 &&
+                       output->type == kTfLiteInt16;
+
+  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
+      general_16bit) {
     tflite::ArithmeticParams op_params;
     op_params.left_shift = data->left_shift;
     op_params.input1_offset = data->input1_offset;

@@ -256,6 +273,12 @@ TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
           TF_LITE_ADD(optimized_integer_ops, Add, int8_t);
         }
       }
+    } else if (output->type == kTfLiteInt16) {
+      if (need_broadcast) {
+        TF_LITE_ADD(reference_ops, BroadcastAdd4DSlow, int16_t);
+      } else {
+        TF_LITE_ADD(reference_ops, Add, int16_t);
+      }
     } else {
       if (kernel_type == kReference) {
         if (need_broadcast) {

@@ -286,7 +309,7 @@ TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
     // The quantized version of Add doesn't support activations, so we
     // always use BroadcastAdd.
     if (kernel_type == kReference) {
-      TF_LITE_ADD(reference_ops, Add);
+      TF_LITE_ADD(reference_ops, AddLSTM);
     } else {
       TF_LITE_ADD(optimized_ops, Add);
     }
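Note on the shift choice above: the headroom claim in the new comment checks out, since 65535 << 15 = 2,147,450,880, which is just below 1 << 31 = 2,147,483,648. A minimal, self-contained sanity check (illustrative only, not part of the commit):

```cpp
#include <cstdint>

// Illustrative only: with left_shift = 15, the largest zero-point-adjusted
// 16-bit term (|value| <= 65535) shifted left still fits in a 32-bit
// accumulator, so the subsequent addition cannot overflow int32.
static_assert(static_cast<int64_t>(65535) * (1 << 15) <
                  (static_cast<int64_t>(1) << 31),
              "65535 << 15 must stay below 2^31");

int main() { return 0; }
```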

tensorflow/lite/kernels/add_test.cc
Lines changed: 23 additions & 8 deletions

@@ -306,15 +306,18 @@ TEST(QuantizedAddOpModel, QuantizedTestsNoActivationInt16) {
   const float kMin = -1.f;
   const float kMax = 32767.f / 32768.f;
   float kQuantizedTolerance = GetToleranceInt16(kMin, kMax);
-  std::vector<std::vector<float>> inputs1 = {
-      {0.1, 0.2, 0.3, 0.4}, {-0.8, 0.2, 0.4, 0.7}, {-0.8, 0.2, 0.7, 0.3}};
-  std::vector<std::vector<float>> inputs2 = {
-      {0.6, 0.4, 0.3, 0.1}, {0.6, 0.4, 0.5, -0.8}, {0.6, 0.4, -0.8, 0.5}};
-  std::vector<std::vector<float>> results = {
-      {0.7, 0.6, 0.6, 0.5}, {-0.2, 0.6, 0.9, -0.1}, {-0.2, 0.6, -0.1, 0.8}};
+  std::vector<std::vector<float>> inputs1 = {{0.1, 0.2, 0.3, 0.4, 0.9, 0.7},
+                                             {-0.8, 0.2, 0.4, 0.7, 0.1, 0.0},
+                                             {-0.8, 0.2, 0.7, 0.3, 0.9, 0.1}};
+  std::vector<std::vector<float>> inputs2 = {{0.6, 0.4, 0.3, 0.1, -0.1, 0.3},
+                                             {0.6, 0.4, 0.5, -0.8, 0.0, -1.0},
+                                             {0.6, 0.4, -0.8, 0.5, -0.9, 0.1}};
+  std::vector<std::vector<float>> results = {{0.7, 0.6, 0.6, 0.5, 0.8, 1.0},
+                                             {-0.2, 0.6, 0.9, -0.1, 0.1, -1.0},
+                                             {-0.2, 0.6, -0.1, 0.8, 0.0, 0.2}};
   for (size_t i = 0; i < inputs1.size(); ++i) {
-    QuantizedAddOpModel m({TensorType_INT16, {1, 2, 2, 1}, kMin, kMax},
-                          {TensorType_INT16, {1, 2, 2, 1}, kMin, kMax},
+    QuantizedAddOpModel m({TensorType_INT16, {1, 2, 3, 1}, kMin, kMax},
+                          {TensorType_INT16, {1, 2, 3, 1}, kMin, kMax},
                           {TensorType_INT16, {}, kMin, kMax},
                           ActivationFunctionType_NONE);
     m.QuantizeAndPopulate<int16_t>(m.input1(), inputs1[i]);

@@ -435,6 +438,10 @@ TEST(QuantizedAddOpModel, QuantizedWithScalarBroadcastInt8) {
   QuantizedWithScalarBroadcast<TensorType_INT8, int8_t>();
 }

+TEST(QuantizedAddOpModel, QuantizedWithScalarBroadcastInt16) {
+  QuantizedWithScalarBroadcast<TensorType_INT16, int16_t>();
+}
+
 template <enum TensorType tensor_type, typename integer_dtype>
 void QuantizedWithMixedBroadcast() {
   float kQuantizedTolerance = GetTolerance(-3.f, 3.f);

@@ -497,6 +504,10 @@ TEST(QuantizedAddOpModel, QuantizedWithMixedBroadcastInt8) {
   QuantizedWithMixedBroadcast<TensorType_INT8, int8_t>();
 }

+TEST(QuantizedAddOpModel, QuantizedWithMixedBroadcastInt16) {
+  QuantizedWithMixedBroadcast<TensorType_INT16, int16_t>();
+}
+
 template <enum TensorType tensor_type, typename integer_dtype>
 void QuantizedWithGenericBroadcast() {
   float kQuantizedTolerance = GetTolerance(-1.0, 1.0);

@@ -523,5 +534,9 @@ TEST(QuantizedAddOpModel, QuantizedWithGenericdBroadcastInt8) {
   QuantizedWithGenericBroadcast<TensorType_INT8, int8_t>();
 }

+TEST(QuantizedAddOpModel, QuantizedWithGenericdBroadcastInt16) {
+  QuantizedWithGenericBroadcast<TensorType_INT16, int16_t>();
+}
+
 }  // namespace
 }  // namespace tflite
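The int16 tests keep using kMin = -1 and kMax = 32767/32768. Assuming the usual affine mapping applied by the test helpers (scale = (max - min) / (qmax - qmin), zero point derived from min), this range gives a scale of exactly 1/32768 and a zero point of 0, i.e. symmetric 16-bit quantization. A small sketch of that arithmetic, for illustration only:

```cpp
#include <cstdint>
#include <cstdio>

// Sketch only: derives the int16 quantization parameters implied by the
// test range [kMin, kMax] = [-1, 32767/32768] under a plain affine mapping.
int main() {
  const double kMin = -1.0;
  const double kMax = 32767.0 / 32768.0;
  const double qmin = -32768.0;
  const double qmax = 32767.0;
  const double scale = (kMax - kMin) / (qmax - qmin);            // exactly 1/32768
  const int zero_point = static_cast<int>(qmin - kMin / scale);  // 0
  std::printf("scale = %.10f, zero_point = %d\n", scale, zero_point);
  return 0;
}
```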

tensorflow/lite/kernels/internal/reference/add.h
Lines changed: 39 additions & 15 deletions

@@ -51,13 +51,18 @@ inline void Add(const ArithmeticParams& params,

 // Element-wise add that can often be used for inner loop of broadcast add as
 // well as the non-broadcast add.
+
+// This function is used for 8-bit as well as for 16-bit, but the accumulator
+// is 32-bit for both cases. The overflow does not happen due to the
+// choice of the shift (20 or 15, accordingly - see add.cc for more comments).
+template <typename T>
 inline void AddElementwise(int size, const ArithmeticParams& params,
-                           const uint8* input1_data, const uint8* input2_data,
-                           uint8* output_data) {
-  TFLITE_DCHECK_GT(params.input1_offset, -256);
-  TFLITE_DCHECK_GT(params.input2_offset, -256);
-  TFLITE_DCHECK_LT(params.input1_offset, 256);
-  TFLITE_DCHECK_LT(params.input2_offset, 256);
+                           const T* input1_data, const T* input2_data,
+                           T* output_data) {
+  TFLITE_DCHECK_GT(params.input1_offset, -std::numeric_limits<T>::max());
+  TFLITE_DCHECK_GT(params.input2_offset, -std::numeric_limits<T>::max());
+  TFLITE_DCHECK_LT(params.input1_offset, std::numeric_limits<T>::max());
+  TFLITE_DCHECK_LT(params.input2_offset, std::numeric_limits<T>::max());

   for (int i = 0; i < size; ++i) {
     const int32 input1_val = params.input1_offset + input1_data[i];

@@ -78,7 +83,7 @@ inline void AddElementwise(int size, const ArithmeticParams& params,
     const int32 clamped_output =
         std::min(params.quantized_activation_max,
                  std::max(params.quantized_activation_min, raw_output));
-    output_data[i] = static_cast<uint8>(clamped_output);
+    output_data[i] = static_cast<T>(clamped_output);
   }
 }

@@ -138,6 +143,24 @@ inline void Add(const ArithmeticParams& params,
                 const RuntimeShape& output_shape, int16* output_data) {
   TFLITE_DCHECK_LE(params.quantized_activation_min,
                    params.quantized_activation_max);
+  const int flat_size =
+      MatchingElementsSize(input1_shape, input2_shape, output_shape);
+
+  int max_value = std::numeric_limits<int16>::max();
+
+  TFLITE_DCHECK_GT(params.input1_offset, -max_value);
+  TFLITE_DCHECK_GT(params.input2_offset, -max_value);
+  TFLITE_DCHECK_LT(params.input1_offset, max_value);
+  TFLITE_DCHECK_LT(params.input2_offset, max_value);
+  AddElementwise(flat_size, params, input1_data, input2_data, output_data);
+}
+
+inline void AddLSTM(const ArithmeticParams& params,
+                    const RuntimeShape& input1_shape, const int16* input1_data,
+                    const RuntimeShape& input2_shape, const int16* input2_data,
+                    const RuntimeShape& output_shape, int16* output_data) {
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);

   const int input1_shift = params.input1_shift;
   const int flat_size =

@@ -257,13 +280,14 @@ inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
   }
 }

-inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
-                               const RuntimeShape& input1_shape,
-                               const uint8* input1_data,
-                               const RuntimeShape& input2_shape,
-                               const uint8* input2_data,
-                               const RuntimeShape& output_shape,
-                               uint8* output_data) {
+// This function is used for 8-bit as well as for 16-bit, but the accumulator
+// is 32-bit for both cases. The overflow does not happen due to the
+// choice of the shift (20 or 15, accordingly - see add.cc for more comments).
+template <typename T>
+inline void BroadcastAdd4DSlow(
+    const ArithmeticParams& params, const RuntimeShape& input1_shape,
+    const T* input1_data, const RuntimeShape& input2_shape,
+    const T* input2_data, const RuntimeShape& output_shape, T* output_data) {
   NdArrayDesc<4> desc1;
   NdArrayDesc<4> desc2;
   NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,

@@ -313,7 +337,7 @@ inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
               std::min(params.quantized_activation_max,
                        std::max(params.quantized_activation_min, raw_output));
           output_data[Offset(extended_output_shape, b, y, x, c)] =
-              static_cast<uint8>(clamped_output);
+              static_cast<T>(clamped_output);
         }
       }
     }
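For readers following the newly templated AddElementwise: the fixed-point code above approximates the real-valued computation "dequantize both inputs, add, requantize, clamp". Below is a float-reference sketch of that semantics; AddElementwiseReference, its parameter names, and the sample data are illustrative assumptions, not part of the kernel.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Float-reference sketch (not the TFLite kernel) of what the templated
// AddElementwise computes in fixed point: dequantize, add, requantize, clamp.
// Works for uint8_t and int16_t alike, mirroring the new template parameter T.
template <typename T>
void AddElementwiseReference(int size, float s1, int32_t z1, const T* in1,
                             float s2, int32_t z2, const T* in2, float s_out,
                             int32_t z_out, int32_t act_min, int32_t act_max,
                             T* out) {
  for (int i = 0; i < size; ++i) {
    const float real_sum = s1 * (in1[i] - z1) + s2 * (in2[i] - z2);
    const int32_t raw =
        z_out + static_cast<int32_t>(std::round(real_sum / s_out));
    out[i] = static_cast<T>(std::min(act_max, std::max(act_min, raw)));
  }
}

int main() {
  // int16 example with scale 1/32768 and zero point 0 on all tensors,
  // matching the symmetric range used by the int16 tests.
  const int16_t a[4] = {3277, 6554, 9830, 13107};   // ~0.1, 0.2, 0.3, 0.4
  const int16_t b[4] = {19661, 13107, 9830, 3277};  // ~0.6, 0.4, 0.3, 0.1
  int16_t out[4];
  const float s = 1.f / 32768;
  AddElementwiseReference<int16_t>(4, s, 0, a, s, 0, b, s, 0, -32768, 32767,
                                   out);
  for (int i = 0; i < 4; ++i) std::printf("%d ", out[i]);  // ~0.7 0.6 0.6 0.5
  std::printf("\n");
  return 0;
}
```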

tensorflow/lite/kernels/sub.cc
Lines changed: 46 additions & 17 deletions

@@ -72,13 +72,14 @@ void Free(TfLiteContext* context, void* buffer) {
   delete reinterpret_cast<OpData*>(buffer);
 }

-TfLiteStatus Prepare8BitSubOp(TfLiteContext* context,
-                              const TfLiteTensor* input_1,
-                              const TfLiteTensor* input_2, TfLiteTensor* output,
-                              TfLiteSubParams* params, OpData* op_params,
-                              int op_sign) {
-  TF_LITE_ENSURE(context,
-                 output->type == kTfLiteUInt8 || output->type == kTfLiteInt8);
+TfLiteStatus PrepareGeneralSubOp(TfLiteContext* context,
+                                 const TfLiteTensor* input_1,
+                                 const TfLiteTensor* input_2,
+                                 TfLiteTensor* output, TfLiteSubParams* params,
+                                 OpData* op_params, int op_sign) {
+  TF_LITE_ENSURE(context, output->type == kTfLiteUInt8 ||
+                              output->type == kTfLiteInt8 ||
+                              output->type == kTfLiteInt16);
   const auto& input1_quantization_params = input_1->params;
   const auto& input2_quantization_params = input_2->params;
   const auto& output_quantization_params = output->params;

@@ -87,6 +88,9 @@ TfLiteStatus PrepareGeneralSubOp(TfLiteContext* context,
   if (output->type == kTfLiteUInt8) {
     integer_type_min = std::numeric_limits<uint8_t>::min();
     integer_type_max = std::numeric_limits<uint8_t>::max();
+  } else if (output->type == kTfLiteInt16) {
+    integer_type_min = std::numeric_limits<int16_t>::min();
+    integer_type_max = std::numeric_limits<int16_t>::max();
   } else {
     // output->type == kTfLiteInt8
     integer_type_min = std::numeric_limits<int8_t>::min();

@@ -109,7 +113,11 @@ TfLiteStatus PrepareGeneralSubOp(TfLiteContext* context,
   op_params->input1_offset = -input1_quantization_params.zero_point;
   op_params->input2_offset = -input2_quantization_params.zero_point;
   op_params->output_offset = output_quantization_params.zero_point;
-  op_params->left_shift = 20;
+
+  // The shift is set to 15 in case of 16-bit and 20 in case of 8-bit,
+  // accordingly. In case of 16-bit we have 65535 << 15 which is less than 1 <<
+  // 31, therefore the addition will still fit in a 32 bit accumulator.
+  op_params->left_shift = output->type == kTfLiteInt16 ? 15 : 20;
   const double twice_max_input_scale =
       2 * std::max(input1_quantization_params.scale,
                    input2_quantization_params.scale);

@@ -135,13 +143,14 @@ TfLiteStatus PrepareGeneralSubOp(TfLiteContext* context,
   TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized(
       context, params->activation, output, &op_params->output_activation_min,
       &op_params->output_activation_max));
+
   return kTfLiteOk;
 }

-TfLiteStatus PrepareInt16SubOp(TfLiteContext* context,
-                               const TfLiteTensor* input1,
-                               const TfLiteTensor* input2, TfLiteTensor* output,
-                               TfLiteSubParams* params, OpData* data) {
+TfLiteStatus PrepareLSTMSubOp(TfLiteContext* context,
+                              const TfLiteTensor* input1,
+                              const TfLiteTensor* input2, TfLiteTensor* output,
+                              TfLiteSubParams* params, OpData* data) {
   // 16bit -> 16bit special quantized path, supporting only a rather
   // narrow case of quantization parameters: zero_points must all be 0
   // ("symmetric quantization") and scales must be power-of-two (which

@@ -208,12 +217,21 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     output_size = TfLiteIntArrayCopy(input1->dims);
   }

-  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) {
-    TF_LITE_ENSURE_OK(context, Prepare8BitSubOp(context, input1, input2, output,
-                                                params, data, -1));
+  // 8bit -> 8bit general quantized path, with general rescalings
+  // as well as, 16bit -> 16bit with general rescalings
+
+  bool general_16bit = output->type == kTfLiteInt16 &&
+                       input1->type == kTfLiteInt16 &&
+                       input2->type == kTfLiteInt16;
+
+  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
+      general_16bit) {
+    TF_LITE_ENSURE_OK(context, PrepareGeneralSubOp(context, input1, input2,
+                                                   output, params, data, -1));
   } else if (output->type == kTfLiteInt16) {
-    TF_LITE_ENSURE_OK(context, PrepareInt16SubOp(context, input1, input2,
-                                                 output, params, data));
+    // LSTM-special case with scale parameter of POT
+    TF_LITE_ENSURE_OK(context, PrepareLSTMSubOp(context, input1, input2, output,
+                                                params, data));
   }

   return context->ResizeTensor(context, output, output_size);

@@ -288,6 +306,11 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
   const bool need_broadcast = optimized_ops::ProcessBroadcastShapes(
       GetTensorShape(input1), GetTensorShape(input2), &op_params);

+  // 16bit -> 16bit with general rescaling
+  bool general_16bit = output->type == kTfLiteInt16 &&
+                       input1->type == kTfLiteInt16 &&
+                       input2->type == kTfLiteInt16;
+
 #define TF_LITE_SUB(type, opname, data_type)                             \
   type::opname(op_params, GetTensorShape(input1),                        \
                GetTensorData<data_type>(input1), GetTensorShape(input2), \

@@ -301,6 +324,12 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
     } else {
       TF_LITE_SUB(reference_integer_ops, Add, int8_t);
     }
+  } else if (general_16bit) {
+    if (need_broadcast) {
+      TF_LITE_SUB(reference_ops, BroadcastAdd4DSlow, int16_t);
+    } else {
+      TF_LITE_SUB(reference_ops, Add, int16_t);
+    }
   } else if (output->type == kTfLiteUInt8) {
     if (kernel_type == kReference) {
       if (need_broadcast) {
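Context on the multiplier setup in PrepareGeneralSubOp: both inputs are rescaled relative to twice the larger input scale (the twice_max_input_scale visible above), which bounds each per-input multiplier by 1/2 and keeps the shifted sum or difference inside the int32 headroom discussed earlier. The sketch below spells out that arithmetic; the exact output-multiplier expression and the example scales are assumptions for illustration, not copied from the diff.

```cpp
#include <algorithm>
#include <cstdio>

// Illustrative sketch of the rescaling parameters for the general quantized
// sub path. The formulas follow the common TFLite pattern around
// twice_max_input_scale; treat them as an approximation of the real setup.
int main() {
  const double input1_scale = 1.0 / 32768.0;  // example int16 scale
  const double input2_scale = 1.0 / 16384.0;  // example int16 scale
  const double output_scale = 1.0 / 32768.0;  // example int16 scale
  const int left_shift = 15;                  // 15 for int16, 20 for 8-bit

  const double twice_max_input_scale =
      2 * std::max(input1_scale, input2_scale);
  const double real_input1_multiplier = input1_scale / twice_max_input_scale;
  const double real_input2_multiplier = input2_scale / twice_max_input_scale;
  const double real_output_multiplier =
      twice_max_input_scale / ((1 << left_shift) * output_scale);

  // All three stay below 1, so they can be encoded as fixed-point
  // multipliers with a right shift.
  std::printf("m1=%f m2=%f m_out=%f\n", real_input1_multiplier,
              real_input2_multiplier, real_output_multiplier);
  return 0;
}
```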

tensorflow/lite/kernels/sub_test.cc
Lines changed: 12 additions & 0 deletions

@@ -226,6 +226,10 @@ TEST(QuantizedSubOpModel, QuantizedTestsNoActivationInt8) {
   QuantizedTestsNoActivation<TensorType_INT8, int8_t>();
 }

+TEST(QuantizedSubOpModel, QuantizedTestsNoActivationInt16Generic) {
+  QuantizedTestsNoActivation<TensorType_INT16, int16_t>();
+}
+
 template <TensorType tensor_type, typename integer_dtype>
 void QuantizedTestsActivationRELU_N1_TO_1() {
   float kQuantizedTolerance = GetTolerance(-1.0, 1.0);

@@ -287,6 +291,10 @@ TEST(QuantizedSubOpModel, QuantizedVariousInputShapesInt8) {
   QuantizedVariousInputShapes<TensorType_INT8, int8_t>();
 }

+TEST(QuantizedSubOpModel, QuantizedVariousInputShapesInt16) {
+  QuantizedVariousInputShapes<TensorType_INT16, int16_t>();
+}
+
 template <TensorType tensor_type, typename integer_dtype>
 void QuantizedWithBroadcast() {
   float kQuantizedTolerance = GetTolerance(-3.0, 3.0);

@@ -315,6 +323,10 @@ TEST(QuantizedSubOpModel, QuantizedWithBroadcastInt8) {
   QuantizedWithBroadcast<TensorType_INT8, int8_t>();
 }

+TEST(QuantizedSubOpModel, QuantizedWithBroadcastInt16) {
+  QuantizedWithBroadcast<TensorType_INT16, int16_t>();
+}
+
 TEST(QuantizedSubOpModel, QuantizedTestsNoActivationInt16) {
   const float kMin = -1.f;
   const float kMax =
