2 * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
3 * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
19 #include "kernels/Utils.h"
21 #include "PALConv2d.h"
23 namespace luci_interpreter
29 int32_t compute_padding_h(const circle::Tensor *input, const circle::Tensor *filter,
30 const circle::Conv2DOptions *options)
32 const int32_t input_height = Tensor::dim(input, 1);
33 const int32_t filter_height = Tensor::dim(filter, 1);
34 const int32_t output_height =
35 kernels::computeOutputSize(luci_padding(options->padding()), input_height, filter_height,
36 options->stride_h(), options->dilation_h_factor());
38 const auto padding_height = kernels::computePadding(
39 options->stride_h(), options->dilation_h_factor(), input_height, filter_height, output_height);
40 return padding_height;
43 int32_t compute_padding_w(const circle::Tensor *input, const circle::Tensor *filter,
44 const circle::Conv2DOptions *options)
46 const int32_t input_width = Tensor::dim(input, 2);
47 const int32_t filter_width = Tensor::dim(filter, 2);
48 const int32_t output_width =
49 kernels::computeOutputSize(luci_padding(options->padding()), input_width, filter_width,
50 options->stride_w(), options->dilation_w_factor());
52 const auto padding_width = kernels::computePadding(
53 options->stride_w(), options->dilation_w_factor(), input_width, filter_width, output_width);
60 void evalFloat(const circle::Tensor *input, const circle::Tensor *filter,
61 const circle::Tensor *bias, const circle::Tensor *output,
62 const circle::Conv2DOptions *options, BaseRuntimeGraph *runtime_graph)
64 float activation_min{};
65 float activation_max{};
66 kernels::calculateActivationRange(luci_actfunc(options->fused_activation_function()),
67 &activation_min, &activation_max);
69 tflite::ConvParams params{};
70 params.padding_values.height = compute_padding_h(input, filter, options);
71 params.padding_values.width = compute_padding_w(input, filter, options);
72 params.stride_height = options->stride_h();
73 params.stride_width = options->stride_w();
74 params.dilation_height_factor = options->dilation_h_factor();
75 params.dilation_width_factor = options->dilation_w_factor();
76 params.float_activation_min = activation_min;
77 params.float_activation_max = activation_max;
79 auto *input_data = runtime_graph->getDataByTensor(input);
80 auto *output_data = runtime_graph->getDataByTensor(output);
82 auto *filter_data = runtime_graph->getConstDataByTensor(filter);
83 auto *bias_data = runtime_graph->getConstDataByTensor(bias);
85 luci_interpreter_pal::Conv(
86 params, kernels::getTensorShape(input), kernels::getTensorData<float>(input_data),
87 kernels::getTensorShape(filter), kernels::getTensorData<float>(filter_data),
88 kernels::getTensorShape(bias), kernels::getTensorData<float>(bias_data),
89 kernels::getTensorShape(output), kernels::getTensorData<float>(output_data),
90 kernels::getTensorShape(nullptr), nullptr);
97 void evalQuantized(const circle::Tensor *input, const circle::Tensor *filter,
98 const circle::Tensor *bias, const circle::Tensor *output,
99 const circle::Conv2DOptions *options, BaseRuntimeGraph *runtime_graph)
101 const auto input_scale = static_cast<double>(Tensor::scale(input));
102 const auto filter_scale = static_cast<double>(Tensor::scale(filter));
103 const auto output_scale = static_cast<double>(Tensor::scale(output));
105 const double real_multiplier = input_scale * filter_scale / output_scale;
106 int32_t output_multiplier{};
108 kernels::quantizeMultiplier(real_multiplier, &output_multiplier, &output_shift);
110 int32_t activation_min{};
111 int32_t activation_max{};
112 kernels::calculateActivationRangeQuantized(luci_actfunc(options->fused_activation_function()),
113 output, &activation_min, &activation_max);
115 tflite::ConvParams params{};
116 params.padding_values.height = compute_padding_h(input, filter, options);
117 params.padding_values.width = compute_padding_w(input, filter, options);
118 params.stride_height = options->stride_h();
119 params.stride_width = options->stride_w();
120 params.dilation_height_factor = options->dilation_h_factor();
121 params.dilation_width_factor = options->dilation_w_factor();
122 // The kernel expects input and filter zero points to be negated.
123 params.input_offset = -Tensor::zero_point(input); // Note the '-'.
124 params.weights_offset = -Tensor::zero_point(filter); // Note the '-'.
125 params.output_offset = Tensor::zero_point(output);
126 params.output_multiplier = output_multiplier;
127 params.output_shift = output_shift;
128 params.quantized_activation_min = activation_min;
129 params.quantized_activation_max = activation_max;
131 auto *input_data = runtime_graph->getDataByTensor(input);
132 auto *output_data = runtime_graph->getDataByTensor(output);
134 auto *filter_data = runtime_graph->getConstDataByTensor(filter);
135 auto *bias_data = runtime_graph->getConstDataByTensor(bias);
137 luci_interpreter_pal::Conv(
138 params, kernels::getTensorShape(input), kernels::getTensorData<uint8_t>(input_data),
139 kernels::getTensorShape(filter), kernels::getTensorData<uint8_t>(filter_data),
140 kernels::getTensorShape(bias), kernels::getTensorData<int32_t>(bias_data),
141 kernels::getTensorShape(output), kernels::getTensorData<uint8_t>(output_data),
142 kernels::getTensorShape(nullptr), nullptr);
145 void evalQuantizedPerChannel(const circle::Tensor *input, const circle::Tensor *filter,
146 const circle::Tensor *bias, const circle::Tensor *output,
147 const circle::Conv2DOptions *options, BaseRuntimeGraph *runtime_graph)
149 auto *raw_input_data = runtime_graph->getDataByTensor(input);
150 auto *raw_output_data = runtime_graph->getDataByTensor(output);
152 auto *raw_filter_data = runtime_graph->getConstDataByTensor(filter);
153 auto *raw_bias_data = runtime_graph->getConstDataByTensor(bias);
155 const auto *input_data = kernels::getTensorData<uint8_t>(raw_input_data);
156 const auto *filter_data = kernels::getTensorData<uint8_t>(raw_filter_data);
157 const auto *bias_data = kernels::getTensorData<int32_t>(raw_bias_data);
158 auto *output_data = kernels::getTensorData<uint8_t>(raw_output_data);
160 const int32_t batches = Tensor::dim(input, 0);
161 const int32_t input_height = Tensor::dim(input, 1);
162 const int32_t input_width = Tensor::dim(input, 2);
163 const int32_t input_depth = Tensor::dim(input, 3);
164 const int32_t output_depth = Tensor::dim(filter, 0);
165 const int32_t filter_height = Tensor::dim(filter, 1);
166 const int32_t filter_width = Tensor::dim(filter, 2);
167 const int32_t output_height = Tensor::dim(output, 1);
168 const int32_t output_width = Tensor::dim(output, 2);
170 const int32_t stride_height = options->stride_h();
171 const int32_t stride_width = options->stride_w();
172 const int32_t dilation_height_factor = options->dilation_h_factor();
173 const int32_t dilation_width_factor = options->dilation_w_factor();
175 int32_t activation_min{};
176 int32_t activation_max{};
177 kernels::calculateActivationRangeQuantized(luci_actfunc(options->fused_activation_function()),
178 output, &activation_min, &activation_max);
180 const std::vector<double> effective_output_scale = kernels::getQuantizedConvolutionMultiplers(
181 Tensor::scale(input), Tensor::scales(filter), Tensor::scale(output));
183 const std::vector<kernels::ChannelQuantMultipliers> multipliers_raw =
184 kernels::quantizeMultipliers(effective_output_scale);
185 kernels::BroadcastableWrapper<kernels::ChannelQuantMultipliers> quant_multipliers(
188 for (int32_t batch = 0; batch < batches; ++batch)
190 for (int32_t out_y = 0; out_y < output_height; ++out_y)
192 for (int32_t out_x = 0; out_x < output_width; ++out_x)
194 for (int32_t out_c = 0; out_c < output_depth; ++out_c)
196 const int32_t in_y_origin =
197 out_y * stride_height - compute_padding_h(input, filter, options);
198 const int32_t in_x_origin =
199 out_x * stride_width - compute_padding_w(input, filter, options);
201 for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y)
203 for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x)
205 const int32_t in_y = in_y_origin + dilation_height_factor * filter_y;
206 const int32_t in_x = in_x_origin + dilation_width_factor * filter_x;
207 if ((in_y >= 0 && in_y < input_height) && (in_x >= 0 && in_x < input_width))
209 for (int32_t in_c = 0; in_c < input_depth; ++in_c)
211 const uint8_t input_val =
212 input_data[kernels::calcOffset(input, batch, in_y, in_x, in_c)];
213 const uint8_t filter_val =
214 filter_data[kernels::calcOffset(filter, out_c, filter_y, filter_x, in_c)];
215 acc += static_cast<int32_t>(input_val - Tensor::zero_point(input)) *
216 static_cast<int32_t>(filter_val - Tensor::zero_points(filter)[out_c]);
223 acc += bias_data[out_c];
226 int32_t scaled_acc = tflite::MultiplyByQuantizedMultiplier(
227 acc, quant_multipliers[out_c].multiplier, quant_multipliers[out_c].shift);
229 scaled_acc += Tensor::zero_point(output);
230 scaled_acc = std::max(scaled_acc, activation_min);
231 scaled_acc = std::min(scaled_acc, activation_max);
232 output_data[kernels::calcOffset(output, batch, out_y, out_x, out_c)] = scaled_acc;
239 void evalQuantizedS8PerChannel(const circle::Tensor *input, const circle::Tensor *filter,
240 const circle::Tensor *bias, const circle::Tensor *output,
241 const circle::Conv2DOptions *options,
242 BaseRuntimeGraph *runtime_graph)
244 int32_t activation_min{};
245 int32_t activation_max{};
246 kernels::calculateActivationRangeQuantized(luci_actfunc(options->fused_activation_function()),
247 output, &activation_min, &activation_max);
249 tflite::ConvParams params{};
250 params.padding_values.height = compute_padding_h(input, filter, options);
251 params.padding_values.width = compute_padding_w(input, filter, options);
252 params.stride_height = options->stride_h();
253 params.stride_width = options->stride_w();
254 params.dilation_height_factor = options->dilation_h_factor();
255 params.dilation_width_factor = options->dilation_w_factor();
256 // The kernel expects filter zero points to be negated.
257 params.input_offset = -Tensor::zero_point(input); // Note the '-'.
258 params.weights_offset = 0; // Unused in tflite code
259 params.output_offset = Tensor::zero_point(output);
260 params.quantized_activation_min = activation_min;
261 params.quantized_activation_max = activation_max;
263 const std::vector<double> effective_output_scales = kernels::getQuantizedConvolutionMultiplers(
264 Tensor::scale(input), Tensor::scales(filter), Tensor::scale(output));
266 std::vector<kernels::ChannelQuantMultipliers> quant_multipliers =
267 kernels::quantizeMultipliers(effective_output_scales);
269 std::vector<int32_t> shifts;
270 std::transform(quant_multipliers.begin(), quant_multipliers.end(), std::back_inserter(shifts),
271 [](kernels::ChannelQuantMultipliers cm) { return cm.shift; });
272 std::vector<int32_t> multipliers;
273 std::transform(quant_multipliers.begin(), quant_multipliers.end(),
274 std::back_inserter(multipliers),
275 [](kernels::ChannelQuantMultipliers cm) { return cm.multiplier; });
277 auto *input_data = runtime_graph->getDataByTensor(input);
278 auto *output_data = runtime_graph->getDataByTensor(output);
280 auto *filter_data = runtime_graph->getConstDataByTensor(filter);
281 auto *bias_data = runtime_graph->getConstDataByTensor(bias);
283 luci_interpreter_pal::ConvPerChannel(
284 params, multipliers.data(), shifts.data(), kernels::getTensorShape(input),
285 kernels::getTensorData<int8_t>(input_data), kernels::getTensorShape(filter),
286 kernels::getTensorData<int8_t>(filter_data), kernels::getTensorShape(bias),
287 kernels::getTensorData<int32_t>(bias_data), kernels::getTensorShape(output),
288 kernels::getTensorData<int8_t>(output_data), kernels::getTensorShape(nullptr), nullptr);
291 void evalQuantizedS16(const circle::Tensor *input, const circle::Tensor *filter,
292 const circle::Tensor *bias, const circle::Tensor *output,
293 const circle::Conv2DOptions *options, BaseRuntimeGraph *runtime_graph)
295 auto *raw_input_data = runtime_graph->getDataByTensor(input);
296 auto *raw_output_data = runtime_graph->getDataByTensor(output);
298 auto *raw_filter_data = runtime_graph->getConstDataByTensor(filter);
299 auto *raw_bias_data = runtime_graph->getConstDataByTensor(bias);
301 const auto *input_data = kernels::getTensorData<uint8_t>(raw_input_data);
302 const auto *filter_data = kernels::getTensorData<uint8_t>(raw_filter_data);
303 const auto *bias_data = kernels::getTensorData<int32_t>(raw_bias_data);
304 auto *output_data = kernels::getTensorData<uint8_t>(raw_output_data);
306 const int32_t batches = Tensor::dim(input, 0);
307 const int32_t input_height = Tensor::dim(input, 1);
308 const int32_t input_width = Tensor::dim(input, 2);
309 const int32_t input_depth = Tensor::dim(input, 3);
310 const int32_t output_depth = Tensor::dim(filter, 0);
311 const int32_t filter_height = Tensor::dim(filter, 1);
312 const int32_t filter_width = Tensor::dim(filter, 2);
313 const int32_t output_height = Tensor::dim(output, 1);
314 const int32_t output_width = Tensor::dim(output, 2);
316 const int32_t stride_height = options->stride_h();
317 const int32_t stride_width = options->stride_w();
318 const int32_t dilation_height_factor = options->dilation_h_factor();
319 const int32_t dilation_width_factor = options->dilation_w_factor();
321 int32_t activation_min{};
322 int32_t activation_max{};
323 kernels::calculateActivationRangeQuantized(luci_actfunc(options->fused_activation_function()),
324 output, &activation_min, &activation_max);
326 const std::vector<double> effective_output_scale = kernels::getQuantizedConvolutionMultiplers(
327 Tensor::scale(input), Tensor::scales(filter), Tensor::scale(output));
329 const std::vector<kernels::ChannelQuantMultipliers> multipliers_raw =
330 kernels::quantizeMultipliers(effective_output_scale);
331 kernels::BroadcastableWrapper<kernels::ChannelQuantMultipliers> multipliers(multipliers_raw);
333 for (int32_t batch = 0; batch < batches; ++batch)
335 for (int32_t out_y = 0; out_y < output_height; ++out_y)
337 for (int32_t out_x = 0; out_x < output_width; ++out_x)
339 for (int32_t out_c = 0; out_c < output_depth; ++out_c)
341 const int32_t in_y_origin =
342 out_y * stride_height - compute_padding_h(input, filter, options);
343 const int32_t in_x_origin =
344 out_x * stride_width - compute_padding_w(input, filter, options);
346 for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y)
348 for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x)
350 const int32_t in_y = in_y_origin + dilation_height_factor * filter_y;
351 const int32_t in_x = in_x_origin + dilation_width_factor * filter_x;
352 if ((in_y >= 0 && in_y < input_height) && (in_x >= 0 && in_x < input_width))
354 for (int32_t in_c = 0; in_c < input_depth; ++in_c)
356 const int16_t input_val =
357 input_data[kernels::calcOffset(input, batch, in_y, in_x, in_c)];
358 const int16_t filter_val =
359 filter_data[kernels::calcOffset(filter, out_c, filter_y, filter_x, in_c)];
360 acc += static_cast<int64_t>(input_val) * static_cast<int64_t>(filter_val);
367 acc += bias_data[out_c];
370 int32_t scaled_acc = tflite::MultiplyByQuantizedMultiplier(
371 acc, multipliers[out_c].multiplier, multipliers[out_c].shift);
373 scaled_acc = std::max(scaled_acc, activation_min);
374 scaled_acc = std::min(scaled_acc, activation_max);
376 output_data[kernels::calcOffset(output, batch, out_y, out_x, out_c)] = scaled_acc;
386 void configure_kernel_CircleConv2D(const circle::Operator *cur_op, BaseRuntimeGraph *runtime_graph)
388 const auto input_index = cur_op->inputs()->operator[](0);
389 const auto filter_index = cur_op->inputs()->operator[](1);
390 const auto bias_index = cur_op->inputs()->operator[](2);
391 const auto output_index = cur_op->outputs()->operator[](0);
393 assert(input_index != -1);
394 assert(filter_index != -1);
395 assert(output_index != -1);
397 const auto input = runtime_graph->getCircleTensorByIndex(input_index);
398 const auto filter = runtime_graph->getCircleTensorByIndex(filter_index);
399 const auto bias = runtime_graph->getCircleTensorByIndex(bias_index);
400 const auto output = runtime_graph->getCircleTensorByIndex(output_index);
402 assert(input != nullptr);
403 assert(filter != nullptr);
405 auto filter_data = runtime_graph->getConstDataByTensor(filter);
407 assert(filter_data != nullptr);
409 const auto *options = cur_op->builtin_options_as_Conv2DOptions();
411 if (Tensor::element_type(input) == DataType::FLOAT32 &&
412 Tensor::element_type(filter) == DataType::FLOAT32)
414 LUCI_INTERPRETER_CHECK(bias == nullptr || Tensor::element_type(bias) == DataType::FLOAT32);
417 else if (Tensor::element_type(input) == DataType::U8 &&
418 Tensor::element_type(filter) == DataType::U8)
420 LUCI_INTERPRETER_CHECK(bias == nullptr || Tensor::element_type(bias) == DataType::S32);
422 else if (Tensor::element_type(input) == DataType::S8 &&
423 Tensor::element_type(filter) == DataType::S8)
425 LUCI_INTERPRETER_CHECK(bias == nullptr || Tensor::element_type(bias) == DataType::S32);
426 LUCI_INTERPRETER_CHECK(Tensor::num_dims(filter) == 4);
427 LUCI_INTERPRETER_CHECK(Tensor::scales(filter).size() ==
428 static_cast<size_t>(Tensor::dim(filter, 0)));
429 for (auto zerop : Tensor::zero_points(filter))
431 LUCI_INTERPRETER_CHECK(zerop == 0);
434 else if (Tensor::element_type(input) == DataType::S16 &&
435 Tensor::element_type(filter) == DataType::S16)
437 LUCI_INTERPRETER_CHECK(bias == nullptr || Tensor::element_type(bias) == DataType::S64);
442 assert(false && "Unsupported type.");
444 LUCI_INTERPRETER_CHECK(Tensor::element_type(output) == Tensor::element_type(input));
445 LUCI_INTERPRETER_CHECK(Tensor::num_dims(input) == 4 && Tensor::num_dims(filter) == 4);
447 const int32_t output_depth = Tensor::dim(filter, 0);
448 LUCI_INTERPRETER_CHECK(Tensor::dim(filter, 3) == Tensor::dim(input, 3));
450 LUCI_INTERPRETER_CHECK(bias == nullptr ||
451 (Tensor::num_dims(bias) == 1 && Tensor::dim(bias, 0) == output_depth));
453 switch (options->fused_activation_function())
455 case circle::ActivationFunctionType_NONE:
456 case circle::ActivationFunctionType_RELU:
457 case circle::ActivationFunctionType_RELU6:
458 case circle::ActivationFunctionType_RELU_N1_TO_1:
461 assert(false && "Unsupported fused activation");
465 void execute_kernel_CircleConv2D(const circle::Operator *cur_op, BaseRuntimeGraph *runtime_graph,
468 const auto input_index = cur_op->inputs()->operator[](0);
469 const auto weight_index = cur_op->inputs()->operator[](1);
470 const auto bias_index = cur_op->inputs()->operator[](2);
471 const auto output_index = cur_op->outputs()->operator[](0);
473 assert(input_index != -1);
474 assert(weight_index != -1);
475 assert(output_index != -1);
477 const auto input = runtime_graph->getCircleTensorByIndex(input_index);
478 const auto weights = runtime_graph->getCircleTensorByIndex(weight_index);
479 const auto bias = runtime_graph->getCircleTensorByIndex(bias_index);
480 const auto output = runtime_graph->getCircleTensorByIndex(output_index);
482 assert(input != nullptr);
483 assert(weights != nullptr);
484 assert(output != nullptr);
486 const auto *options = cur_op->builtin_options_as_Conv2DOptions();
488 switch (Tensor::element_type(input))
491 case DataType::FLOAT32:
492 if (Tensor::element_type(weights) == DataType::FLOAT32)
494 evalFloat(input, weights, bias, output, options, runtime_graph);
500 if (Tensor::scales(weights).size() == 1)
502 evalQuantized(input, weights, bias, output, options, runtime_graph);
504 else if (Tensor::scales(weights).size() > 1)
506 LUCI_INTERPRETER_CHECK(Tensor::num_dims(weights) == 4);
507 LUCI_INTERPRETER_CHECK(Tensor::scales(weights).size() ==
508 static_cast<size_t>(Tensor::dim(weights, 0)));
509 evalQuantizedPerChannel(input, weights, bias, output, options, runtime_graph);
513 evalQuantizedS8PerChannel(input, weights, bias, output, options, runtime_graph);
516 evalQuantizedS16(input, weights, bias, output, options, runtime_graph);
520 assert(false && "Unsupported type.");
524 } // namespace luci_interpreter