/*
 * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
 * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "kernels/Conv2D.h"

#include "kernels/Utils.h"

#include "PALConv2d.h"

#include <algorithm>
#include <iterator>
#include <stdexcept>
27 namespace luci_interpreter
32 Conv2D::Conv2D(const Tensor *input, const Tensor *filter, const Tensor *bias, Tensor *output,
33 Tensor *im2col, const Conv2DParams ¶ms)
34 : KernelWithParams<Conv2DParams>({input, filter, bias}, {output, im2col}, params)
38 void Conv2D::configure()
40 // TensorFlow Lite (as of v2.2.0) supports the following combinations of types:
41 // | input filter bias output |
42 // ----+---------------------------+
43 // (1) | float float float float |
44 // (2) | float int8 float float | hybrid
45 // (3) | uint8 uint8 int32 uint8 | quantized
46 // (4) | int8 int8 int32 int8 | quantized per channel
48 // We only support (1), (3) and (4) for now, and additionally the following:
49 // | input filter bias output |
50 // ----+---------------------------+
51 // (5) | int16 int16 int64 int16 |
53 if (input()->element_type() == DataType::FLOAT32 && filter()->element_type() == DataType::FLOAT32)
55 LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::FLOAT32);
57 else if (input()->element_type() == DataType::U8 && filter()->element_type() == DataType::U8)
59 LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::S32);
61 else if (input()->element_type() == DataType::S8 && filter()->element_type() == DataType::S8)
63 LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::S32);
64 LUCI_INTERPRETER_CHECK(filter()->shape().num_dims() == 4);
65 LUCI_INTERPRETER_CHECK(filter()->scales().size() ==
66 static_cast<size_t>(filter()->shape().dim(0)));
67 for (auto zerop : filter()->zero_points())
69 LUCI_INTERPRETER_CHECK(zerop == 0);
72 else if (input()->element_type() == DataType::S16 && filter()->element_type() == DataType::S16)
74 LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::S64);
78 throw std::runtime_error("Unsupported type.");
80 LUCI_INTERPRETER_CHECK(output()->element_type() == input()->element_type());
82 const Shape &input_shape = input()->shape();
83 const Shape &filter_shape = filter()->shape();
84 LUCI_INTERPRETER_CHECK(input_shape.num_dims() == 4 && filter_shape.num_dims() == 4);
86 const int32_t batches = input_shape.dim(0);
87 const int32_t input_height = input_shape.dim(1);
88 const int32_t input_width = input_shape.dim(2);
89 const int32_t output_depth = filter_shape.dim(0);
90 const int32_t filter_height = filter_shape.dim(1);
91 const int32_t filter_width = filter_shape.dim(2);
92 LUCI_INTERPRETER_CHECK(filter_shape.dim(3) == input_shape.dim(3));
94 LUCI_INTERPRETER_CHECK(bias() == nullptr || (bias()->shape().num_dims() == 1 &&
95 bias()->shape().dim(0) == output_depth));
97 const int32_t output_height =
98 computeOutputSize(_params.padding, input_height, filter_height, _params.stride_height,
99 _params.dilation_height_factor);
100 const int32_t output_width =
101 computeOutputSize(_params.padding, input_width, filter_width, _params.stride_width,
102 _params.dilation_width_factor);
104 _padding_height = computePadding(_params.stride_height, _params.dilation_height_factor,
105 input_height, filter_height, output_height);
106 _padding_width = computePadding(_params.stride_width, _params.dilation_width_factor, input_width,
107 filter_width, output_width);
109 output()->resize({batches, output_height, output_width, output_depth});
111 // Allocate tensor for Im2Col, if needed.
112 // The checks here should be aligned with the actual implementation.
113 const bool need_dilated_im2col =
114 _params.dilation_height_factor != 1 || _params.dilation_width_factor != 1;
115 const bool need_non_dilated_im2col = _params.stride_height != 1 || _params.stride_width != 1 ||
116 filter_height != 1 || filter_width != 1;
118 input()->element_type() != DataType::S16 && (need_dilated_im2col || need_non_dilated_im2col);
121 const int input_depth = input_shape.dim(3);
122 Shape im2col_shape{batches, output_height, output_width,
123 input_depth * filter_height * filter_width};
124 auto im2col = getOutputTensors()[1];
125 im2col->resize(im2col_shape);
129 auto im2col = getOutputTensors()[1];
130 im2col->set_allocatable(false);
133 switch (_params.activation)
135 case Activation::NONE:
136 case Activation::RELU:
137 case Activation::RELU6:
138 case Activation::RELU_N1_TO_1:
141 throw std::runtime_error("Unsupported fused activation");
145 void Conv2D::execute() const
147 switch (input()->element_type())
149 case DataType::FLOAT32:
150 if (filter()->element_type() == DataType::FLOAT32)
155 throw std::runtime_error("Unsupported type.");
157 if (filter()->scales().size() == 1)
161 else if (filter()->scales().size() > 1)
163 LUCI_INTERPRETER_CHECK(filter()->shape().num_dims() == 4);
164 LUCI_INTERPRETER_CHECK(filter()->scales().size() ==
165 static_cast<size_t>(filter()->shape().dim(0)));
166 evalQuantizedPerChannel();
170 evalQuantizedS8PerChannel();
176 throw std::runtime_error("Unsupported type.");
180 void Conv2D::evalFloat() const
182 float activation_min{};
183 float activation_max{};
184 calculateActivationRange(_params.activation, &activation_min, &activation_max);
186 tflite::ConvParams params{};
187 params.padding_values.height = _padding_height;
188 params.padding_values.width = _padding_width;
189 params.stride_height = _params.stride_height;
190 params.stride_width = _params.stride_width;
191 params.dilation_height_factor = _params.dilation_height_factor;
192 params.dilation_width_factor = _params.dilation_width_factor;
193 params.float_activation_min = activation_min;
194 params.float_activation_max = activation_max;
196 float *im2col_data = nullptr;
197 auto im2col = getOutputTensors()[1];
200 im2col_data = im2col->data<float>();
202 luci_interpreter_pal::Conv(
203 params, getTensorShape(input()), getTensorData<float>(input()), getTensorShape(filter()),
204 getTensorData<float>(filter()), getTensorShape(bias()), getTensorData<float>(bias()),
205 getTensorShape(output()), getTensorData<float>(output()), getTensorShape(im2col), im2col_data);
208 void Conv2D::evalQuantized() const
210 const auto input_scale = static_cast<double>(input()->scale());
211 const auto filter_scale = static_cast<double>(filter()->scale());
212 const auto output_scale = static_cast<double>(output()->scale());
214 const double real_multiplier = input_scale * filter_scale / output_scale;
215 int32_t output_multiplier{};
217 quantizeMultiplier(real_multiplier, &output_multiplier, &output_shift);
219 int32_t activation_min{};
220 int32_t activation_max{};
221 calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
223 tflite::ConvParams params{};
224 params.padding_values.height = _padding_height;
225 params.padding_values.width = _padding_width;
226 params.stride_height = _params.stride_height;
227 params.stride_width = _params.stride_width;
228 params.dilation_height_factor = _params.dilation_height_factor;
229 params.dilation_width_factor = _params.dilation_width_factor;
230 // The kernel expects input and filter zero points to be negated.
231 params.input_offset = -input()->zero_point(); // Note the '-'.
232 params.weights_offset = -filter()->zero_point(); // Note the '-'.
233 params.output_offset = output()->zero_point();
234 params.output_multiplier = output_multiplier;
235 params.output_shift = output_shift;
236 params.quantized_activation_min = activation_min;
237 params.quantized_activation_max = activation_max;
239 auto im2col = getOutputTensors()[1];
240 luci_interpreter_pal::Conv(params, getTensorShape(input()), getTensorData<uint8_t>(input()),
241 getTensorShape(filter()), getTensorData<uint8_t>(filter()),
242 getTensorShape(bias()), getTensorData<int32_t>(bias()),
243 getTensorShape(output()), getTensorData<uint8_t>(output()),
244 getTensorShape(im2col), getTensorData<uint8_t>(im2col));
247 void Conv2D::evalQuantizedPerChannel() const
249 const auto *input_data = getTensorData<uint8_t>(input());
250 const auto *filter_data = getTensorData<uint8_t>(filter());
251 const auto *bias_data = getTensorData<int32_t>(bias());
252 auto *output_data = getTensorData<uint8_t>(output());
254 const Shape &input_shape = input()->shape();
255 const Shape &filter_shape = filter()->shape();
256 const Shape &output_shape = output()->shape();
258 const int32_t batches = input_shape.dim(0);
259 const int32_t input_height = input_shape.dim(1);
260 const int32_t input_width = input_shape.dim(2);
261 const int32_t input_depth = input_shape.dim(3);
262 const int32_t output_depth = filter_shape.dim(0);
263 const int32_t filter_height = filter_shape.dim(1);
264 const int32_t filter_width = filter_shape.dim(2);
265 const int32_t output_height = output_shape.dim(1);
266 const int32_t output_width = output_shape.dim(2);
268 const int32_t stride_height = _params.stride_height;
269 const int32_t stride_width = _params.stride_width;
270 const int32_t dilation_height_factor = _params.dilation_height_factor;
271 const int32_t dilation_width_factor = _params.dilation_width_factor;
273 int32_t activation_min{};
274 int32_t activation_max{};
275 calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
277 const std::vector<double> effective_output_scale =
278 getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale());
280 const std::vector<ChannelQuantMultipliers> multipliers_raw =
281 quantizeMultipliers(effective_output_scale);
282 BroadcastableWrapper<ChannelQuantMultipliers> quant_multipliers(multipliers_raw);
284 for (int32_t batch = 0; batch < batches; ++batch)
286 for (int32_t out_y = 0; out_y < output_height; ++out_y)
288 for (int32_t out_x = 0; out_x < output_width; ++out_x)
290 for (int32_t out_c = 0; out_c < output_depth; ++out_c)
292 const int32_t in_y_origin = out_y * stride_height - _padding_height;
293 const int32_t in_x_origin = out_x * stride_width - _padding_width;
295 for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y)
297 for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x)
299 const int32_t in_y = in_y_origin + dilation_height_factor * filter_y;
300 const int32_t in_x = in_x_origin + dilation_width_factor * filter_x;
301 if ((in_y >= 0 && in_y < input_height) && (in_x >= 0 && in_x < input_width))
303 for (int32_t in_c = 0; in_c < input_depth; ++in_c)
305 const uint8_t input_val =
306 input_data[calcOffset(input_shape, batch, in_y, in_x, in_c)];
307 const uint8_t filter_val =
308 filter_data[calcOffset(filter_shape, out_c, filter_y, filter_x, in_c)];
309 acc += static_cast<int32_t>(input_val - input()->zero_point()) *
310 static_cast<int32_t>(filter_val - filter()->zero_points()[out_c]);
317 acc += bias_data[out_c];
320 int32_t scaled_acc = tflite::MultiplyByQuantizedMultiplier(
321 acc, quant_multipliers[out_c].multiplier, quant_multipliers[out_c].shift);
323 scaled_acc += output()->zero_point();
324 scaled_acc = std::max(scaled_acc, activation_min);
325 scaled_acc = std::min(scaled_acc, activation_max);
326 output_data[calcOffset(output_shape, batch, out_y, out_x, out_c)] = scaled_acc;
333 void Conv2D::evalQuantizedS8PerChannel() const
335 int32_t activation_min{};
336 int32_t activation_max{};
337 calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
339 tflite::ConvParams params{};
340 params.padding_values.height = _padding_height;
341 params.padding_values.width = _padding_width;
342 params.stride_height = _params.stride_height;
343 params.stride_width = _params.stride_width;
344 params.dilation_height_factor = _params.dilation_height_factor;
345 params.dilation_width_factor = _params.dilation_width_factor;
346 // The kernel expects filter zero points to be negated.
347 params.input_offset = -input()->zero_point(); // Note the '-'.
348 params.weights_offset = 0; // Unused in tflite code
349 params.output_offset = output()->zero_point();
350 params.quantized_activation_min = activation_min;
351 params.quantized_activation_max = activation_max;
353 const std::vector<double> effective_output_scales =
354 getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale());
356 std::vector<ChannelQuantMultipliers> quant_multipliers =
357 quantizeMultipliers(effective_output_scales);
359 std::vector<int32_t> shifts;
360 std::transform(quant_multipliers.begin(), quant_multipliers.end(), std::back_inserter(shifts),
361 [](ChannelQuantMultipliers cm) { return cm.shift; });
362 std::vector<int32_t> multipliers;
363 std::transform(quant_multipliers.begin(), quant_multipliers.end(),
364 std::back_inserter(multipliers),
365 [](ChannelQuantMultipliers cm) { return cm.multiplier; });
367 int8_t *im2col_data = nullptr;
368 auto im2col = getOutputTensors()[1];
371 im2col_data = im2col->data<int8_t>();
374 luci_interpreter_pal::ConvPerChannel(
375 params, multipliers.data(), shifts.data(), getTensorShape(input()),
376 getTensorData<int8_t>(input()), getTensorShape(filter()), getTensorData<int8_t>(filter()),
377 getTensorShape(bias()), getTensorData<int32_t>(bias()), getTensorShape(output()),
378 getTensorData<int8_t>(output()), getTensorShape(im2col), im2col_data);
381 void Conv2D::evalQuantizedS16() const
383 const auto *input_data = getTensorData<int16_t>(input());
384 const auto *filter_data = getTensorData<int16_t>(filter());
385 const auto *bias_data = getTensorData<int64_t>(bias());
386 auto *output_data = getTensorData<int16_t>(output());
388 const Shape &input_shape = input()->shape();
389 const Shape &filter_shape = filter()->shape();
390 const Shape &output_shape = output()->shape();
392 const int32_t batches = input_shape.dim(0);
393 const int32_t input_height = input_shape.dim(1);
394 const int32_t input_width = input_shape.dim(2);
395 const int32_t input_depth = input_shape.dim(3);
396 const int32_t output_depth = filter_shape.dim(0);
397 const int32_t filter_height = filter_shape.dim(1);
398 const int32_t filter_width = filter_shape.dim(2);
399 const int32_t output_height = output_shape.dim(1);
400 const int32_t output_width = output_shape.dim(2);
402 const int32_t stride_height = _params.stride_height;
403 const int32_t stride_width = _params.stride_width;
404 const int32_t dilation_height_factor = _params.dilation_height_factor;
405 const int32_t dilation_width_factor = _params.dilation_width_factor;
407 int32_t activation_min{};
408 int32_t activation_max{};
409 calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
411 const std::vector<double> effective_output_scale =
412 getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale());
414 const std::vector<ChannelQuantMultipliers> multipliers_raw =
415 quantizeMultipliers(effective_output_scale);
416 BroadcastableWrapper<ChannelQuantMultipliers> multipliers(multipliers_raw);
418 for (int32_t batch = 0; batch < batches; ++batch)
420 for (int32_t out_y = 0; out_y < output_height; ++out_y)
422 for (int32_t out_x = 0; out_x < output_width; ++out_x)
424 for (int32_t out_c = 0; out_c < output_depth; ++out_c)
426 const int32_t in_y_origin = out_y * stride_height - _padding_height;
427 const int32_t in_x_origin = out_x * stride_width - _padding_width;
429 for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y)
431 for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x)
433 const int32_t in_y = in_y_origin + dilation_height_factor * filter_y;
434 const int32_t in_x = in_x_origin + dilation_width_factor * filter_x;
435 if ((in_y >= 0 && in_y < input_height) && (in_x >= 0 && in_x < input_width))
437 for (int32_t in_c = 0; in_c < input_depth; ++in_c)
439 const int16_t input_val =
440 input_data[calcOffset(input_shape, batch, in_y, in_x, in_c)];
441 const int16_t filter_val =
442 filter_data[calcOffset(filter_shape, out_c, filter_y, filter_x, in_c)];
443 acc += static_cast<int64_t>(input_val) * static_cast<int64_t>(filter_val);
450 acc += bias_data[out_c];
453 int32_t scaled_acc = tflite::MultiplyByQuantizedMultiplier(
454 acc, multipliers[out_c].multiplier, multipliers[out_c].shift);
456 scaled_acc = std::max(scaled_acc, activation_min);
457 scaled_acc = std::min(scaled_acc, activation_max);
459 output_data[calcOffset(output_shape, batch, out_y, out_x, out_c)] = scaled_acc;
466 } // namespace kernels
467 } // namespace luci_interpreter