/*
 * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
 * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "kernels/Conv2D.h"

#include "kernels/Utils.h"

#include <tensorflow/lite/kernels/internal/optimized/legacy_optimized_ops.h>

#include <algorithm>
#include <memory>
#include <new>
#include <stdexcept>
#include <thread>
#include <vector>

27 namespace luci_interpreter
32 Conv2D::Conv2D(const Tensor *input, const Tensor *filter, const Tensor *bias, Tensor *output,
33 const Conv2DParams ¶ms)
34 : KernelWithParams<Conv2DParams>({input, filter, bias}, {output}, params)
38 void Conv2D::configure()
40 // TensorFlow Lite (as of v2.2.0) supports the following combinations of types:
41 // | input filter bias output |
42 // ----+---------------------------+
43 // (1) | float float float float |
44 // (2) | float int8 float float | hybrid
45 // (3) | uint8 uint8 int32 uint8 | quantized
46 // (4) | int8 int8 int32 int8 | quantized per channel
48 // We only support (1) and (3) for now, and additionally the following:
49 // | input filter bias output |
50 // ----+---------------------------+
51 // (5) | int16 int16 int64 int16 |
53 if (input()->element_type() == DataType::FLOAT32 && filter()->element_type() == DataType::FLOAT32)
55 LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::FLOAT32);
57 else if (input()->element_type() == DataType::U8 && filter()->element_type() == DataType::U8)
59 LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::S32);
61 else if (input()->element_type() == DataType::S16 && filter()->element_type() == DataType::S16)
63 LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::S64);
67 throw std::runtime_error("Unsupported type.");
69 LUCI_INTERPRETER_CHECK(output()->element_type() == input()->element_type());
71 const Shape &input_shape = input()->shape();
72 const Shape &filter_shape = filter()->shape();
73 LUCI_INTERPRETER_CHECK(input_shape.num_dims() == 4 && filter_shape.num_dims() == 4);
75 const int32_t batches = input_shape.dim(0);
76 const int32_t input_height = input_shape.dim(1);
77 const int32_t input_width = input_shape.dim(2);
78 const int32_t output_depth = filter_shape.dim(0);
79 const int32_t filter_height = filter_shape.dim(1);
80 const int32_t filter_width = filter_shape.dim(2);
81 LUCI_INTERPRETER_CHECK(filter_shape.dim(3) == input_shape.dim(3));
83 LUCI_INTERPRETER_CHECK(bias() == nullptr || (bias()->shape().num_dims() == 1 &&
84 bias()->shape().dim(0) == output_depth));
86 const int32_t output_height =
87 computeOutputSize(_params.padding, input_height, filter_height, _params.stride_height,
88 _params.dilation_height_factor);
89 const int32_t output_width =
90 computeOutputSize(_params.padding, input_width, filter_width, _params.stride_width,
91 _params.dilation_width_factor);
93 _padding_height = computePadding(_params.stride_height, _params.dilation_height_factor,
94 input_height, filter_height, output_height);
95 _padding_width = computePadding(_params.stride_width, _params.dilation_width_factor, input_width,
96 filter_width, output_width);
98 output()->resize({batches, output_height, output_width, output_depth});
100 // Allocate tensor for Im2Col, if needed.
101 // The checks here should be aligned with the actual implementation.
102 const bool need_dilated_im2col =
103 _params.dilation_height_factor != 1 || _params.dilation_width_factor != 1;
104 const bool need_non_dilated_im2col = _params.stride_height != 1 || _params.stride_width != 1 ||
105 filter_height != 1 || filter_width != 1;
106 const bool need_im2col =
107 input()->element_type() != DataType::S16 && (need_dilated_im2col || need_non_dilated_im2col);
110 const int input_depth = input_shape.dim(3);
111 Shape im2col_shape{batches, output_height, output_width,
112 input_depth * filter_height * filter_width};
116 std::make_unique<Tensor>(input()->element_type(), im2col_shape, AffineQuantization{}, "");
118 catch (std::bad_alloc &ba)
120 // Failed memory allocation
126 void Conv2D::execute() const
128 switch (input()->element_type())
130 case DataType::FLOAT32:
131 if (filter()->element_type() == DataType::FLOAT32)
136 throw std::runtime_error("Unsupported type.");
138 if (filter()->scales().size() == 1)
142 else if (filter()->scales().size() > 1)
144 LUCI_INTERPRETER_CHECK(filter()->shape().num_dims() == 4);
145 LUCI_INTERPRETER_CHECK(filter()->scales().size() ==
146 static_cast<size_t>(filter()->shape().dim(0)));
147 evalQuantizedPerChannel();
154 throw std::runtime_error("Unsupported type.");
157 _im2col->deallocate();
160 void Conv2D::evalFloat() const
162 float activation_min{};
163 float activation_max{};
164 calculateActivationRange(_params.activation, &activation_min, &activation_max);
166 tflite::ConvParams params{};
167 params.padding_values.height = _padding_height;
168 params.padding_values.width = _padding_width;
169 params.stride_height = _params.stride_height;
170 params.stride_width = _params.stride_width;
171 params.dilation_height_factor = _params.dilation_height_factor;
172 params.dilation_width_factor = _params.dilation_width_factor;
173 params.float_activation_min = activation_min;
174 params.float_activation_max = activation_max;
177 tflite::optimized_ops::Conv(params, getTensorShape(input()), getTensorData<float>(input()),
178 getTensorShape(filter()), getTensorData<float>(filter()),
179 getTensorShape(bias()), getTensorData<float>(bias()),
180 getTensorShape(output()), getTensorData<float>(output()),
181 getTensorShape(_im2col.get()), getTensorData<float>(_im2col.get()));
183 tflite::reference_ops::Conv(
184 params, getTensorShape(input()), getTensorData<float>(input()), getTensorShape(filter()),
185 getTensorData<float>(filter()), getTensorShape(bias()), getTensorData<float>(bias()),
186 getTensorShape(output()), getTensorData<float>(output()), tflite::RuntimeShape(), nullptr);
189 void Conv2D::evalQuantized() const
191 const auto input_scale = static_cast<double>(input()->scale());
192 const auto filter_scale = static_cast<double>(filter()->scale());
193 const auto output_scale = static_cast<double>(output()->scale());
195 const double real_multiplier = input_scale * filter_scale / output_scale;
196 int32_t output_multiplier{};
198 quantizeMultiplier(real_multiplier, &output_multiplier, &output_shift);
200 int32_t activation_min{};
201 int32_t activation_max{};
202 calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
204 tflite::ConvParams params{};
205 params.padding_values.height = _padding_height;
206 params.padding_values.width = _padding_width;
207 params.stride_height = _params.stride_height;
208 params.stride_width = _params.stride_width;
209 params.dilation_height_factor = _params.dilation_height_factor;
210 params.dilation_width_factor = _params.dilation_width_factor;
211 // The kernel expects input and filter zero points to be negated.
212 params.input_offset = -input()->zero_point(); // Note the '-'.
213 params.weights_offset = -filter()->zero_point(); // Note the '-'.
214 params.output_offset = output()->zero_point();
215 params.output_multiplier = output_multiplier;
216 params.output_shift = output_shift;
217 params.quantized_activation_min = activation_min;
218 params.quantized_activation_max = activation_max;
220 // TODO This should only be done once (although it takes only a few microseconds).
221 // Also, the user should be able to adjust the number of threads.
222 auto gemmlowp_context = std::make_unique<gemmlowp::GemmContext>();
223 gemmlowp_context->set_max_num_threads(static_cast<int>(std::thread::hardware_concurrency()));
225 tflite::optimized_ops::Conv(
226 params, getTensorShape(input()), getTensorData<uint8_t>(input()), getTensorShape(filter()),
227 getTensorData<uint8_t>(filter()), getTensorShape(bias()), getTensorData<int32_t>(bias()),
228 getTensorShape(output()), getTensorData<uint8_t>(output()), getTensorShape(_im2col.get()),
229 getTensorData<uint8_t>(_im2col.get()), gemmlowp_context.get());
232 void Conv2D::evalQuantizedPerChannel() const
234 const auto *input_data = getTensorData<uint8_t>(input());
235 const auto *filter_data = getTensorData<uint8_t>(filter());
236 const auto *bias_data = getTensorData<int32_t>(bias());
237 auto *output_data = getTensorData<uint8_t>(output());
239 const Shape &input_shape = input()->shape();
240 const Shape &filter_shape = filter()->shape();
241 const Shape &output_shape = output()->shape();
243 const int32_t batches = input_shape.dim(0);
244 const int32_t input_height = input_shape.dim(1);
245 const int32_t input_width = input_shape.dim(2);
246 const int32_t input_depth = input_shape.dim(3);
247 const int32_t output_depth = filter_shape.dim(0);
248 const int32_t filter_height = filter_shape.dim(1);
249 const int32_t filter_width = filter_shape.dim(2);
250 const int32_t output_height = output_shape.dim(1);
251 const int32_t output_width = output_shape.dim(2);
253 const int32_t stride_height = _params.stride_height;
254 const int32_t stride_width = _params.stride_width;
255 const int32_t dilation_height_factor = _params.dilation_height_factor;
256 const int32_t dilation_width_factor = _params.dilation_width_factor;
258 int32_t activation_min{};
259 int32_t activation_max{};
260 calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
262 const std::vector<double> effective_output_scale =
263 getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale());
265 const std::vector<ChannelQuantMultipliers> multipliers_raw =
266 quantizeMultipliers(effective_output_scale);
267 BroadcastableWrapper<ChannelQuantMultipliers> quant_multipliers(multipliers_raw);
269 for (int32_t batch = 0; batch < batches; ++batch)
271 for (int32_t out_y = 0; out_y < output_height; ++out_y)
273 for (int32_t out_x = 0; out_x < output_width; ++out_x)
275 for (int32_t out_c = 0; out_c < output_depth; ++out_c)
277 const int32_t in_y_origin = out_y * stride_height - _padding_height;
278 const int32_t in_x_origin = out_x * stride_width - _padding_width;
280 for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y)
282 for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x)
284 const int32_t in_y = in_y_origin + dilation_height_factor * filter_y;
285 const int32_t in_x = in_x_origin + dilation_width_factor * filter_x;
286 if ((in_y >= 0 && in_y < input_height) && (in_x >= 0 && in_x < input_width))
288 for (int32_t in_c = 0; in_c < input_depth; ++in_c)
290 const uint8_t input_val =
291 input_data[calcOffset(input_shape, batch, in_y, in_x, in_c)];
292 const uint8_t filter_val =
293 filter_data[calcOffset(filter_shape, out_c, filter_y, filter_x, in_c)];
294 acc += static_cast<int32_t>(input_val - input()->zero_point()) *
295 static_cast<int32_t>(filter_val - filter()->zero_points()[out_c]);
302 acc += bias_data[out_c];
305 int32_t scaled_acc = tflite::MultiplyByQuantizedMultiplier(
306 acc, quant_multipliers[out_c].multiplier, quant_multipliers[out_c].shift);
308 scaled_acc += output()->zero_point();
309 scaled_acc = std::max(scaled_acc, activation_min);
310 scaled_acc = std::min(scaled_acc, activation_max);
311 output_data[calcOffset(output_shape, batch, out_y, out_x, out_c)] = scaled_acc;
318 void Conv2D::evalQuantizedS16() const
320 const auto *input_data = getTensorData<int16_t>(input());
321 const auto *filter_data = getTensorData<int16_t>(filter());
322 const auto *bias_data = getTensorData<int64_t>(bias());
323 auto *output_data = getTensorData<int16_t>(output());
325 const Shape &input_shape = input()->shape();
326 const Shape &filter_shape = filter()->shape();
327 const Shape &output_shape = output()->shape();
329 const int32_t batches = input_shape.dim(0);
330 const int32_t input_height = input_shape.dim(1);
331 const int32_t input_width = input_shape.dim(2);
332 const int32_t input_depth = input_shape.dim(3);
333 const int32_t output_depth = filter_shape.dim(0);
334 const int32_t filter_height = filter_shape.dim(1);
335 const int32_t filter_width = filter_shape.dim(2);
336 const int32_t output_height = output_shape.dim(1);
337 const int32_t output_width = output_shape.dim(2);
339 const int32_t stride_height = _params.stride_height;
340 const int32_t stride_width = _params.stride_width;
341 const int32_t dilation_height_factor = _params.dilation_height_factor;
342 const int32_t dilation_width_factor = _params.dilation_width_factor;
344 int32_t activation_min{};
345 int32_t activation_max{};
346 calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
348 const std::vector<double> effective_output_scale =
349 getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale());
351 const std::vector<ChannelQuantMultipliers> multipliers_raw =
352 quantizeMultipliers(effective_output_scale);
353 BroadcastableWrapper<ChannelQuantMultipliers> multipliers(multipliers_raw);
355 for (int32_t batch = 0; batch < batches; ++batch)
357 for (int32_t out_y = 0; out_y < output_height; ++out_y)
359 for (int32_t out_x = 0; out_x < output_width; ++out_x)
361 for (int32_t out_c = 0; out_c < output_depth; ++out_c)
363 const int32_t in_y_origin = out_y * stride_height - _padding_height;
364 const int32_t in_x_origin = out_x * stride_width - _padding_width;
366 for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y)
368 for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x)
370 const int32_t in_y = in_y_origin + dilation_height_factor * filter_y;
371 const int32_t in_x = in_x_origin + dilation_width_factor * filter_x;
372 if ((in_y >= 0 && in_y < input_height) && (in_x >= 0 && in_x < input_width))
374 for (int32_t in_c = 0; in_c < input_depth; ++in_c)
376 const int16_t input_val =
377 input_data[calcOffset(input_shape, batch, in_y, in_x, in_c)];
378 const int16_t filter_val =
379 filter_data[calcOffset(filter_shape, out_c, filter_y, filter_x, in_c)];
380 acc += static_cast<int64_t>(input_val) * static_cast<int64_t>(filter_val);
387 acc += bias_data[out_c];
390 int32_t scaled_acc = tflite::MultiplyByQuantizedMultiplier(
391 acc, multipliers[out_c].multiplier, multipliers[out_c].shift);
393 scaled_acc = std::max(scaled_acc, activation_min);
394 scaled_acc = std::min(scaled_acc, activation_max);
396 output_data[calcOffset(output_shape, batch, out_y, out_x, out_c)] = scaled_acc;
} // namespace kernels
} // namespace luci_interpreter