/*
 * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "kernels/DepthwiseConv2D.h"

#include "kernels/Utils.h"

#include <tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h>
#include <tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h>
#include <tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h>

#include <algorithm>
#include <cstdint>
#include <iterator>
#include <stdexcept>
#include <vector>
namespace luci_interpreter
{
namespace kernels
{
32 DepthwiseConv2D::DepthwiseConv2D(const Tensor *input, const Tensor *filter, const Tensor *bias,
33 Tensor *output, const DepthwiseConv2DParams ¶ms)
34 : KernelWithParams<DepthwiseConv2DParams>({input, filter, bias}, {output}, params)
38 void DepthwiseConv2D::configure()
40 // TensorFlow Lite (as of v2.2.0) supports the following combinations of types:
41 // | input filter bias output |
42 // ----+---------------------------+
43 // (1) | float float float float |
44 // (2) | float int8 float float | hybrid
45 // (3) | uint8 uint8 int32 uint8 | quantized
46 // (4) | int8 int8 int32 int8 | quantized per channel
47 // (5) | int16 int8 int64 int16 | quantized per channel 16x8
49 // We only support (1), (3) and (4) for now, and additionally the following:
50 // | input filter bias output |
51 // ----+---------------------------+
52 // (5) | int16 int16 int64 int16 |
54 if (input()->element_type() == DataType::FLOAT32 && filter()->element_type() == DataType::FLOAT32)
56 LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::FLOAT32);
58 else if (input()->element_type() == DataType::U8 && filter()->element_type() == DataType::U8)
60 LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::S32);
62 else if (input()->element_type() == DataType::S8 && filter()->element_type() == DataType::S8)
64 LUCI_INTERPRETER_CHECK(filter()->shape().num_dims() == 4);
65 LUCI_INTERPRETER_CHECK(static_cast<uint32_t>(filter()->shape().dim(3)) ==
66 filter()->scales().size());
67 for (auto zerop : filter()->zero_points())
69 LUCI_INTERPRETER_CHECK(zerop == 0);
71 LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::S32);
73 else if (input()->element_type() == DataType::S16 && filter()->element_type() == DataType::S16)
75 LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::S64);
79 throw std::runtime_error("Unsupported type.");
81 LUCI_INTERPRETER_CHECK(output()->element_type() == input()->element_type());
83 const Shape &input_shape = input()->shape();
84 const Shape &filter_shape = filter()->shape();
85 LUCI_INTERPRETER_CHECK(input_shape.num_dims() == 4 && filter_shape.num_dims() == 4);
87 const int32_t batches = input_shape.dim(0);
88 const int32_t input_height = input_shape.dim(1);
89 const int32_t input_width = input_shape.dim(2);
90 // Filter format: [1, H, W, O].
91 LUCI_INTERPRETER_CHECK(filter_shape.dim(0) == 1);
92 const int32_t filter_height = filter_shape.dim(1);
93 const int32_t filter_width = filter_shape.dim(2);
94 const int32_t channels_out = filter_shape.dim(3);
96 LUCI_INTERPRETER_CHECK(bias() == nullptr || (bias()->shape().num_dims() == 1 &&
97 bias()->shape().dim(0) == channels_out));
99 const int32_t output_height =
100 computeOutputSize(_params.padding, input_height, filter_height, _params.stride_height,
101 _params.dilation_height_factor);
102 const int32_t output_width =
103 computeOutputSize(_params.padding, input_width, filter_width, _params.stride_width,
104 _params.dilation_width_factor);
106 _padding_height = computePadding(_params.stride_height, _params.dilation_height_factor,
107 input_height, filter_height, output_height);
108 _padding_width = computePadding(_params.stride_width, _params.dilation_width_factor, input_width,
109 filter_width, output_width);
111 output()->resize({batches, output_height, output_width, channels_out});
114 void DepthwiseConv2D::execute() const
116 switch (input()->element_type())
118 case DataType::FLOAT32:
119 if (filter()->element_type() == DataType::FLOAT32)
124 throw std::runtime_error("Unsupported type.");
126 if (filter()->scales().size() == 1)
130 else if (filter()->scales().size() > 1)
132 LUCI_INTERPRETER_CHECK(filter()->shape().num_dims() == 4);
133 LUCI_INTERPRETER_CHECK(filter()->scales().size() ==
134 static_cast<size_t>(filter()->shape().dim(3)));
135 evalQuantizedPerChannel();
139 evalQuantizedS8PerChannel();
145 throw std::runtime_error("Unsupported type.");
149 void DepthwiseConv2D::evalFloat() const
151 float activation_min{};
152 float activation_max{};
153 calculateActivationRange(_params.activation, &activation_min, &activation_max);
155 tflite::DepthwiseParams params{};
156 params.padding_values.height = _padding_height;
157 params.padding_values.width = _padding_width;
158 params.stride_height = _params.stride_height;
159 params.stride_width = _params.stride_width;
160 params.dilation_height_factor = _params.dilation_height_factor;
161 params.dilation_width_factor = _params.dilation_width_factor;
162 params.depth_multiplier = _params.depth_multiplier;
163 params.float_activation_min = activation_min;
164 params.float_activation_max = activation_max;
166 tflite::reference_ops::DepthwiseConv(
167 params, getTensorShape(input()), getTensorData<float>(input()), getTensorShape(filter()),
168 getTensorData<float>(filter()), getTensorShape(bias()), getTensorData<float>(bias()),
169 getTensorShape(output()), getTensorData<float>(output()));
172 void DepthwiseConv2D::evalQuantizedPerChannel() const
174 const auto *input_data = getTensorData<uint8_t>(input());
175 const auto *filter_data = getTensorData<uint8_t>(filter());
176 const auto *bias_data = getTensorData<int32_t>(bias());
177 auto *output_data = getTensorData<uint8_t>(output());
179 const Shape &input_shape = input()->shape();
180 const Shape &filter_shape = filter()->shape();
181 const Shape &output_shape = output()->shape();
183 const int32_t batches = input_shape.dim(0);
184 const int32_t input_height = input_shape.dim(1);
185 const int32_t input_width = input_shape.dim(2);
186 const int32_t input_depth = input_shape.dim(3);
187 const int32_t filter_height = filter_shape.dim(1);
188 const int32_t filter_width = filter_shape.dim(2);
189 const int32_t output_height = output_shape.dim(1);
190 const int32_t output_width = output_shape.dim(2);
192 const int32_t stride_height = _params.stride_height;
193 const int32_t stride_width = _params.stride_width;
194 const int32_t dilation_height_factor = _params.dilation_height_factor;
195 const int32_t dilation_width_factor = _params.dilation_width_factor;
196 const int32_t depth_multiplier = _params.depth_multiplier;
198 int32_t activation_min{};
199 int32_t activation_max{};
200 calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
202 const std::vector<double> effective_output_scales =
203 getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale());
205 std::vector<ChannelQuantMultipliers> quant_multipliers_raw =
206 quantizeMultipliers(effective_output_scales);
207 BroadcastableWrapper<ChannelQuantMultipliers> quant_multipliers(quant_multipliers_raw);
209 for (int batch = 0; batch < batches; ++batch)
211 for (int out_y = 0; out_y < output_height; ++out_y)
213 for (int out_x = 0; out_x < output_width; ++out_x)
215 for (int in_channel = 0; in_channel < input_depth; ++in_channel)
217 for (int m = 0; m < depth_multiplier; ++m)
219 const int output_channel = m + in_channel * depth_multiplier;
220 const int in_x_origin = (out_x * stride_width) - _padding_width;
221 const int in_y_origin = (out_y * stride_height) - _padding_height;
223 for (int filter_y = 0; filter_y < filter_height; ++filter_y)
225 for (int filter_x = 0; filter_x < filter_width; ++filter_x)
227 const int in_x = in_x_origin + dilation_width_factor * filter_x;
228 const int in_y = in_y_origin + dilation_height_factor * filter_y;
229 // Zero padding by omitting the areas outside the image.
230 const bool is_point_inside_image =
231 (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height);
232 if (is_point_inside_image)
235 input_data[calcOffset(input_shape, batch, in_y, in_x, in_channel)];
237 filter_data[calcOffset(filter_shape, 0, filter_y, filter_x, output_channel)];
238 acc += (filter_val - filter()->zero_points()[output_channel]) *
239 (input_val - input()->zero_point());
245 acc += bias_data[output_channel];
247 int32_t output_multiplier = quant_multipliers[output_channel].multiplier;
248 int output_shift = quant_multipliers[output_channel].shift;
250 tflite::MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
251 scaled_acc += output()->zero_point();
252 scaled_acc = std::max(scaled_acc, activation_min);
253 scaled_acc = std::min(scaled_acc, activation_max);
254 output_data[calcOffset(output_shape, batch, out_y, out_x, output_channel)] =
255 static_cast<uint8_t>(scaled_acc);
263 void DepthwiseConv2D::evalQuantized() const
265 const auto input_scale = static_cast<double>(input()->scale());
266 const auto filter_scale = static_cast<double>(filter()->scale());
267 const auto output_scale = static_cast<double>(output()->scale());
269 const double real_multiplier = input_scale * filter_scale / output_scale;
270 int32_t output_multiplier{};
272 quantizeMultiplier(real_multiplier, &output_multiplier, &output_shift);
274 int32_t activation_min{};
275 int32_t activation_max{};
276 calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
278 tflite::DepthwiseParams params{};
279 params.padding_values.height = _padding_height;
280 params.padding_values.width = _padding_width;
281 params.stride_height = _params.stride_height;
282 params.stride_width = _params.stride_width;
283 params.dilation_height_factor = _params.dilation_height_factor;
284 params.dilation_width_factor = _params.dilation_width_factor;
285 params.depth_multiplier = _params.depth_multiplier;
286 // The kernel expects input and filter zero points to be negated.
287 params.input_offset = -input()->zero_point(); // Note the '-'.
288 params.weights_offset = -filter()->zero_point(); // Note the '-'.
289 params.output_offset = output()->zero_point();
290 params.output_multiplier = output_multiplier;
291 params.output_shift = output_shift;
292 params.quantized_activation_min = activation_min;
293 params.quantized_activation_max = activation_max;
295 tflite::reference_ops::DepthwiseConv(
296 params, getTensorShape(input()), getTensorData<uint8_t>(input()), getTensorShape(filter()),
297 getTensorData<uint8_t>(filter()), getTensorShape(bias()), getTensorData<int32_t>(bias()),
298 getTensorShape(output()), getTensorData<uint8_t>(output()));
301 void DepthwiseConv2D::evalQuantizedS8PerChannel() const
303 int32_t activation_min{};
304 int32_t activation_max{};
305 calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
307 tflite::DepthwiseParams params{};
309 params.padding_type = tflite::PaddingType::kSame;
310 params.padding_values.height = _padding_height;
311 params.padding_values.width = _padding_width;
312 params.stride_height = _params.stride_height;
313 params.stride_width = _params.stride_width;
314 params.dilation_height_factor = _params.dilation_height_factor;
315 params.dilation_width_factor = _params.dilation_width_factor;
316 params.depth_multiplier = _params.depth_multiplier;
317 // The kernel expects input and filter zero points to be negated.
318 params.input_offset = -input()->zero_point(); // Note the '-'.
319 params.weights_offset = 0;
320 params.output_offset = output()->zero_point();
321 params.output_multiplier = 1; // unused in tflite code
322 params.output_shift = 0; // unused in tflite code
323 params.quantized_activation_min = activation_min;
324 params.quantized_activation_max = activation_max;
326 const std::vector<double> effective_output_scales =
327 getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale());
329 std::vector<ChannelQuantMultipliers> quant_multipliers =
330 quantizeMultipliers(effective_output_scales);
332 std::vector<int32_t> shifts;
333 std::transform(quant_multipliers.begin(), quant_multipliers.end(), std::back_inserter(shifts),
334 [](ChannelQuantMultipliers cm) { return cm.shift; });
335 std::vector<int32_t> multipliers;
336 std::transform(quant_multipliers.begin(), quant_multipliers.end(),
337 std::back_inserter(multipliers),
338 [](ChannelQuantMultipliers cm) { return cm.multiplier; });
340 tflite::reference_integer_ops::DepthwiseConvPerChannel(
341 params, multipliers.data(), shifts.data(), getTensorShape(input()),
342 getTensorData<int8_t>(input()), getTensorShape(filter()), getTensorData<int8_t>(filter()),
343 getTensorShape(bias()), getTensorData<int32_t>(bias()), getTensorShape(output()),
344 getTensorData<int8_t>(output()));
347 void DepthwiseConv2D::evalQuantizedS16() const
349 const auto *input_data = getTensorData<int16_t>(input());
350 const auto *filter_data = getTensorData<int16_t>(filter());
351 const auto *bias_data = getTensorData<int64_t>(bias());
352 auto *output_data = getTensorData<int16_t>(output());
354 const Shape &input_shape = input()->shape();
355 const Shape &filter_shape = filter()->shape();
356 const Shape &output_shape = output()->shape();
358 const int32_t batches = input_shape.dim(0);
359 const int32_t input_height = input_shape.dim(1);
360 const int32_t input_width = input_shape.dim(2);
361 const int32_t input_depth = input_shape.dim(3);
362 const int32_t filter_height = filter_shape.dim(1);
363 const int32_t filter_width = filter_shape.dim(2);
364 const int32_t output_height = output_shape.dim(1);
365 const int32_t output_width = output_shape.dim(2);
367 const int32_t stride_height = _params.stride_height;
368 const int32_t stride_width = _params.stride_width;
369 const int32_t dilation_height_factor = _params.dilation_height_factor;
370 const int32_t dilation_width_factor = _params.dilation_width_factor;
371 const int32_t depth_multiplier = _params.depth_multiplier;
373 const std::vector<double> effective_output_scales =
374 getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale());
376 std::vector<ChannelQuantMultipliers> quant_multipliers_raw =
377 quantizeMultipliers(effective_output_scales);
379 BroadcastableWrapper<ChannelQuantMultipliers> quant_multipliers(quant_multipliers_raw);
381 int32_t activation_min{};
382 int32_t activation_max{};
383 calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
385 for (int32_t batch = 0; batch < batches; ++batch)
387 for (int32_t out_y = 0; out_y < output_height; ++out_y)
389 for (int32_t out_x = 0; out_x < output_width; ++out_x)
391 for (int32_t in_c = 0; in_c < input_depth; ++in_c)
393 for (int32_t m = 0; m < depth_multiplier; ++m)
395 const int32_t out_c = m + in_c * depth_multiplier;
396 const int32_t in_y_origin = out_y * stride_height - _padding_height;
397 const int32_t in_x_origin = out_x * stride_width - _padding_width;
399 for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y)
401 for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x)
403 const int32_t in_y = in_y_origin + dilation_height_factor * filter_y;
404 const int32_t in_x = in_x_origin + dilation_width_factor * filter_x;
405 if ((in_y >= 0 && in_y < input_height) && (in_x >= 0 && in_x < input_width))
407 const int16_t input_val =
408 input_data[calcOffset(input_shape, batch, in_y, in_x, in_c)];
409 const int16_t filter_val =
410 filter_data[calcOffset(filter_shape, 0, filter_y, filter_x, out_c)];
411 acc += static_cast<int64_t>(input_val) * static_cast<int64_t>(filter_val);
415 if (bias_data != nullptr)
417 acc += bias_data[out_c];
420 int32_t output_multiplier = quant_multipliers[out_c].multiplier;
421 int output_shift = quant_multipliers[out_c].shift;
423 tflite::MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
425 scaled_acc = std::max(scaled_acc, activation_min);
426 scaled_acc = std::min(scaled_acc, activation_max);
428 output_data[calcOffset(output_shape, batch, out_y, out_x, out_c)] = scaled_acc;
} // namespace kernels
} // namespace luci_interpreter