compiler/luci-interpreter/src/kernels/DepthwiseConv2D.cpp

   1 /*
   2  * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *    http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #include "kernels/DepthwiseConv2D.h"
  18
  19 #include "kernels/Utils.h"
  20
  21 #include <tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h>
  22 #include <tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h>
  23
  24 #include <stdexcept>
  25
  26 namespace luci_interpreter
  27 {
  28 namespace kernels
  29 {
  30
  31 DepthwiseConv2D::DepthwiseConv2D(const Tensor *input, const Tensor *filter, const Tensor *bias,
  32                                  Tensor *output, const DepthwiseConv2DParams &params)
  33     : KernelWithParams<DepthwiseConv2DParams>({input, filter, bias}, {output}, params)
  34 {
  35 }
  36
  37 void DepthwiseConv2D::configure()
  38 {
  39   // TensorFlow Lite (as of v2.2.0) supports the following combinations of types:
  40   //     | input filter bias  output |
  41   // ----+---------------------------+
  42   // (1) | float float  float float  |
  43   // (2) | float int8   float float  | hybrid
  44   // (3) | uint8 uint8  int32 uint8  | quantized
  45   // (4) | int8  int8   int32 int8   | quantized per channel
  46   // (5) | int16 int8   int64 int16  | quantized per channel 16x8
  47   //
  48   // We only support (1) and (3) for now, and additionally the following:
  49   //     | input filter bias  output |
  50   // ----+---------------------------+
  51   // (5) | int16 int16  int64 int16  |
  52   //
  53   if (input()->element_type() == DataType::FLOAT32 && filter()->element_type() == DataType::FLOAT32)
  54   {
  55     LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::FLOAT32);
  56   }
  57   else if (input()->element_type() == DataType::U8 && filter()->element_type() == DataType::U8)
  58   {
  59     LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::S32);
  60   }
  61   else if (input()->element_type() == DataType::S16 && filter()->element_type() == DataType::S16)
  62   {
  63     LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::S64);
  64   }
  65   else
  66   {
  67     throw std::runtime_error("Unsupported type.");
  68   }
  69   LUCI_INTERPRETER_CHECK(output()->element_type() == input()->element_type());
  70
  71   const Shape &input_shape = input()->shape();
  72   const Shape &filter_shape = filter()->shape();
  73   LUCI_INTERPRETER_CHECK(input_shape.num_dims() == 4 && filter_shape.num_dims() == 4);
  74
  75   const int32_t batches = input_shape.dim(0);
  76   const int32_t input_height = input_shape.dim(1);
  77   const int32_t input_width = input_shape.dim(2);
  78   // Filter format: [1, H, W, O].
  79   LUCI_INTERPRETER_CHECK(filter_shape.dim(0) == 1);
  80   const int32_t filter_height = filter_shape.dim(1);
  81   const int32_t filter_width = filter_shape.dim(2);
  82   const int32_t channels_out = filter_shape.dim(3);
  83
  84   LUCI_INTERPRETER_CHECK(bias() == nullptr || (bias()->shape().num_dims() == 1 &&
  85                                                bias()->shape().dim(0) == channels_out));
  86
  87   const int32_t output_height =
  88       computeOutputSize(_params.padding, input_height, filter_height, _params.stride_height,
  89                         _params.dilation_height_factor);
  90   const int32_t output_width =
  91       computeOutputSize(_params.padding, input_width, filter_width, _params.stride_width,
  92                         _params.dilation_width_factor);
  93
  94   _padding_height = computePadding(_params.stride_height, _params.dilation_height_factor,
  95                                    input_height, filter_height, output_height);
  96   _padding_width = computePadding(_params.stride_width, _params.dilation_width_factor, input_width,
  97                                   filter_width, output_width);
  98
  99   output()->resize({batches, output_height, output_width, channels_out});
 100 }
 101
 102 void DepthwiseConv2D::execute() const
 103 {
 104   switch (input()->element_type())
 105   {
 106     case DataType::FLOAT32:
 107       if (filter()->element_type() == DataType::FLOAT32)
 108       {
 109         evalFloat();
 110         break;
 111       }
 112       throw std::runtime_error("Unsupported type.");
 113     case DataType::U8:
 114       if (filter()->scales().size() == 1)
 115       {
 116         evalQuantized();
 117       }
 118       else if (filter()->scales().size() > 1)
 119       {
 120         LUCI_INTERPRETER_CHECK(filter()->shape().num_dims() == 4);
 121         LUCI_INTERPRETER_CHECK(filter()->scales().size() ==
 122                                static_cast<size_t>(filter()->shape().dim(3)));
 123         evalQuantizedPerChannel();
 124       }
 125       break;
 126     case DataType::S16:
 127       evalQuantizedS16();
 128       break;
 129     default:
 130       throw std::runtime_error("Unsupported type.");
 131   }
 132 }
 133
 134 void DepthwiseConv2D::evalFloat() const
 135 {
 136   float activation_min{};
 137   float activation_max{};
 138   calculateActivationRange(_params.activation, &activation_min, &activation_max);
 139
 140   tflite::DepthwiseParams params{};
 141   params.padding_values.height = _padding_height;
 142   params.padding_values.width = _padding_width;
 143   params.stride_height = _params.stride_height;
 144   params.stride_width = _params.stride_width;
 145   params.dilation_height_factor = _params.dilation_height_factor;
 146   params.dilation_width_factor = _params.dilation_width_factor;
 147   params.depth_multiplier = _params.depth_multiplier;
 148   params.float_activation_min = activation_min;
 149   params.float_activation_max = activation_max;
 150
 151   tflite::reference_ops::DepthwiseConv(
 152       params, getTensorShape(input()), getTensorData<float>(input()), getTensorShape(filter()),
 153       getTensorData<float>(filter()), getTensorShape(bias()), getTensorData<float>(bias()),
 154       getTensorShape(output()), getTensorData<float>(output()));
 155 }
 156
 157 void DepthwiseConv2D::evalQuantizedPerChannel() const
 158 {
 159   const auto *input_data = getTensorData<uint8_t>(input());
 160   const auto *filter_data = getTensorData<uint8_t>(filter());
 161   const auto *bias_data = getTensorData<int32_t>(bias());
 162   auto *output_data = getTensorData<uint8_t>(output());
 163
 164   const Shape &input_shape = input()->shape();
 165   const Shape &filter_shape = filter()->shape();
 166   const Shape &output_shape = output()->shape();
 167
 168   const int32_t batches = input_shape.dim(0);
 169   const int32_t input_height = input_shape.dim(1);
 170   const int32_t input_width = input_shape.dim(2);
 171   const int32_t input_depth = input_shape.dim(3);
 172   const int32_t filter_height = filter_shape.dim(1);
 173   const int32_t filter_width = filter_shape.dim(2);
 174   const int32_t output_height = output_shape.dim(1);
 175   const int32_t output_width = output_shape.dim(2);
 176
 177   const int32_t stride_height = _params.stride_height;
 178   const int32_t stride_width = _params.stride_width;
 179   const int32_t dilation_height_factor = _params.dilation_height_factor;
 180   const int32_t dilation_width_factor = _params.dilation_width_factor;
 181   const int32_t depth_multiplier = _params.depth_multiplier;
 182
 183   int32_t activation_min{};
 184   int32_t activation_max{};
 185   calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
 186
 187   const std::vector<double> effective_output_scales =
 188       getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale());
 189
 190   std::vector<ChannelQuantMultipliers> quant_multipliers_raw =
 191       quantizeMultipliers(effective_output_scales);
 192   BroadcastableWrapper<ChannelQuantMultipliers> quant_multipliers(quant_multipliers_raw);
 193
 194   for (int batch = 0; batch < batches; ++batch)
 195   {
 196     for (int out_y = 0; out_y < output_height; ++out_y)
 197     {
 198       for (int out_x = 0; out_x < output_width; ++out_x)
 199       {
 200         for (int in_channel = 0; in_channel < input_depth; ++in_channel)
 201         {
 202           for (int m = 0; m < depth_multiplier; ++m)
 203           {
 204             const int output_channel = m + in_channel * depth_multiplier;
 205             const int in_x_origin = (out_x * stride_width) - _padding_width;
 206             const int in_y_origin = (out_y * stride_height) - _padding_height;
 207             int32 acc = 0;
 208             for (int filter_y = 0; filter_y < filter_height; ++filter_y)
 209             {
 210               for (int filter_x = 0; filter_x < filter_width; ++filter_x)
 211               {
 212                 const int in_x = in_x_origin + dilation_width_factor * filter_x;
 213                 const int in_y = in_y_origin + dilation_height_factor * filter_y;
 214                 // Zero padding by omitting the areas outside the image.
 215                 const bool is_point_inside_image =
 216                     (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height);
 217                 if (is_point_inside_image)
 218                 {
 219                   int32 input_val =
 220                       input_data[calcOffset(input_shape, batch, in_y, in_x, in_channel)];
 221                   int32 filter_val =
 222                       filter_data[calcOffset(filter_shape, 0, filter_y, filter_x, output_channel)];
 223                   acc += (filter_val - filter()->zero_points()[output_channel]) *
 224                          (input_val - input()->zero_point());
 225                 }
 226               }
 227             }
 228             if (bias_data)
 229             {
 230               acc += bias_data[output_channel];
 231             }
 232             int32_t output_multiplier = quant_multipliers[output_channel].multiplier;
 233             int output_shift = quant_multipliers[output_channel].shift;
 234             int32_t scaled_acc =
 235                 tflite::MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
 236             scaled_acc += output()->zero_point();
 237             scaled_acc = std::max(scaled_acc, activation_min);
 238             scaled_acc = std::min(scaled_acc, activation_max);
 239             output_data[calcOffset(output_shape, batch, out_y, out_x, output_channel)] =
 240                 static_cast<uint8_t>(scaled_acc);
 241           }
 242         }
 243       }
 244     }
 245   }
 246 }
 247
 248 void DepthwiseConv2D::evalQuantized() const
 249 {
 250   const auto input_scale = static_cast<double>(input()->scale());
 251   const auto filter_scale = static_cast<double>(filter()->scale());
 252   const auto output_scale = static_cast<double>(output()->scale());
 253
 254   const double real_multiplier = input_scale * filter_scale / output_scale;
 255   int32_t output_multiplier{};
 256   int output_shift{};
 257   quantizeMultiplier(real_multiplier, &output_multiplier, &output_shift);
 258
 259   int32_t activation_min{};
 260   int32_t activation_max{};
 261   calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
 262
 263   tflite::DepthwiseParams params{};
 264   params.padding_values.height = _padding_height;
 265   params.padding_values.width = _padding_width;
 266   params.stride_height = _params.stride_height;
 267   params.stride_width = _params.stride_width;
 268   params.dilation_height_factor = _params.dilation_height_factor;
 269   params.dilation_width_factor = _params.dilation_width_factor;
 270   params.depth_multiplier = _params.depth_multiplier;
 271   // The kernel expects input and filter zero points to be negated.
 272   params.input_offset = -input()->zero_point();    // Note the '-'.
 273   params.weights_offset = -filter()->zero_point(); // Note the '-'.
 274   params.output_offset = output()->zero_point();
 275   params.output_multiplier = output_multiplier;
 276   params.output_shift = output_shift;
 277   params.quantized_activation_min = activation_min;
 278   params.quantized_activation_max = activation_max;
 279
 280   tflite::reference_ops::DepthwiseConv(
 281       params, getTensorShape(input()), getTensorData<uint8_t>(input()), getTensorShape(filter()),
 282       getTensorData<uint8_t>(filter()), getTensorShape(bias()), getTensorData<int32_t>(bias()),
 283       getTensorShape(output()), getTensorData<uint8_t>(output()));
 284 }
 285
 286 void DepthwiseConv2D::evalQuantizedS16() const
 287 {
 288   const auto *input_data = getTensorData<int16_t>(input());
 289   const auto *filter_data = getTensorData<int16_t>(filter());
 290   const auto *bias_data = getTensorData<int64_t>(bias());
 291   auto *output_data = getTensorData<int16_t>(output());
 292
 293   const Shape &input_shape = input()->shape();
 294   const Shape &filter_shape = filter()->shape();
 295   const Shape &output_shape = output()->shape();
 296
 297   const int32_t batches = input_shape.dim(0);
 298   const int32_t input_height = input_shape.dim(1);
 299   const int32_t input_width = input_shape.dim(2);
 300   const int32_t input_depth = input_shape.dim(3);
 301   const int32_t filter_height = filter_shape.dim(1);
 302   const int32_t filter_width = filter_shape.dim(2);
 303   const int32_t output_height = output_shape.dim(1);
 304   const int32_t output_width = output_shape.dim(2);
 305
 306   const int32_t stride_height = _params.stride_height;
 307   const int32_t stride_width = _params.stride_width;
 308   const int32_t dilation_height_factor = _params.dilation_height_factor;
 309   const int32_t dilation_width_factor = _params.dilation_width_factor;
 310   const int32_t depth_multiplier = _params.depth_multiplier;
 311
 312   const std::vector<double> effective_output_scales =
 313       getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale());
 314
 315   std::vector<ChannelQuantMultipliers> quant_multipliers_raw =
 316       quantizeMultipliers(effective_output_scales);
 317
 318   BroadcastableWrapper<ChannelQuantMultipliers> quant_multipliers(quant_multipliers_raw);
 319
 320   int32_t activation_min{};
 321   int32_t activation_max{};
 322   calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
 323
 324   for (int32_t batch = 0; batch < batches; ++batch)
 325   {
 326     for (int32_t out_y = 0; out_y < output_height; ++out_y)
 327     {
 328       for (int32_t out_x = 0; out_x < output_width; ++out_x)
 329       {
 330         for (int32_t in_c = 0; in_c < input_depth; ++in_c)
 331         {
 332           for (int32_t m = 0; m < depth_multiplier; ++m)
 333           {
 334             const int32_t out_c = m + in_c * depth_multiplier;
 335             const int32_t in_y_origin = out_y * stride_height - _padding_height;
 336             const int32_t in_x_origin = out_x * stride_width - _padding_width;
 337             int64_t acc = 0;
 338             for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y)
 339             {
 340               for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x)
 341               {
 342                 const int32_t in_y = in_y_origin + dilation_height_factor * filter_y;
 343                 const int32_t in_x = in_x_origin + dilation_width_factor * filter_x;
 344                 if ((in_y >= 0 && in_y < input_height) && (in_x >= 0 && in_x < input_width))
 345                 {
 346                   const int16_t input_val =
 347                       input_data[calcOffset(input_shape, batch, in_y, in_x, in_c)];
 348                   const int16_t filter_val =
 349                       filter_data[calcOffset(filter_shape, 0, filter_y, filter_x, out_c)];
 350                   acc += static_cast<int64_t>(input_val) * static_cast<int64_t>(filter_val);
 351                 }
 352               }
 353             }
 354             if (bias_data != nullptr)
 355             {
 356               acc += bias_data[out_c];
 357             }
 358
 359             int32_t output_multiplier = quant_multipliers[out_c].multiplier;
 360             int output_shift = quant_multipliers[out_c].shift;
 361             int32_t scaled_acc =
 362                 tflite::MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
 363
 364             scaled_acc = std::max(scaled_acc, activation_min);
 365             scaled_acc = std::min(scaled_acc, activation_max);
 366
 367             output_data[calcOffset(output_shape, batch, out_y, out_x, out_c)] = scaled_acc;
 368           }
 369         }
 370       }
 371     }
 372   }
 373 }
 374
 375 } // namespace kernels
 376 } // namespace luci_interpreter