compiler/luci-interpreter/src/kernels/DepthwiseConv2D.cpp

   1 /*
   2  * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *    http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #include "kernels/DepthwiseConv2D.h"
  18
  19 #include "kernels/Utils.h"
  20
  21 #include <tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h>
  22 #include <tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h>
  23 #include <tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h>
  24
  25 #include <stdexcept>
  26
  27 namespace luci_interpreter
  28 {
  29 namespace kernels
  30 {
  31
  32 DepthwiseConv2D::DepthwiseConv2D(const Tensor *input, const Tensor *filter, const Tensor *bias,
  33                                  Tensor *output, const DepthwiseConv2DParams &params)
  34   : KernelWithParams<DepthwiseConv2DParams>({input, filter, bias}, {output}, params)
  35 {
  36 }
  37
  38 void DepthwiseConv2D::configure()
  39 {
  40   // TensorFlow Lite (as of v2.2.0) supports the following combinations of types:
  41   //     | input filter bias  output |
  42   // ----+---------------------------+
  43   // (1) | float float  float float  |
  44   // (2) | float int8   float float  | hybrid
  45   // (3) | uint8 uint8  int32 uint8  | quantized
  46   // (4) | int8  int8   int32 int8   | quantized per channel
  47   // (5) | int16 int8   int64 int16  | quantized per channel 16x8
  48   //
  49   // We only support (1), (3) and (4) for now, and additionally the following:
  50   //     | input filter bias  output |
  51   // ----+---------------------------+
  52   // (5) | int16 int16  int64 int16  |
  53   //
  54   if (input()->element_type() == DataType::FLOAT32 && filter()->element_type() == DataType::FLOAT32)
  55   {
  56     LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::FLOAT32);
  57   }
  58   else if (input()->element_type() == DataType::U8 && filter()->element_type() == DataType::U8)
  59   {
  60     LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::S32);
  61   }
  62   else if (input()->element_type() == DataType::S8 && filter()->element_type() == DataType::S8)
  63   {
  64     LUCI_INTERPRETER_CHECK(filter()->shape().num_dims() == 4);
  65     LUCI_INTERPRETER_CHECK(static_cast<uint32_t>(filter()->shape().dim(3)) ==
  66                            filter()->scales().size());
  67     for (auto zerop : filter()->zero_points())
  68     {
  69       LUCI_INTERPRETER_CHECK(zerop == 0);
  70     }
  71     LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::S32);
  72   }
  73   else if (input()->element_type() == DataType::S16 && filter()->element_type() == DataType::S16)
  74   {
  75     LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::S64);
  76   }
  77   else
  78   {
  79     throw std::runtime_error("Unsupported type.");
  80   }
  81   LUCI_INTERPRETER_CHECK(output()->element_type() == input()->element_type());
  82
  83   const Shape &input_shape = input()->shape();
  84   const Shape &filter_shape = filter()->shape();
  85   LUCI_INTERPRETER_CHECK(input_shape.num_dims() == 4 && filter_shape.num_dims() == 4);
  86
  87   const int32_t batches = input_shape.dim(0);
  88   const int32_t input_height = input_shape.dim(1);
  89   const int32_t input_width = input_shape.dim(2);
  90   // Filter format: [1, H, W, O].
  91   LUCI_INTERPRETER_CHECK(filter_shape.dim(0) == 1);
  92   const int32_t filter_height = filter_shape.dim(1);
  93   const int32_t filter_width = filter_shape.dim(2);
  94   const int32_t channels_out = filter_shape.dim(3);
  95
  96   LUCI_INTERPRETER_CHECK(bias() == nullptr || (bias()->shape().num_dims() == 1 &&
  97                                                bias()->shape().dim(0) == channels_out));
  98
  99   const int32_t output_height =
 100     computeOutputSize(_params.padding, input_height, filter_height, _params.stride_height,
 101                       _params.dilation_height_factor);
 102   const int32_t output_width =
 103     computeOutputSize(_params.padding, input_width, filter_width, _params.stride_width,
 104                       _params.dilation_width_factor);
 105
 106   _padding_height = computePadding(_params.stride_height, _params.dilation_height_factor,
 107                                    input_height, filter_height, output_height);
 108   _padding_width = computePadding(_params.stride_width, _params.dilation_width_factor, input_width,
 109                                   filter_width, output_width);
 110
 111   output()->resize({batches, output_height, output_width, channels_out});
 112 }
 113
 114 void DepthwiseConv2D::execute() const
 115 {
 116   switch (input()->element_type())
 117   {
 118     case DataType::FLOAT32:
 119       if (filter()->element_type() == DataType::FLOAT32)
 120       {
 121         evalFloat();
 122         break;
 123       }
 124       throw std::runtime_error("Unsupported type.");
 125     case DataType::U8:
 126       if (filter()->scales().size() == 1)
 127       {
 128         evalQuantized();
 129       }
 130       else if (filter()->scales().size() > 1)
 131       {
 132         LUCI_INTERPRETER_CHECK(filter()->shape().num_dims() == 4);
 133         LUCI_INTERPRETER_CHECK(filter()->scales().size() ==
 134                                static_cast<size_t>(filter()->shape().dim(3)));
 135         evalQuantizedPerChannel();
 136       }
 137       break;
 138     case DataType::S8:
 139       evalQuantizedS8PerChannel();
 140       break;
 141     case DataType::S16:
 142       evalQuantizedS16();
 143       break;
 144     default:
 145       throw std::runtime_error("Unsupported type.");
 146   }
 147 }
 148
 149 void DepthwiseConv2D::evalFloat() const
 150 {
 151   float activation_min{};
 152   float activation_max{};
 153   calculateActivationRange(_params.activation, &activation_min, &activation_max);
 154
 155   tflite::DepthwiseParams params{};
 156   params.padding_values.height = _padding_height;
 157   params.padding_values.width = _padding_width;
 158   params.stride_height = _params.stride_height;
 159   params.stride_width = _params.stride_width;
 160   params.dilation_height_factor = _params.dilation_height_factor;
 161   params.dilation_width_factor = _params.dilation_width_factor;
 162   params.depth_multiplier = _params.depth_multiplier;
 163   params.float_activation_min = activation_min;
 164   params.float_activation_max = activation_max;
 165
 166   tflite::reference_ops::DepthwiseConv(
 167     params, getTensorShape(input()), getTensorData<float>(input()), getTensorShape(filter()),
 168     getTensorData<float>(filter()), getTensorShape(bias()), getTensorData<float>(bias()),
 169     getTensorShape(output()), getTensorData<float>(output()));
 170 }
 171
 172 void DepthwiseConv2D::evalQuantizedPerChannel() const
 173 {
 174   const auto *input_data = getTensorData<uint8_t>(input());
 175   const auto *filter_data = getTensorData<uint8_t>(filter());
 176   const auto *bias_data = getTensorData<int32_t>(bias());
 177   auto *output_data = getTensorData<uint8_t>(output());
 178
 179   const Shape &input_shape = input()->shape();
 180   const Shape &filter_shape = filter()->shape();
 181   const Shape &output_shape = output()->shape();
 182
 183   const int32_t batches = input_shape.dim(0);
 184   const int32_t input_height = input_shape.dim(1);
 185   const int32_t input_width = input_shape.dim(2);
 186   const int32_t input_depth = input_shape.dim(3);
 187   const int32_t filter_height = filter_shape.dim(1);
 188   const int32_t filter_width = filter_shape.dim(2);
 189   const int32_t output_height = output_shape.dim(1);
 190   const int32_t output_width = output_shape.dim(2);
 191
 192   const int32_t stride_height = _params.stride_height;
 193   const int32_t stride_width = _params.stride_width;
 194   const int32_t dilation_height_factor = _params.dilation_height_factor;
 195   const int32_t dilation_width_factor = _params.dilation_width_factor;
 196   const int32_t depth_multiplier = _params.depth_multiplier;
 197
 198   int32_t activation_min{};
 199   int32_t activation_max{};
 200   calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
 201
 202   const std::vector<double> effective_output_scales =
 203     getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale());
 204
 205   std::vector<ChannelQuantMultipliers> quant_multipliers_raw =
 206     quantizeMultipliers(effective_output_scales);
 207   BroadcastableWrapper<ChannelQuantMultipliers> quant_multipliers(quant_multipliers_raw);
 208
 209   for (int batch = 0; batch < batches; ++batch)
 210   {
 211     for (int out_y = 0; out_y < output_height; ++out_y)
 212     {
 213       for (int out_x = 0; out_x < output_width; ++out_x)
 214       {
 215         for (int in_channel = 0; in_channel < input_depth; ++in_channel)
 216         {
 217           for (int m = 0; m < depth_multiplier; ++m)
 218           {
 219             const int output_channel = m + in_channel * depth_multiplier;
 220             const int in_x_origin = (out_x * stride_width) - _padding_width;
 221             const int in_y_origin = (out_y * stride_height) - _padding_height;
 222             int32 acc = 0;
 223             for (int filter_y = 0; filter_y < filter_height; ++filter_y)
 224             {
 225               for (int filter_x = 0; filter_x < filter_width; ++filter_x)
 226               {
 227                 const int in_x = in_x_origin + dilation_width_factor * filter_x;
 228                 const int in_y = in_y_origin + dilation_height_factor * filter_y;
 229                 // Zero padding by omitting the areas outside the image.
 230                 const bool is_point_inside_image =
 231                   (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height);
 232                 if (is_point_inside_image)
 233                 {
 234                   int32 input_val =
 235                     input_data[calcOffset(input_shape, batch, in_y, in_x, in_channel)];
 236                   int32 filter_val =
 237                     filter_data[calcOffset(filter_shape, 0, filter_y, filter_x, output_channel)];
 238                   acc += (filter_val - filter()->zero_points()[output_channel]) *
 239                          (input_val - input()->zero_point());
 240                 }
 241               }
 242             }
 243             if (bias_data)
 244             {
 245               acc += bias_data[output_channel];
 246             }
 247             int32_t output_multiplier = quant_multipliers[output_channel].multiplier;
 248             int output_shift = quant_multipliers[output_channel].shift;
 249             int32_t scaled_acc =
 250               tflite::MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
 251             scaled_acc += output()->zero_point();
 252             scaled_acc = std::max(scaled_acc, activation_min);
 253             scaled_acc = std::min(scaled_acc, activation_max);
 254             output_data[calcOffset(output_shape, batch, out_y, out_x, output_channel)] =
 255               static_cast<uint8_t>(scaled_acc);
 256           }
 257         }
 258       }
 259     }
 260   }
 261 }
 262
 263 void DepthwiseConv2D::evalQuantized() const
 264 {
 265   const auto input_scale = static_cast<double>(input()->scale());
 266   const auto filter_scale = static_cast<double>(filter()->scale());
 267   const auto output_scale = static_cast<double>(output()->scale());
 268
 269   const double real_multiplier = input_scale * filter_scale / output_scale;
 270   int32_t output_multiplier{};
 271   int output_shift{};
 272   quantizeMultiplier(real_multiplier, &output_multiplier, &output_shift);
 273
 274   int32_t activation_min{};
 275   int32_t activation_max{};
 276   calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
 277
 278   tflite::DepthwiseParams params{};
 279   params.padding_values.height = _padding_height;
 280   params.padding_values.width = _padding_width;
 281   params.stride_height = _params.stride_height;
 282   params.stride_width = _params.stride_width;
 283   params.dilation_height_factor = _params.dilation_height_factor;
 284   params.dilation_width_factor = _params.dilation_width_factor;
 285   params.depth_multiplier = _params.depth_multiplier;
 286   // The kernel expects input and filter zero points to be negated.
 287   params.input_offset = -input()->zero_point();    // Note the '-'.
 288   params.weights_offset = -filter()->zero_point(); // Note the '-'.
 289   params.output_offset = output()->zero_point();
 290   params.output_multiplier = output_multiplier;
 291   params.output_shift = output_shift;
 292   params.quantized_activation_min = activation_min;
 293   params.quantized_activation_max = activation_max;
 294
 295   tflite::reference_ops::DepthwiseConv(
 296     params, getTensorShape(input()), getTensorData<uint8_t>(input()), getTensorShape(filter()),
 297     getTensorData<uint8_t>(filter()), getTensorShape(bias()), getTensorData<int32_t>(bias()),
 298     getTensorShape(output()), getTensorData<uint8_t>(output()));
 299 }
 300
 301 void DepthwiseConv2D::evalQuantizedS8PerChannel() const
 302 {
 303   int32_t activation_min{};
 304   int32_t activation_max{};
 305   calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
 306
 307   tflite::DepthwiseParams params{};
 308
 309   params.padding_type = tflite::PaddingType::kSame;
 310   params.padding_values.height = _padding_height;
 311   params.padding_values.width = _padding_width;
 312   params.stride_height = _params.stride_height;
 313   params.stride_width = _params.stride_width;
 314   params.dilation_height_factor = _params.dilation_height_factor;
 315   params.dilation_width_factor = _params.dilation_width_factor;
 316   params.depth_multiplier = _params.depth_multiplier;
 317   // The kernel expects input and filter zero points to be negated.
 318   params.input_offset = -input()->zero_point(); // Note the '-'.
 319   params.weights_offset = 0;
 320   params.output_offset = output()->zero_point();
 321   params.output_multiplier = 1; // unused in tflite code
 322   params.output_shift = 0;      // unused in tflite code
 323   params.quantized_activation_min = activation_min;
 324   params.quantized_activation_max = activation_max;
 325
 326   const std::vector<double> effective_output_scales =
 327     getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale());
 328
 329   std::vector<ChannelQuantMultipliers> quant_multipliers =
 330     quantizeMultipliers(effective_output_scales);
 331
 332   std::vector<int32_t> shifts;
 333   std::transform(quant_multipliers.begin(), quant_multipliers.end(), std::back_inserter(shifts),
 334                  [](ChannelQuantMultipliers cm) { return cm.shift; });
 335   std::vector<int32_t> multipliers;
 336   std::transform(quant_multipliers.begin(), quant_multipliers.end(),
 337                  std::back_inserter(multipliers),
 338                  [](ChannelQuantMultipliers cm) { return cm.multiplier; });
 339
 340   tflite::reference_integer_ops::DepthwiseConvPerChannel(
 341     params, multipliers.data(), shifts.data(), getTensorShape(input()),
 342     getTensorData<int8_t>(input()), getTensorShape(filter()), getTensorData<int8_t>(filter()),
 343     getTensorShape(bias()), getTensorData<int32_t>(bias()), getTensorShape(output()),
 344     getTensorData<int8_t>(output()));
 345 }
 346
 347 void DepthwiseConv2D::evalQuantizedS16() const
 348 {
 349   const auto *input_data = getTensorData<int16_t>(input());
 350   const auto *filter_data = getTensorData<int16_t>(filter());
 351   const auto *bias_data = getTensorData<int64_t>(bias());
 352   auto *output_data = getTensorData<int16_t>(output());
 353
 354   const Shape &input_shape = input()->shape();
 355   const Shape &filter_shape = filter()->shape();
 356   const Shape &output_shape = output()->shape();
 357
 358   const int32_t batches = input_shape.dim(0);
 359   const int32_t input_height = input_shape.dim(1);
 360   const int32_t input_width = input_shape.dim(2);
 361   const int32_t input_depth = input_shape.dim(3);
 362   const int32_t filter_height = filter_shape.dim(1);
 363   const int32_t filter_width = filter_shape.dim(2);
 364   const int32_t output_height = output_shape.dim(1);
 365   const int32_t output_width = output_shape.dim(2);
 366
 367   const int32_t stride_height = _params.stride_height;
 368   const int32_t stride_width = _params.stride_width;
 369   const int32_t dilation_height_factor = _params.dilation_height_factor;
 370   const int32_t dilation_width_factor = _params.dilation_width_factor;
 371   const int32_t depth_multiplier = _params.depth_multiplier;
 372
 373   const std::vector<double> effective_output_scales =
 374     getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale());
 375
 376   std::vector<ChannelQuantMultipliers> quant_multipliers_raw =
 377     quantizeMultipliers(effective_output_scales);
 378
 379   BroadcastableWrapper<ChannelQuantMultipliers> quant_multipliers(quant_multipliers_raw);
 380
 381   int32_t activation_min{};
 382   int32_t activation_max{};
 383   calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
 384
 385   for (int32_t batch = 0; batch < batches; ++batch)
 386   {
 387     for (int32_t out_y = 0; out_y < output_height; ++out_y)
 388     {
 389       for (int32_t out_x = 0; out_x < output_width; ++out_x)
 390       {
 391         for (int32_t in_c = 0; in_c < input_depth; ++in_c)
 392         {
 393           for (int32_t m = 0; m < depth_multiplier; ++m)
 394           {
 395             const int32_t out_c = m + in_c * depth_multiplier;
 396             const int32_t in_y_origin = out_y * stride_height - _padding_height;
 397             const int32_t in_x_origin = out_x * stride_width - _padding_width;
 398             int64_t acc = 0;
 399             for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y)
 400             {
 401               for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x)
 402               {
 403                 const int32_t in_y = in_y_origin + dilation_height_factor * filter_y;
 404                 const int32_t in_x = in_x_origin + dilation_width_factor * filter_x;
 405                 if ((in_y >= 0 && in_y < input_height) && (in_x >= 0 && in_x < input_width))
 406                 {
 407                   const int16_t input_val =
 408                     input_data[calcOffset(input_shape, batch, in_y, in_x, in_c)];
 409                   const int16_t filter_val =
 410                     filter_data[calcOffset(filter_shape, 0, filter_y, filter_x, out_c)];
 411                   acc += static_cast<int64_t>(input_val) * static_cast<int64_t>(filter_val);
 412                 }
 413               }
 414             }
 415             if (bias_data != nullptr)
 416             {
 417               acc += bias_data[out_c];
 418             }
 419
 420             int32_t output_multiplier = quant_multipliers[out_c].multiplier;
 421             int output_shift = quant_multipliers[out_c].shift;
 422             int32_t scaled_acc =
 423               tflite::MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
 424
 425             scaled_acc = std::max(scaled_acc, activation_min);
 426             scaled_acc = std::min(scaled_acc, activation_max);
 427
 428             output_data[calcOffset(output_shape, batch, out_y, out_x, out_c)] = scaled_acc;
 429           }
 430         }
 431       }
 432     }
 433   }
 434 }
 435
 436 } // namespace kernels
 437 } // namespace luci_interpreter