compiler/luci-interpreter/src/kernels/Conv2D.cpp

   1 /*
   2  * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
   3  * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
   4  *
   5  * Licensed under the Apache License, Version 2.0 (the "License");
   6  * you may not use this file except in compliance with the License.
   7  * You may obtain a copy of the License at
   8  *
   9  *    http://www.apache.org/licenses/LICENSE-2.0
  10  *
  11  * Unless required by applicable law or agreed to in writing, software
  12  * distributed under the License is distributed on an "AS IS" BASIS,
  13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14  * See the License for the specific language governing permissions and
  15  * limitations under the License.
  16  */
  17
  18 #include "kernels/Conv2D.h"
  19
  20 #include "kernels/Utils.h"
  21
  22 #include "PALConv2d.h"
  23
  24 #include <stdexcept>
  25 #include <thread>
  26
  27 namespace luci_interpreter
  28 {
  29 namespace kernels
  30 {
  31
  32 Conv2D::Conv2D(const Tensor *input, const Tensor *filter, const Tensor *bias, Tensor *output,
  33                Tensor *im2col, const Conv2DParams &params)
  34   : KernelWithParams<Conv2DParams>({input, filter, bias}, {output, im2col}, params)
  35 {
  36 }
  37
  38 void Conv2D::configure()
  39 {
  40   // TensorFlow Lite (as of v2.2.0) supports the following combinations of types:
  41   //     | input filter bias  output |
  42   // ----+---------------------------+
  43   // (1) | float float  float float  |
  44   // (2) | float int8   float float  | hybrid
  45   // (3) | uint8 uint8  int32 uint8  | quantized
  46   // (4) | int8  int8   int32 int8   | quantized per channel
  47   //
  48   // We only support (1), (3) and (4) for now, and additionally the following:
  49   //     | input filter bias  output |
  50   // ----+---------------------------+
  51   // (5) | int16 int16  int64 int16  |
  52   //
  53   if (input()->element_type() == DataType::FLOAT32 && filter()->element_type() == DataType::FLOAT32)
  54   {
  55     LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::FLOAT32);
  56   }
  57   else if (input()->element_type() == DataType::U8 && filter()->element_type() == DataType::U8)
  58   {
  59     LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::S32);
  60   }
  61   else if (input()->element_type() == DataType::S8 && filter()->element_type() == DataType::S8)
  62   {
  63     LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::S32);
  64     LUCI_INTERPRETER_CHECK(filter()->shape().num_dims() == 4);
  65     LUCI_INTERPRETER_CHECK(filter()->scales().size() ==
  66                            static_cast<size_t>(filter()->shape().dim(0)));
  67     for (auto zerop : filter()->zero_points())
  68     {
  69       LUCI_INTERPRETER_CHECK(zerop == 0);
  70     }
  71   }
  72   else if (input()->element_type() == DataType::S16 && filter()->element_type() == DataType::S16)
  73   {
  74     LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::S64);
  75   }
  76   else
  77   {
  78     throw std::runtime_error("Unsupported type.");
  79   }
  80   LUCI_INTERPRETER_CHECK(output()->element_type() == input()->element_type());
  81
  82   const Shape &input_shape = input()->shape();
  83   const Shape &filter_shape = filter()->shape();
  84   LUCI_INTERPRETER_CHECK(input_shape.num_dims() == 4 && filter_shape.num_dims() == 4);
  85
  86   const int32_t batches = input_shape.dim(0);
  87   const int32_t input_height = input_shape.dim(1);
  88   const int32_t input_width = input_shape.dim(2);
  89   const int32_t output_depth = filter_shape.dim(0);
  90   const int32_t filter_height = filter_shape.dim(1);
  91   const int32_t filter_width = filter_shape.dim(2);
  92   LUCI_INTERPRETER_CHECK(filter_shape.dim(3) == input_shape.dim(3));
  93
  94   LUCI_INTERPRETER_CHECK(bias() == nullptr || (bias()->shape().num_dims() == 1 &&
  95                                                bias()->shape().dim(0) == output_depth));
  96
  97   const int32_t output_height =
  98     computeOutputSize(_params.padding, input_height, filter_height, _params.stride_height,
  99                       _params.dilation_height_factor);
 100   const int32_t output_width =
 101     computeOutputSize(_params.padding, input_width, filter_width, _params.stride_width,
 102                       _params.dilation_width_factor);
 103
 104   _padding_height = computePadding(_params.stride_height, _params.dilation_height_factor,
 105                                    input_height, filter_height, output_height);
 106   _padding_width = computePadding(_params.stride_width, _params.dilation_width_factor, input_width,
 107                                   filter_width, output_width);
 108
 109   output()->resize({batches, output_height, output_width, output_depth});
 110
 111   // Allocate tensor for Im2Col, if needed.
 112   // The checks here should be aligned with the actual implementation.
 113   const bool need_dilated_im2col =
 114     _params.dilation_height_factor != 1 || _params.dilation_width_factor != 1;
 115   const bool need_non_dilated_im2col = _params.stride_height != 1 || _params.stride_width != 1 ||
 116                                        filter_height != 1 || filter_width != 1;
 117   _need_im2col =
 118     input()->element_type() != DataType::S16 && (need_dilated_im2col || need_non_dilated_im2col);
 119   if (_need_im2col)
 120   {
 121     const int input_depth = input_shape.dim(3);
 122     Shape im2col_shape{batches, output_height, output_width,
 123                        input_depth * filter_height * filter_width};
 124     auto im2col = getOutputTensors()[1];
 125     im2col->resize(im2col_shape);
 126   }
 127   else
 128   {
 129     auto im2col = getOutputTensors()[1];
 130     im2col->set_allocatable(false);
 131   }
 132 }
 133
 134 void Conv2D::execute() const
 135 {
 136   switch (input()->element_type())
 137   {
 138     case DataType::FLOAT32:
 139       if (filter()->element_type() == DataType::FLOAT32)
 140       {
 141         evalFloat();
 142         break;
 143       }
 144       throw std::runtime_error("Unsupported type.");
 145     case DataType::U8:
 146       if (filter()->scales().size() == 1)
 147       {
 148         evalQuantized();
 149       }
 150       else if (filter()->scales().size() > 1)
 151       {
 152         LUCI_INTERPRETER_CHECK(filter()->shape().num_dims() == 4);
 153         LUCI_INTERPRETER_CHECK(filter()->scales().size() ==
 154                                static_cast<size_t>(filter()->shape().dim(0)));
 155         evalQuantizedPerChannel();
 156       }
 157       break;
 158     case DataType::S8:
 159       evalQuantizedS8PerChannel();
 160       break;
 161     case DataType::S16:
 162       evalQuantizedS16();
 163       break;
 164     default:
 165       throw std::runtime_error("Unsupported type.");
 166   }
 167 }
 168
 169 void Conv2D::evalFloat() const
 170 {
 171   float activation_min{};
 172   float activation_max{};
 173   calculateActivationRange(_params.activation, &activation_min, &activation_max);
 174
 175   tflite::ConvParams params{};
 176   params.padding_values.height = _padding_height;
 177   params.padding_values.width = _padding_width;
 178   params.stride_height = _params.stride_height;
 179   params.stride_width = _params.stride_width;
 180   params.dilation_height_factor = _params.dilation_height_factor;
 181   params.dilation_width_factor = _params.dilation_width_factor;
 182   params.float_activation_min = activation_min;
 183   params.float_activation_max = activation_max;
 184
 185   float *im2col_data = nullptr;
 186   auto im2col = getOutputTensors()[1];
 187   if (_need_im2col)
 188   {
 189     im2col_data = im2col->data<float>();
 190   }
 191   luci_interpreter_pal::Conv(
 192     params, getTensorShape(input()), getTensorData<float>(input()), getTensorShape(filter()),
 193     getTensorData<float>(filter()), getTensorShape(bias()), getTensorData<float>(bias()),
 194     getTensorShape(output()), getTensorData<float>(output()), getTensorShape(im2col), im2col_data);
 195 }
 196
 197 void Conv2D::evalQuantized() const
 198 {
 199   const auto input_scale = static_cast<double>(input()->scale());
 200   const auto filter_scale = static_cast<double>(filter()->scale());
 201   const auto output_scale = static_cast<double>(output()->scale());
 202
 203   const double real_multiplier = input_scale * filter_scale / output_scale;
 204   int32_t output_multiplier{};
 205   int output_shift{};
 206   quantizeMultiplier(real_multiplier, &output_multiplier, &output_shift);
 207
 208   int32_t activation_min{};
 209   int32_t activation_max{};
 210   calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
 211
 212   tflite::ConvParams params{};
 213   params.padding_values.height = _padding_height;
 214   params.padding_values.width = _padding_width;
 215   params.stride_height = _params.stride_height;
 216   params.stride_width = _params.stride_width;
 217   params.dilation_height_factor = _params.dilation_height_factor;
 218   params.dilation_width_factor = _params.dilation_width_factor;
 219   // The kernel expects input and filter zero points to be negated.
 220   params.input_offset = -input()->zero_point();    // Note the '-'.
 221   params.weights_offset = -filter()->zero_point(); // Note the '-'.
 222   params.output_offset = output()->zero_point();
 223   params.output_multiplier = output_multiplier;
 224   params.output_shift = output_shift;
 225   params.quantized_activation_min = activation_min;
 226   params.quantized_activation_max = activation_max;
 227
 228   auto im2col = getOutputTensors()[1];
 229   luci_interpreter_pal::Conv(params, getTensorShape(input()), getTensorData<uint8_t>(input()),
 230                              getTensorShape(filter()), getTensorData<uint8_t>(filter()),
 231                              getTensorShape(bias()), getTensorData<int32_t>(bias()),
 232                              getTensorShape(output()), getTensorData<uint8_t>(output()),
 233                              getTensorShape(im2col), getTensorData<uint8_t>(im2col));
 234 }
 235
 236 void Conv2D::evalQuantizedPerChannel() const
 237 {
 238   const auto *input_data = getTensorData<uint8_t>(input());
 239   const auto *filter_data = getTensorData<uint8_t>(filter());
 240   const auto *bias_data = getTensorData<int32_t>(bias());
 241   auto *output_data = getTensorData<uint8_t>(output());
 242
 243   const Shape &input_shape = input()->shape();
 244   const Shape &filter_shape = filter()->shape();
 245   const Shape &output_shape = output()->shape();
 246
 247   const int32_t batches = input_shape.dim(0);
 248   const int32_t input_height = input_shape.dim(1);
 249   const int32_t input_width = input_shape.dim(2);
 250   const int32_t input_depth = input_shape.dim(3);
 251   const int32_t output_depth = filter_shape.dim(0);
 252   const int32_t filter_height = filter_shape.dim(1);
 253   const int32_t filter_width = filter_shape.dim(2);
 254   const int32_t output_height = output_shape.dim(1);
 255   const int32_t output_width = output_shape.dim(2);
 256
 257   const int32_t stride_height = _params.stride_height;
 258   const int32_t stride_width = _params.stride_width;
 259   const int32_t dilation_height_factor = _params.dilation_height_factor;
 260   const int32_t dilation_width_factor = _params.dilation_width_factor;
 261
 262   int32_t activation_min{};
 263   int32_t activation_max{};
 264   calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
 265
 266   const std::vector<double> effective_output_scale =
 267     getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale());
 268
 269   const std::vector<ChannelQuantMultipliers> multipliers_raw =
 270     quantizeMultipliers(effective_output_scale);
 271   BroadcastableWrapper<ChannelQuantMultipliers> quant_multipliers(multipliers_raw);
 272
 273   for (int32_t batch = 0; batch < batches; ++batch)
 274   {
 275     for (int32_t out_y = 0; out_y < output_height; ++out_y)
 276     {
 277       for (int32_t out_x = 0; out_x < output_width; ++out_x)
 278       {
 279         for (int32_t out_c = 0; out_c < output_depth; ++out_c)
 280         {
 281           const int32_t in_y_origin = out_y * stride_height - _padding_height;
 282           const int32_t in_x_origin = out_x * stride_width - _padding_width;
 283           int32_t acc = 0;
 284           for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y)
 285           {
 286             for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x)
 287             {
 288               const int32_t in_y = in_y_origin + dilation_height_factor * filter_y;
 289               const int32_t in_x = in_x_origin + dilation_width_factor * filter_x;
 290               if ((in_y >= 0 && in_y < input_height) && (in_x >= 0 && in_x < input_width))
 291               {
 292                 for (int32_t in_c = 0; in_c < input_depth; ++in_c)
 293                 {
 294                   const uint8_t input_val =
 295                     input_data[calcOffset(input_shape, batch, in_y, in_x, in_c)];
 296                   const uint8_t filter_val =
 297                     filter_data[calcOffset(filter_shape, out_c, filter_y, filter_x, in_c)];
 298                   acc += static_cast<int32_t>(input_val - input()->zero_point()) *
 299                          static_cast<int32_t>(filter_val - filter()->zero_points()[out_c]);
 300                 }
 301               }
 302             }
 303           }
 304           if (bias_data)
 305           {
 306             acc += bias_data[out_c];
 307           }
 308
 309           int32_t scaled_acc = tflite::MultiplyByQuantizedMultiplier(
 310             acc, quant_multipliers[out_c].multiplier, quant_multipliers[out_c].shift);
 311
 312           scaled_acc += output()->zero_point();
 313           scaled_acc = std::max(scaled_acc, activation_min);
 314           scaled_acc = std::min(scaled_acc, activation_max);
 315           output_data[calcOffset(output_shape, batch, out_y, out_x, out_c)] = scaled_acc;
 316         }
 317       }
 318     }
 319   }
 320 }
 321
 322 void Conv2D::evalQuantizedS8PerChannel() const
 323 {
 324   int32_t activation_min{};
 325   int32_t activation_max{};
 326   calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
 327
 328   tflite::ConvParams params{};
 329   params.padding_values.height = _padding_height;
 330   params.padding_values.width = _padding_width;
 331   params.stride_height = _params.stride_height;
 332   params.stride_width = _params.stride_width;
 333   params.dilation_height_factor = _params.dilation_height_factor;
 334   params.dilation_width_factor = _params.dilation_width_factor;
 335   // The kernel expects filter zero points to be negated.
 336   params.input_offset = -input()->zero_point(); // Note the '-'.
 337   params.weights_offset = 0;                    // Unused in tflite code
 338   params.output_offset = output()->zero_point();
 339   params.quantized_activation_min = activation_min;
 340   params.quantized_activation_max = activation_max;
 341
 342   const std::vector<double> effective_output_scales =
 343     getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale());
 344
 345   std::vector<ChannelQuantMultipliers> quant_multipliers =
 346     quantizeMultipliers(effective_output_scales);
 347
 348   std::vector<int32_t> shifts;
 349   std::transform(quant_multipliers.begin(), quant_multipliers.end(), std::back_inserter(shifts),
 350                  [](ChannelQuantMultipliers cm) { return cm.shift; });
 351   std::vector<int32_t> multipliers;
 352   std::transform(quant_multipliers.begin(), quant_multipliers.end(),
 353                  std::back_inserter(multipliers),
 354                  [](ChannelQuantMultipliers cm) { return cm.multiplier; });
 355
 356   int8_t *im2col_data = nullptr;
 357   auto im2col = getOutputTensors()[1];
 358   if (_need_im2col)
 359   {
 360     im2col_data = im2col->data<int8_t>();
 361   }
 362
 363   luci_interpreter_pal::ConvPerChannel(
 364     params, multipliers.data(), shifts.data(), getTensorShape(input()),
 365     getTensorData<int8_t>(input()), getTensorShape(filter()), getTensorData<int8_t>(filter()),
 366     getTensorShape(bias()), getTensorData<int32_t>(bias()), getTensorShape(output()),
 367     getTensorData<int8_t>(output()), getTensorShape(im2col), im2col_data);
 368 }
 369
 370 void Conv2D::evalQuantizedS16() const
 371 {
 372   const auto *input_data = getTensorData<int16_t>(input());
 373   const auto *filter_data = getTensorData<int16_t>(filter());
 374   const auto *bias_data = getTensorData<int64_t>(bias());
 375   auto *output_data = getTensorData<int16_t>(output());
 376
 377   const Shape &input_shape = input()->shape();
 378   const Shape &filter_shape = filter()->shape();
 379   const Shape &output_shape = output()->shape();
 380
 381   const int32_t batches = input_shape.dim(0);
 382   const int32_t input_height = input_shape.dim(1);
 383   const int32_t input_width = input_shape.dim(2);
 384   const int32_t input_depth = input_shape.dim(3);
 385   const int32_t output_depth = filter_shape.dim(0);
 386   const int32_t filter_height = filter_shape.dim(1);
 387   const int32_t filter_width = filter_shape.dim(2);
 388   const int32_t output_height = output_shape.dim(1);
 389   const int32_t output_width = output_shape.dim(2);
 390
 391   const int32_t stride_height = _params.stride_height;
 392   const int32_t stride_width = _params.stride_width;
 393   const int32_t dilation_height_factor = _params.dilation_height_factor;
 394   const int32_t dilation_width_factor = _params.dilation_width_factor;
 395
 396   int32_t activation_min{};
 397   int32_t activation_max{};
 398   calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
 399
 400   const std::vector<double> effective_output_scale =
 401     getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale());
 402
 403   const std::vector<ChannelQuantMultipliers> multipliers_raw =
 404     quantizeMultipliers(effective_output_scale);
 405   BroadcastableWrapper<ChannelQuantMultipliers> multipliers(multipliers_raw);
 406
 407   for (int32_t batch = 0; batch < batches; ++batch)
 408   {
 409     for (int32_t out_y = 0; out_y < output_height; ++out_y)
 410     {
 411       for (int32_t out_x = 0; out_x < output_width; ++out_x)
 412       {
 413         for (int32_t out_c = 0; out_c < output_depth; ++out_c)
 414         {
 415           const int32_t in_y_origin = out_y * stride_height - _padding_height;
 416           const int32_t in_x_origin = out_x * stride_width - _padding_width;
 417           int64_t acc = 0;
 418           for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y)
 419           {
 420             for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x)
 421             {
 422               const int32_t in_y = in_y_origin + dilation_height_factor * filter_y;
 423               const int32_t in_x = in_x_origin + dilation_width_factor * filter_x;
 424               if ((in_y >= 0 && in_y < input_height) && (in_x >= 0 && in_x < input_width))
 425               {
 426                 for (int32_t in_c = 0; in_c < input_depth; ++in_c)
 427                 {
 428                   const int16_t input_val =
 429                     input_data[calcOffset(input_shape, batch, in_y, in_x, in_c)];
 430                   const int16_t filter_val =
 431                     filter_data[calcOffset(filter_shape, out_c, filter_y, filter_x, in_c)];
 432                   acc += static_cast<int64_t>(input_val) * static_cast<int64_t>(filter_val);
 433                 }
 434               }
 435             }
 436           }
 437           if (bias_data)
 438           {
 439             acc += bias_data[out_c];
 440           }
 441
 442           int32_t scaled_acc = tflite::MultiplyByQuantizedMultiplier(
 443             acc, multipliers[out_c].multiplier, multipliers[out_c].shift);
 444
 445           scaled_acc = std::max(scaled_acc, activation_min);
 446           scaled_acc = std::min(scaled_acc, activation_max);
 447
 448           output_data[calcOffset(output_shape, batch, out_y, out_x, out_c)] = scaled_acc;
 449         }
 450       }
 451     }
 452   }
 453 }
 454
 455 } // namespace kernels
 456 } // namespace luci_interpreter