compiler/luci-interpreter/src/kernels/Conv2D.cpp

   1 /*
   2  * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
   3  * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
   4  *
   5  * Licensed under the Apache License, Version 2.0 (the "License");
   6  * you may not use this file except in compliance with the License.
   7  * You may obtain a copy of the License at
   8  *
   9  *    http://www.apache.org/licenses/LICENSE-2.0
  10  *
  11  * Unless required by applicable law or agreed to in writing, software
  12  * distributed under the License is distributed on an "AS IS" BASIS,
  13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14  * See the License for the specific language governing permissions and
  15  * limitations under the License.
  16  */
  17
  18 #include "kernels/Conv2D.h"
  19
  20 #include "kernels/Utils.h"
  21
  22 #include <tensorflow/lite/kernels/internal/optimized/legacy_optimized_ops.h>
  23
  24 #include <stdexcept>
  25 #include <thread>
  26
  27 namespace luci_interpreter
  28 {
  29 namespace kernels
  30 {
  31
  32 Conv2D::Conv2D(const Tensor *input, const Tensor *filter, const Tensor *bias, Tensor *output,
  33                const Conv2DParams &params)
  34     : KernelWithParams<Conv2DParams>({input, filter, bias}, {output}, params)
  35 {
  36 }
  37
  38 void Conv2D::configure()
  39 {
  40   // TensorFlow Lite (as of v2.2.0) supports the following combinations of types:
  41   //     | input filter bias  output |
  42   // ----+---------------------------+
  43   // (1) | float float  float float  |
  44   // (2) | float int8   float float  | hybrid
  45   // (3) | uint8 uint8  int32 uint8  | quantized
  46   // (4) | int8  int8   int32 int8   | quantized per channel
  47   //
  48   // We only support (1) and (3) for now, and additionally the following:
  49   //     | input filter bias  output |
  50   // ----+---------------------------+
  51   // (5) | int16 int16  int64 int16  |
  52   //
  53   if (input()->element_type() == DataType::FLOAT32 && filter()->element_type() == DataType::FLOAT32)
  54   {
  55     LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::FLOAT32);
  56   }
  57   else if (input()->element_type() == DataType::U8 && filter()->element_type() == DataType::U8)
  58   {
  59     LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::S32);
  60   }
  61   else if (input()->element_type() == DataType::S16 && filter()->element_type() == DataType::S16)
  62   {
  63     LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::S64);
  64   }
  65   else
  66   {
  67     throw std::runtime_error("Unsupported type.");
  68   }
  69   LUCI_INTERPRETER_CHECK(output()->element_type() == input()->element_type());
  70
  71   const Shape &input_shape = input()->shape();
  72   const Shape &filter_shape = filter()->shape();
  73   LUCI_INTERPRETER_CHECK(input_shape.num_dims() == 4 && filter_shape.num_dims() == 4);
  74
  75   const int32_t batches = input_shape.dim(0);
  76   const int32_t input_height = input_shape.dim(1);
  77   const int32_t input_width = input_shape.dim(2);
  78   const int32_t output_depth = filter_shape.dim(0);
  79   const int32_t filter_height = filter_shape.dim(1);
  80   const int32_t filter_width = filter_shape.dim(2);
  81   LUCI_INTERPRETER_CHECK(filter_shape.dim(3) == input_shape.dim(3));
  82
  83   LUCI_INTERPRETER_CHECK(bias() == nullptr || (bias()->shape().num_dims() == 1 &&
  84                                                bias()->shape().dim(0) == output_depth));
  85
  86   const int32_t output_height =
  87       computeOutputSize(_params.padding, input_height, filter_height, _params.stride_height,
  88                         _params.dilation_height_factor);
  89   const int32_t output_width =
  90       computeOutputSize(_params.padding, input_width, filter_width, _params.stride_width,
  91                         _params.dilation_width_factor);
  92
  93   _padding_height = computePadding(_params.stride_height, _params.dilation_height_factor,
  94                                    input_height, filter_height, output_height);
  95   _padding_width = computePadding(_params.stride_width, _params.dilation_width_factor, input_width,
  96                                   filter_width, output_width);
  97
  98   output()->resize({batches, output_height, output_width, output_depth});
  99
 100   // Allocate tensor for Im2Col, if needed.
 101   // The checks here should be aligned with the actual implementation.
 102   const bool need_dilated_im2col =
 103       _params.dilation_height_factor != 1 || _params.dilation_width_factor != 1;
 104   const bool need_non_dilated_im2col = _params.stride_height != 1 || _params.stride_width != 1 ||
 105                                        filter_height != 1 || filter_width != 1;
 106   const bool need_im2col =
 107       input()->element_type() != DataType::S16 && (need_dilated_im2col || need_non_dilated_im2col);
 108   if (need_im2col)
 109   {
 110     const int input_depth = input_shape.dim(3);
 111     Shape im2col_shape{batches, output_height, output_width,
 112                        input_depth * filter_height * filter_width};
 113     try
 114     {
 115       _im2col =
 116           std::make_unique<Tensor>(input()->element_type(), im2col_shape, AffineQuantization{}, "");
 117     }
 118     catch (std::bad_alloc &ba)
 119     {
 120       // Failed memory allocation
 121       _im2col = nullptr;
 122     }
 123   }
 124 }
 125
 126 void Conv2D::execute() const
 127 {
 128   switch (input()->element_type())
 129   {
 130     case DataType::FLOAT32:
 131       if (filter()->element_type() == DataType::FLOAT32)
 132       {
 133         evalFloat();
 134         break;
 135       }
 136       throw std::runtime_error("Unsupported type.");
 137     case DataType::U8:
 138       if (filter()->scales().size() == 1)
 139       {
 140         evalQuantized();
 141       }
 142       else if (filter()->scales().size() > 1)
 143       {
 144         LUCI_INTERPRETER_CHECK(filter()->shape().num_dims() == 4);
 145         LUCI_INTERPRETER_CHECK(filter()->scales().size() ==
 146                                static_cast<size_t>(filter()->shape().dim(0)));
 147         evalQuantizedPerChannel();
 148       }
 149       break;
 150     case DataType::S16:
 151       evalQuantizedS16();
 152       break;
 153     default:
 154       throw std::runtime_error("Unsupported type.");
 155   }
 156   if (!!_im2col)
 157     _im2col->deallocate();
 158 }
 159
 160 void Conv2D::evalFloat() const
 161 {
 162   float activation_min{};
 163   float activation_max{};
 164   calculateActivationRange(_params.activation, &activation_min, &activation_max);
 165
 166   tflite::ConvParams params{};
 167   params.padding_values.height = _padding_height;
 168   params.padding_values.width = _padding_width;
 169   params.stride_height = _params.stride_height;
 170   params.stride_width = _params.stride_width;
 171   params.dilation_height_factor = _params.dilation_height_factor;
 172   params.dilation_width_factor = _params.dilation_width_factor;
 173   params.float_activation_min = activation_min;
 174   params.float_activation_max = activation_max;
 175
 176   if (_im2col)
 177     tflite::optimized_ops::Conv(params, getTensorShape(input()), getTensorData<float>(input()),
 178                                 getTensorShape(filter()), getTensorData<float>(filter()),
 179                                 getTensorShape(bias()), getTensorData<float>(bias()),
 180                                 getTensorShape(output()), getTensorData<float>(output()),
 181                                 getTensorShape(_im2col.get()), getTensorData<float>(_im2col.get()));
 182   else
 183     tflite::reference_ops::Conv(
 184         params, getTensorShape(input()), getTensorData<float>(input()), getTensorShape(filter()),
 185         getTensorData<float>(filter()), getTensorShape(bias()), getTensorData<float>(bias()),
 186         getTensorShape(output()), getTensorData<float>(output()), tflite::RuntimeShape(), nullptr);
 187 }
 188
 189 void Conv2D::evalQuantized() const
 190 {
 191   const auto input_scale = static_cast<double>(input()->scale());
 192   const auto filter_scale = static_cast<double>(filter()->scale());
 193   const auto output_scale = static_cast<double>(output()->scale());
 194
 195   const double real_multiplier = input_scale * filter_scale / output_scale;
 196   int32_t output_multiplier{};
 197   int output_shift{};
 198   quantizeMultiplier(real_multiplier, &output_multiplier, &output_shift);
 199
 200   int32_t activation_min{};
 201   int32_t activation_max{};
 202   calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
 203
 204   tflite::ConvParams params{};
 205   params.padding_values.height = _padding_height;
 206   params.padding_values.width = _padding_width;
 207   params.stride_height = _params.stride_height;
 208   params.stride_width = _params.stride_width;
 209   params.dilation_height_factor = _params.dilation_height_factor;
 210   params.dilation_width_factor = _params.dilation_width_factor;
 211   // The kernel expects input and filter zero points to be negated.
 212   params.input_offset = -input()->zero_point();    // Note the '-'.
 213   params.weights_offset = -filter()->zero_point(); // Note the '-'.
 214   params.output_offset = output()->zero_point();
 215   params.output_multiplier = output_multiplier;
 216   params.output_shift = output_shift;
 217   params.quantized_activation_min = activation_min;
 218   params.quantized_activation_max = activation_max;
 219
 220   // TODO This should only be done once (although it takes only a few microseconds).
 221   //  Also, the user should be able to adjust the number of threads.
 222   auto gemmlowp_context = std::make_unique<gemmlowp::GemmContext>();
 223   gemmlowp_context->set_max_num_threads(static_cast<int>(std::thread::hardware_concurrency()));
 224
 225   tflite::optimized_ops::Conv(
 226       params, getTensorShape(input()), getTensorData<uint8_t>(input()), getTensorShape(filter()),
 227       getTensorData<uint8_t>(filter()), getTensorShape(bias()), getTensorData<int32_t>(bias()),
 228       getTensorShape(output()), getTensorData<uint8_t>(output()), getTensorShape(_im2col.get()),
 229       getTensorData<uint8_t>(_im2col.get()), gemmlowp_context.get());
 230 }
 231
 232 void Conv2D::evalQuantizedPerChannel() const
 233 {
 234   const auto *input_data = getTensorData<uint8_t>(input());
 235   const auto *filter_data = getTensorData<uint8_t>(filter());
 236   const auto *bias_data = getTensorData<int32_t>(bias());
 237   auto *output_data = getTensorData<uint8_t>(output());
 238
 239   const Shape &input_shape = input()->shape();
 240   const Shape &filter_shape = filter()->shape();
 241   const Shape &output_shape = output()->shape();
 242
 243   const int32_t batches = input_shape.dim(0);
 244   const int32_t input_height = input_shape.dim(1);
 245   const int32_t input_width = input_shape.dim(2);
 246   const int32_t input_depth = input_shape.dim(3);
 247   const int32_t output_depth = filter_shape.dim(0);
 248   const int32_t filter_height = filter_shape.dim(1);
 249   const int32_t filter_width = filter_shape.dim(2);
 250   const int32_t output_height = output_shape.dim(1);
 251   const int32_t output_width = output_shape.dim(2);
 252
 253   const int32_t stride_height = _params.stride_height;
 254   const int32_t stride_width = _params.stride_width;
 255   const int32_t dilation_height_factor = _params.dilation_height_factor;
 256   const int32_t dilation_width_factor = _params.dilation_width_factor;
 257
 258   int32_t activation_min{};
 259   int32_t activation_max{};
 260   calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
 261
 262   const std::vector<double> effective_output_scale =
 263       getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale());
 264
 265   const std::vector<ChannelQuantMultipliers> multipliers_raw =
 266       quantizeMultipliers(effective_output_scale);
 267   BroadcastableWrapper<ChannelQuantMultipliers> quant_multipliers(multipliers_raw);
 268
 269   for (int32_t batch = 0; batch < batches; ++batch)
 270   {
 271     for (int32_t out_y = 0; out_y < output_height; ++out_y)
 272     {
 273       for (int32_t out_x = 0; out_x < output_width; ++out_x)
 274       {
 275         for (int32_t out_c = 0; out_c < output_depth; ++out_c)
 276         {
 277           const int32_t in_y_origin = out_y * stride_height - _padding_height;
 278           const int32_t in_x_origin = out_x * stride_width - _padding_width;
 279           int32_t acc = 0;
 280           for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y)
 281           {
 282             for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x)
 283             {
 284               const int32_t in_y = in_y_origin + dilation_height_factor * filter_y;
 285               const int32_t in_x = in_x_origin + dilation_width_factor * filter_x;
 286               if ((in_y >= 0 && in_y < input_height) && (in_x >= 0 && in_x < input_width))
 287               {
 288                 for (int32_t in_c = 0; in_c < input_depth; ++in_c)
 289                 {
 290                   const uint8_t input_val =
 291                       input_data[calcOffset(input_shape, batch, in_y, in_x, in_c)];
 292                   const uint8_t filter_val =
 293                       filter_data[calcOffset(filter_shape, out_c, filter_y, filter_x, in_c)];
 294                   acc += static_cast<int32_t>(input_val - input()->zero_point()) *
 295                          static_cast<int32_t>(filter_val - filter()->zero_points()[out_c]);
 296                 }
 297               }
 298             }
 299           }
 300           if (bias_data)
 301           {
 302             acc += bias_data[out_c];
 303           }
 304
 305           int32_t scaled_acc = tflite::MultiplyByQuantizedMultiplier(
 306               acc, quant_multipliers[out_c].multiplier, quant_multipliers[out_c].shift);
 307
 308           scaled_acc += output()->zero_point();
 309           scaled_acc = std::max(scaled_acc, activation_min);
 310           scaled_acc = std::min(scaled_acc, activation_max);
 311           output_data[calcOffset(output_shape, batch, out_y, out_x, out_c)] = scaled_acc;
 312         }
 313       }
 314     }
 315   }
 316 }
 317
 318 void Conv2D::evalQuantizedS16() const
 319 {
 320   const auto *input_data = getTensorData<int16_t>(input());
 321   const auto *filter_data = getTensorData<int16_t>(filter());
 322   const auto *bias_data = getTensorData<int64_t>(bias());
 323   auto *output_data = getTensorData<int16_t>(output());
 324
 325   const Shape &input_shape = input()->shape();
 326   const Shape &filter_shape = filter()->shape();
 327   const Shape &output_shape = output()->shape();
 328
 329   const int32_t batches = input_shape.dim(0);
 330   const int32_t input_height = input_shape.dim(1);
 331   const int32_t input_width = input_shape.dim(2);
 332   const int32_t input_depth = input_shape.dim(3);
 333   const int32_t output_depth = filter_shape.dim(0);
 334   const int32_t filter_height = filter_shape.dim(1);
 335   const int32_t filter_width = filter_shape.dim(2);
 336   const int32_t output_height = output_shape.dim(1);
 337   const int32_t output_width = output_shape.dim(2);
 338
 339   const int32_t stride_height = _params.stride_height;
 340   const int32_t stride_width = _params.stride_width;
 341   const int32_t dilation_height_factor = _params.dilation_height_factor;
 342   const int32_t dilation_width_factor = _params.dilation_width_factor;
 343
 344   int32_t activation_min{};
 345   int32_t activation_max{};
 346   calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
 347
 348   const std::vector<double> effective_output_scale =
 349       getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale());
 350
 351   const std::vector<ChannelQuantMultipliers> multipliers_raw =
 352       quantizeMultipliers(effective_output_scale);
 353   BroadcastableWrapper<ChannelQuantMultipliers> multipliers(multipliers_raw);
 354
 355   for (int32_t batch = 0; batch < batches; ++batch)
 356   {
 357     for (int32_t out_y = 0; out_y < output_height; ++out_y)
 358     {
 359       for (int32_t out_x = 0; out_x < output_width; ++out_x)
 360       {
 361         for (int32_t out_c = 0; out_c < output_depth; ++out_c)
 362         {
 363           const int32_t in_y_origin = out_y * stride_height - _padding_height;
 364           const int32_t in_x_origin = out_x * stride_width - _padding_width;
 365           int64_t acc = 0;
 366           for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y)
 367           {
 368             for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x)
 369             {
 370               const int32_t in_y = in_y_origin + dilation_height_factor * filter_y;
 371               const int32_t in_x = in_x_origin + dilation_width_factor * filter_x;
 372               if ((in_y >= 0 && in_y < input_height) && (in_x >= 0 && in_x < input_width))
 373               {
 374                 for (int32_t in_c = 0; in_c < input_depth; ++in_c)
 375                 {
 376                   const int16_t input_val =
 377                       input_data[calcOffset(input_shape, batch, in_y, in_x, in_c)];
 378                   const int16_t filter_val =
 379                       filter_data[calcOffset(filter_shape, out_c, filter_y, filter_x, in_c)];
 380                   acc += static_cast<int64_t>(input_val) * static_cast<int64_t>(filter_val);
 381                 }
 382               }
 383             }
 384           }
 385           if (bias_data)
 386           {
 387             acc += bias_data[out_c];
 388           }
 389
 390           int32_t scaled_acc = tflite::MultiplyByQuantizedMultiplier(
 391               acc, multipliers[out_c].multiplier, multipliers[out_c].shift);
 392
 393           scaled_acc = std::max(scaled_acc, activation_min);
 394           scaled_acc = std::min(scaled_acc, activation_max);
 395
 396           output_data[calcOffset(output_shape, batch, out_y, out_x, out_c)] = scaled_acc;
 397         }
 398       }
 399     }
 400   }
 401 }
 402
 403 } // namespace kernels
 404 } // namespace luci_interpreter