compiler/luci-interpreter/src/kernels/Conv2D.cpp

   1 /*
   2  * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
   3  * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
   4  *
   5  * Licensed under the Apache License, Version 2.0 (the "License");
   6  * you may not use this file except in compliance with the License.
   7  * You may obtain a copy of the License at
   8  *
   9  *    http://www.apache.org/licenses/LICENSE-2.0
  10  *
  11  * Unless required by applicable law or agreed to in writing, software
  12  * distributed under the License is distributed on an "AS IS" BASIS,
  13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14  * See the License for the specific language governing permissions and
  15  * limitations under the License.
  16  */
  17
  18 #include "kernels/Conv2D.h"
  19
  20 #include "kernels/Utils.h"
  21
  22 #include <tensorflow/lite/kernels/internal/optimized/legacy_optimized_ops.h>
  23
  24 #include <stdexcept>
  25 #include <thread>
  26
  27 namespace luci_interpreter
  28 {
  29 namespace kernels
  30 {
  31
  32 Conv2D::Conv2D(const Tensor *input, const Tensor *filter, const Tensor *bias, Tensor *output,
  33                const Conv2DParams &params)
  34     : KernelWithParams<Conv2DParams>({input, filter, bias}, {output}, params)
  35 {
  36 }
  37
  38 void Conv2D::configure()
  39 {
  40   // TensorFlow Lite (as of v2.2.0) supports the following combinations of types:
  41   //     | input filter bias  output |
  42   // ----+---------------------------+
  43   // (1) | float float  float float  |
  44   // (2) | float int8   float float  | hybrid
  45   // (3) | uint8 uint8  int32 uint8  | quantized
  46   // (4) | int8  int8   int32 int8   | quantized per channel
  47   //
  48   // We only support (1) and (3) for now, and additionally the following:
  49   //     | input filter bias  output |
  50   // ----+---------------------------+
  51   // (5) | int16 int16  int64 int16  |
  52   //
  53   if (input()->element_type() == DataType::FLOAT32 && filter()->element_type() == DataType::FLOAT32)
  54   {
  55     LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::FLOAT32);
  56   }
  57   else if (input()->element_type() == DataType::U8 && filter()->element_type() == DataType::U8)
  58   {
  59     LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::S32);
  60   }
  61   else if (input()->element_type() == DataType::S16 && filter()->element_type() == DataType::S16)
  62   {
  63     LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::S64);
  64   }
  65   else
  66   {
  67     throw std::runtime_error("Unsupported type.");
  68   }
  69   LUCI_INTERPRETER_CHECK(output()->element_type() == input()->element_type());
  70
  71   const Shape &input_shape = input()->shape();
  72   const Shape &filter_shape = filter()->shape();
  73   LUCI_INTERPRETER_CHECK(input_shape.num_dims() == 4 && filter_shape.num_dims() == 4);
  74
  75   const int32_t batches = input_shape.dim(0);
  76   const int32_t input_height = input_shape.dim(1);
  77   const int32_t input_width = input_shape.dim(2);
  78   const int32_t output_depth = filter_shape.dim(0);
  79   const int32_t filter_height = filter_shape.dim(1);
  80   const int32_t filter_width = filter_shape.dim(2);
  81   LUCI_INTERPRETER_CHECK(filter_shape.dim(3) == input_shape.dim(3));
  82
  83   LUCI_INTERPRETER_CHECK(bias() == nullptr || (bias()->shape().num_dims() == 1 &&
  84                                                bias()->shape().dim(0) == output_depth));
  85
  86   const int32_t output_height =
  87       computeOutputSize(_params.padding, input_height, filter_height, _params.stride_height,
  88                         _params.dilation_height_factor);
  89   const int32_t output_width =
  90       computeOutputSize(_params.padding, input_width, filter_width, _params.stride_width,
  91                         _params.dilation_width_factor);
  92
  93   _padding_height = computePadding(_params.stride_height, _params.dilation_height_factor,
  94                                    input_height, filter_height, output_height);
  95   _padding_width = computePadding(_params.stride_width, _params.dilation_width_factor, input_width,
  96                                   filter_width, output_width);
  97
  98   output()->resize({batches, output_height, output_width, output_depth});
  99
 100   // Allocate tensor for Im2Col, if needed.
 101   // The checks here should be aligned with the actual implementation.
 102   const bool need_dilated_im2col =
 103       _params.dilation_height_factor != 1 || _params.dilation_width_factor != 1;
 104   const bool need_non_dilated_im2col = _params.stride_height != 1 || _params.stride_width != 1 ||
 105                                        filter_height != 1 || filter_width != 1;
 106   const bool need_im2col =
 107       input()->element_type() != DataType::S16 && (need_dilated_im2col || need_non_dilated_im2col);
 108   if (need_im2col)
 109   {
 110     const int input_depth = input_shape.dim(3);
 111     Shape im2col_shape{batches, output_height, output_width,
 112                        input_depth * filter_height * filter_width};
 113     try
 114     {
 115       _im2col =
 116           std::make_unique<Tensor>(input()->element_type(), im2col_shape, AffineQuantization{}, "");
 117     }
 118     catch (std::bad_alloc &ba)
 119     {
 120       // Failed memory allocation
 121       _im2col = nullptr;
 122     }
 123   }
 124 }
 125
 126 void Conv2D::execute() const
 127 {
 128   switch (input()->element_type())
 129   {
 130     case DataType::FLOAT32:
 131       if (filter()->element_type() == DataType::FLOAT32)
 132       {
 133         evalFloat();
 134         break;
 135       }
 136       throw std::runtime_error("Unsupported type.");
 137     case DataType::U8:
 138       evalQuantized();
 139       break;
 140     case DataType::S16:
 141       evalQuantizedS16();
 142       break;
 143     default:
 144       throw std::runtime_error("Unsupported type.");
 145   }
 146   if (!!_im2col)
 147     _im2col->deallocate();
 148 }
 149
 150 void Conv2D::evalFloat() const
 151 {
 152   float activation_min{};
 153   float activation_max{};
 154   calculateActivationRange(_params.activation, &activation_min, &activation_max);
 155
 156   tflite::ConvParams params{};
 157   params.padding_values.height = _padding_height;
 158   params.padding_values.width = _padding_width;
 159   params.stride_height = _params.stride_height;
 160   params.stride_width = _params.stride_width;
 161   params.dilation_height_factor = _params.dilation_height_factor;
 162   params.dilation_width_factor = _params.dilation_width_factor;
 163   params.float_activation_min = activation_min;
 164   params.float_activation_max = activation_max;
 165
 166   if (_im2col)
 167     tflite::optimized_ops::Conv(params, getTensorShape(input()), getTensorData<float>(input()),
 168                                 getTensorShape(filter()), getTensorData<float>(filter()),
 169                                 getTensorShape(bias()), getTensorData<float>(bias()),
 170                                 getTensorShape(output()), getTensorData<float>(output()),
 171                                 getTensorShape(_im2col.get()), getTensorData<float>(_im2col.get()));
 172   else
 173     tflite::reference_ops::Conv(
 174         params, getTensorShape(input()), getTensorData<float>(input()), getTensorShape(filter()),
 175         getTensorData<float>(filter()), getTensorShape(bias()), getTensorData<float>(bias()),
 176         getTensorShape(output()), getTensorData<float>(output()), tflite::RuntimeShape(), nullptr);
 177 }
 178
 179 void Conv2D::evalQuantized() const
 180 {
 181   const auto input_scale = static_cast<double>(input()->scale());
 182   const auto filter_scale = static_cast<double>(filter()->scale());
 183   const auto output_scale = static_cast<double>(output()->scale());
 184
 185   const double real_multiplier = input_scale * filter_scale / output_scale;
 186   int32_t output_multiplier{};
 187   int output_shift{};
 188   quantizeMultiplier(real_multiplier, &output_multiplier, &output_shift);
 189
 190   int32_t activation_min{};
 191   int32_t activation_max{};
 192   calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
 193
 194   tflite::ConvParams params{};
 195   params.padding_values.height = _padding_height;
 196   params.padding_values.width = _padding_width;
 197   params.stride_height = _params.stride_height;
 198   params.stride_width = _params.stride_width;
 199   params.dilation_height_factor = _params.dilation_height_factor;
 200   params.dilation_width_factor = _params.dilation_width_factor;
 201   // The kernel expects input and filter zero points to be negated.
 202   params.input_offset = -input()->zero_point();    // Note the '-'.
 203   params.weights_offset = -filter()->zero_point(); // Note the '-'.
 204   params.output_offset = output()->zero_point();
 205   params.output_multiplier = output_multiplier;
 206   params.output_shift = output_shift;
 207   params.quantized_activation_min = activation_min;
 208   params.quantized_activation_max = activation_max;
 209
 210   // TODO This should only be done once (although it takes only a few microseconds).
 211   //  Also, the user should be able to adjust the number of threads.
 212   auto gemmlowp_context = std::make_unique<gemmlowp::GemmContext>();
 213   gemmlowp_context->set_max_num_threads(static_cast<int>(std::thread::hardware_concurrency()));
 214
 215   tflite::optimized_ops::Conv(
 216       params, getTensorShape(input()), getTensorData<uint8_t>(input()), getTensorShape(filter()),
 217       getTensorData<uint8_t>(filter()), getTensorShape(bias()), getTensorData<int32_t>(bias()),
 218       getTensorShape(output()), getTensorData<uint8_t>(output()), getTensorShape(_im2col.get()),
 219       getTensorData<uint8_t>(_im2col.get()), gemmlowp_context.get());
 220 }
 221
 222 void Conv2D::evalQuantizedS16() const
 223 {
 224   const auto *input_data = getTensorData<int16_t>(input());
 225   const auto *filter_data = getTensorData<int16_t>(filter());
 226   const auto *bias_data = getTensorData<int64_t>(bias());
 227   auto *output_data = getTensorData<int16_t>(output());
 228
 229   const Shape &input_shape = input()->shape();
 230   const Shape &filter_shape = filter()->shape();
 231   const Shape &output_shape = output()->shape();
 232
 233   const int32_t batches = input_shape.dim(0);
 234   const int32_t input_height = input_shape.dim(1);
 235   const int32_t input_width = input_shape.dim(2);
 236   const int32_t input_depth = input_shape.dim(3);
 237   const int32_t output_depth = filter_shape.dim(0);
 238   const int32_t filter_height = filter_shape.dim(1);
 239   const int32_t filter_width = filter_shape.dim(2);
 240   const int32_t output_height = output_shape.dim(1);
 241   const int32_t output_width = output_shape.dim(2);
 242
 243   const int32_t stride_height = _params.stride_height;
 244   const int32_t stride_width = _params.stride_width;
 245   const int32_t dilation_height_factor = _params.dilation_height_factor;
 246   const int32_t dilation_width_factor = _params.dilation_width_factor;
 247
 248   int32_t activation_min{};
 249   int32_t activation_max{};
 250   calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
 251
 252   const std::vector<double> effective_output_scale =
 253       getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale());
 254
 255   const std::vector<ChannelQuantMultipliers> multipliers_raw =
 256       quantizeMultipliers(effective_output_scale);
 257   BroadcastableWrapper<ChannelQuantMultipliers> multipliers(multipliers_raw);
 258
 259   for (int32_t batch = 0; batch < batches; ++batch)
 260   {
 261     for (int32_t out_y = 0; out_y < output_height; ++out_y)
 262     {
 263       for (int32_t out_x = 0; out_x < output_width; ++out_x)
 264       {
 265         for (int32_t out_c = 0; out_c < output_depth; ++out_c)
 266         {
 267           const int32_t in_y_origin = out_y * stride_height - _padding_height;
 268           const int32_t in_x_origin = out_x * stride_width - _padding_width;
 269           int64_t acc = 0;
 270           for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y)
 271           {
 272             for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x)
 273             {
 274               const int32_t in_y = in_y_origin + dilation_height_factor * filter_y;
 275               const int32_t in_x = in_x_origin + dilation_width_factor * filter_x;
 276               if ((in_y >= 0 && in_y < input_height) && (in_x >= 0 && in_x < input_width))
 277               {
 278                 for (int32_t in_c = 0; in_c < input_depth; ++in_c)
 279                 {
 280                   const int16_t input_val =
 281                       input_data[calcOffset(input_shape, batch, in_y, in_x, in_c)];
 282                   const int16_t filter_val =
 283                       filter_data[calcOffset(filter_shape, out_c, filter_y, filter_x, in_c)];
 284                   acc += static_cast<int64_t>(input_val) * static_cast<int64_t>(filter_val);
 285                 }
 286               }
 287             }
 288           }
 289           if (bias_data)
 290           {
 291             acc += bias_data[out_c];
 292           }
 293
 294           int32_t scaled_acc = tflite::MultiplyByQuantizedMultiplier(
 295               acc, multipliers[out_c].multiplier, multipliers[out_c].shift);
 296
 297           scaled_acc = std::max(scaled_acc, activation_min);
 298           scaled_acc = std::min(scaled_acc, activation_max);
 299
 300           output_data[calcOffset(output_shape, batch, out_y, out_x, out_c)] = scaled_acc;
 301         }
 302       }
 303     }
 304   }
 305 }
 306
 307 } // namespace kernels
 308 } // namespace luci_interpreter