Imported Upstream version 1.12.0
[platform/core/ml/nnfw.git] / compiler / luci-interpreter / src / kernels / TransposeConv.cpp
1 /*
2  * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
3  * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at
8  *
9  *    http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17
#include "kernels/TransposeConv.h"

#include "kernels/Utils.h"

#include <tensorflow/lite/kernels/internal/reference/reference_ops.h>

#include <algorithm>
#include <cstring>
#include <limits>
#include <memory>
#include <stdexcept>
25
26 namespace luci_interpreter
27 {
28
29 namespace kernels
30 {
31
// Kernel inputs are registered in the order {output_shape, filter, input, bias};
// the base-class accessors used below (output_shape(), filter(), ...) rely on this order.
TransposeConv::TransposeConv(const Tensor *output_shape, const Tensor *filter, const Tensor *input,
                             const Tensor *bias, Tensor *output, const TransposeConvParams &params)
    : KernelWithParams<TransposeConvParams>({output_shape, filter, input, bias}, {output}, params)
{
}
37
TransposeConv::~TransposeConv()
{
  // Define destructor here, to delete vector of quantized multipliers properly
}
42
// Validates tensor ranks/types, resizes the output from the output_shape tensor's
// contents, derives SAME/VALID padding, and prepares quantization state (scratch
// accumulator tensor + rescaling multipliers) for the quantized paths.
void TransposeConv::configure()
{
  // output_shape is a 1-D tensor whose elements are the requested output dimensions.
  assert(output_shape()->shape().num_dims() == 1);
  assert(input()->shape().num_dims() == 4);
  assert(filter()->shape().num_dims() == 4);
  assert(input()->element_type() == DataType::FLOAT32 || input()->element_type() == DataType::U8 ||
         input()->element_type() == DataType::S16);
  assert(input()->element_type() == output()->element_type());
  // Input depth must match filter depth (dim 3 in both layouts).
  assert(input()->shape().dim(3) == filter()->shape().dim(3));

  // Materialize the output shape from the tensor's int32 data and resize the output.
  const int num_dims = output_shape()->shape().dim(0);
  Shape out_shape(num_dims);
  const auto *shape_data = getTensorData<int32_t>(output_shape());
  for (int i = 0; i < num_dims; i++)
    out_shape.dim(i) = shape_data[i];
  output()->resize(out_shape);

  const int32_t filter_height = filter()->shape().dim(1);
  const int32_t filter_width = filter()->shape().dim(2);
  const int32_t output_height = out_shape.dim(1);
  const int32_t output_width = out_shape.dim(2);

  // Transpose conv pads like the forward conv that maps output back to input, so the
  // size computation runs on the output side; dilation is fixed at 1. The results are
  // only needed as inputs to computePadding (hence "unused_").
  const int32_t unused_output_height =
      computeOutputSize(params().padding, output_height, filter_height, params().stride_height, 1);
  const int32_t unused_output_width =
      computeOutputSize(params().padding, output_width, filter_width, params().stride_width, 1);

  _padding_height =
      computePadding(params().stride_height, 1, output_height, filter_height, unused_output_height);
  _padding_width =
      computePadding(params().stride_width, 1, output_width, filter_width, unused_output_width);

  if (input()->element_type() == DataType::U8 || input()->element_type() == DataType::S16)
  {
    // Quantized paths accumulate into a scratch tensor shaped like the output.
    // S16 inputs accumulate in 64 bits (see evalQuantizedS16); U8 uses 32 bits.
    DataType scratch_data_type =
        input()->element_type() == DataType::S16 ? DataType::S64 : DataType::S32;
    _scratch_tensor =
        std::make_unique<Tensor>(scratch_data_type, output()->shape(), AffineQuantization{}, "");
    // One real multiplier per filter scale: a single entry for per-tensor quantization,
    // one per output channel for per-channel quantization.
    const std::vector<double> real_multipliers =
        getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale());

    _quant_multipliers = quantizeMultipliers(real_multipliers);
  }
}
87
88 void TransposeConv::execute() const
89 {
90   switch (input()->element_type())
91   {
92     case DataType::FLOAT32:
93       evalFloat();
94       break;
95     case DataType::U8:
96       if (filter()->scales().size() == 1)
97       {
98         evalQuantized();
99       }
100       else if (filter()->scales().size() > 1)
101       {
102         LUCI_INTERPRETER_CHECK(filter()->shape().num_dims() == 4);
103         LUCI_INTERPRETER_CHECK(filter()->scales().size() ==
104                                static_cast<size_t>(filter()->shape().dim(0)));
105         evalQuantizedPerChannel();
106       }
107       break;
108     case DataType::S16:
109       evalQuantizedS16();
110       break;
111     default:
112       throw std::runtime_error("Unsupported type.");
113   }
114   if (!!_scratch_tensor)
115     _scratch_tensor->deallocate();
116 }
117
118 void TransposeConv::evalFloat() const
119 {
120   tflite::ConvParams op_params{};
121   op_params.padding_type = tflite::PaddingType::kSame;
122   op_params.padding_values.height = _padding_height;
123   op_params.padding_values.width = _padding_width;
124   op_params.stride_height = params().stride_height;
125   op_params.stride_width = params().stride_width;
126   tflite::reference_ops::TransposeConv(op_params,                                                //
127                                        getTensorShape(input()), getTensorData<float>(input()),   //
128                                        getTensorShape(filter()), getTensorData<float>(filter()), //
129                                        getTensorShape(bias()), getTensorData<float>(bias()),     //
130                                        getTensorShape(output()), getTensorData<float>(output()), //
131                                        tflite::RuntimeShape(), nullptr);
132 }
133
134 void TransposeConv::evalQuantized() const
135 {
136   tflite::ConvParams op_params{};
137   op_params.padding_type = tflite::PaddingType::kSame;
138   op_params.padding_values.height = _padding_height;
139   op_params.padding_values.width = _padding_width;
140   op_params.stride_height = params().stride_height;
141   op_params.stride_width = params().stride_width;
142   // The kernel expects input and filter zero points to be negated.
143   op_params.input_offset = -input()->zero_point();    // Note the '-'.
144   op_params.weights_offset = -filter()->zero_point(); // Note the '-'.
145   op_params.output_offset = output()->zero_point();
146   op_params.output_multiplier = _quant_multipliers[0].multiplier;
147   op_params.output_shift = _quant_multipliers[0].shift;
148   op_params.quantized_activation_min = std::numeric_limits<uint8_t>::min();
149   op_params.quantized_activation_max = std::numeric_limits<uint8_t>::max();
150
151   tflite::reference_ops::TransposeConv(op_params,                                                //
152                                        getTensorShape(input()), getTensorData<uint8>(input()),   //
153                                        getTensorShape(filter()), getTensorData<uint8>(filter()), //
154                                        getTensorShape(bias()), getTensorData<int32_t>(bias()),   //
155                                        getTensorShape(output()), getTensorData<uint8>(output()), //
156                                        tflite::RuntimeShape(), nullptr,                          //
157                                        getTensorData<int32_t>(_scratch_tensor.get()));
158 }
159
// Per-channel quantized (uint8) path, hand-rolled in two passes per batch:
// (1) scatter-accumulate raw int32 products into the scratch tensor, subtracting the
//     input zero point and the per-channel filter zero points;
// (2) add bias, rescale with the per-channel multiplier/shift, add the output zero
//     point, clamp, and store.
void TransposeConv::evalQuantizedPerChannel() const
{
  const auto *input_data = getTensorData<uint8_t>(input());
  const auto *filter_data = getTensorData<uint8_t>(filter());
  const auto *bias_data = getTensorData<int32_t>(bias());
  auto *output_data = getTensorData<uint8_t>(output());
  auto *scratch_data = getTensorData<int32_t>(_scratch_tensor.get());

  const Shape &input_shape = input()->shape();
  const Shape &filter_shape = filter()->shape();
  const Shape &output_shape = output()->shape();

  const int32_t batches = input_shape.dim(0);
  const int32_t input_height = input_shape.dim(1);
  const int32_t input_width = input_shape.dim(2);
  const int32_t input_depth = input_shape.dim(3);
  const int32_t output_depth = filter_shape.dim(0);
  const int32_t filter_height = filter_shape.dim(1);
  const int32_t filter_width = filter_shape.dim(2);
  const int32_t output_height = output_shape.dim(1);
  const int32_t output_width = output_shape.dim(2);

  const int32_t stride_height = _params.stride_height;
  const int32_t stride_width = _params.stride_width;

  int32_t activation_min{};
  int32_t activation_max{};
  // Activation::NONE => clamp to the full quantized range of the output type.
  calculateActivationRangeQuantized(Activation::NONE, output(), &activation_min, &activation_max);

  // Scratch holds partial sums across the whole scatter pass; zero it first.
  std::memset(scratch_data, 0, _scratch_tensor->shape().num_elements() * sizeof(int32_t));

  // Wrapper broadcasts a single multiplier across channels if only one was prepared.
  BroadcastableWrapper<ChannelQuantMultipliers> output_multipliers(_quant_multipliers);
  for (int32_t batch = 0; batch < batches; ++batch)
  {
    // Pass 1: for each input element, scatter its contribution to every output
    // position its filter window touches (the transpose of a forward conv gather).
    for (int32_t in_y = 0; in_y < input_height; ++in_y)
    {
      for (int32_t in_x = 0; in_x < input_width; ++in_x)
      {
        for (int32_t in_c = 0; in_c < input_depth; ++in_c)
        {
          // Top-left corner of the output region this input element influences.
          const int32_t out_y_origin = in_y * stride_height - _padding_height;
          const int32_t out_x_origin = in_x * stride_width - _padding_width;
          for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y)
          {
            for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x)
            {
              const int32_t out_x = out_x_origin + filter_x;
              const int32_t out_y = out_y_origin + filter_y;
              // Skip contributions that fall outside the output (padding region).
              if ((out_y >= 0 && out_y < output_height) && (out_x >= 0 && out_x < output_width))
              {
                for (int32_t out_c = 0; out_c < output_depth; ++out_c)
                {
                  const uint8_t input_val =
                      input_data[calcOffset(input_shape, batch, in_y, in_x, in_c)];
                  const uint8_t filter_val =
                      filter_data[calcOffset(filter_shape, out_c, filter_y, filter_x, in_c)];
                  // Zero points removed before multiplying; filter zero point is per channel.
                  scratch_data[calcOffset(output_shape, batch, out_y, out_x, out_c)] +=
                      static_cast<int32_t>(input_val - input()->zero_point()) *
                      static_cast<int32_t>(filter_val - filter()->zero_points()[out_c]);
                }
              }
            }
          }
        }
      }
    }
    // Pass 2: finalize each output element — bias, per-channel rescale, offset, clamp.
    for (int32_t out_y = 0; out_y < output_height; ++out_y)
    {
      for (int32_t out_x = 0; out_x < output_width; ++out_x)
      {
        for (int32_t out_c = 0; out_c < output_depth; ++out_c)
        {
          int32_t acc = scratch_data[calcOffset(output_shape, batch, out_y, out_x, out_c)];
          if (bias_data)
          {
            acc += bias_data[out_c];
          }

          int32_t scaled_acc = tflite::MultiplyByQuantizedMultiplier(
              acc, output_multipliers[out_c].multiplier, output_multipliers[out_c].shift);

          scaled_acc += output()->zero_point();
          scaled_acc = std::max(scaled_acc, activation_min);
          scaled_acc = std::min(scaled_acc, activation_max);

          output_data[calcOffset(output_shape, batch, out_y, out_x, out_c)] = scaled_acc;
        }
      }
    }
  }
}
251
// S16 quantized path. Same two-pass scatter/finalize structure as the per-channel
// uint8 path, but: values are multiplied without zero-point adjustment (NOTE(review):
// this relies on S16 quantization being symmetric, i.e. zero points of 0 — confirm
// against the importer), accumulation is in int64 to avoid overflow, and bias is int64.
void TransposeConv::evalQuantizedS16() const
{
  const auto *input_data = getTensorData<int16_t>(input());
  const auto *filter_data = getTensorData<int16_t>(filter());
  const auto *bias_data = getTensorData<int64_t>(bias());
  auto *output_data = getTensorData<int16_t>(output());
  auto *scratch_data = getTensorData<int64_t>(_scratch_tensor.get());

  const Shape &input_shape = input()->shape();
  const Shape &filter_shape = filter()->shape();
  const Shape &output_shape = output()->shape();

  const int32_t batches = input_shape.dim(0);
  const int32_t input_height = input_shape.dim(1);
  const int32_t input_width = input_shape.dim(2);
  const int32_t input_depth = input_shape.dim(3);
  const int32_t output_depth = filter_shape.dim(0);
  const int32_t filter_height = filter_shape.dim(1);
  const int32_t filter_width = filter_shape.dim(2);
  const int32_t output_height = output_shape.dim(1);
  const int32_t output_width = output_shape.dim(2);

  const int32_t stride_height = _params.stride_height;
  const int32_t stride_width = _params.stride_width;

  int32_t activation_min{};
  int32_t activation_max{};
  // Activation::NONE => clamp to the full quantized range of the output type.
  calculateActivationRangeQuantized(Activation::NONE, output(), &activation_min, &activation_max);

  // Scratch holds int64 partial sums across the whole scatter pass; zero it first.
  std::memset(scratch_data, 0, _scratch_tensor->shape().num_elements() * sizeof(int64_t));

  // Wrapper broadcasts a single multiplier across channels if only one was prepared.
  BroadcastableWrapper<ChannelQuantMultipliers> output_multipliers(_quant_multipliers);
  for (int32_t batch = 0; batch < batches; ++batch)
  {
    // Pass 1: scatter each input element's contribution to the output positions its
    // filter window touches.
    for (int32_t in_y = 0; in_y < input_height; ++in_y)
    {
      for (int32_t in_x = 0; in_x < input_width; ++in_x)
      {
        for (int32_t in_c = 0; in_c < input_depth; ++in_c)
        {
          // Top-left corner of the output region this input element influences.
          const int32_t out_y_origin = in_y * stride_height - _padding_height;
          const int32_t out_x_origin = in_x * stride_width - _padding_width;
          for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y)
          {
            for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x)
            {
              const int32_t out_x = out_x_origin + filter_x;
              const int32_t out_y = out_y_origin + filter_y;
              // Skip contributions that fall outside the output (padding region).
              if ((out_y >= 0 && out_y < output_height) && (out_x >= 0 && out_x < output_width))
              {
                for (int32_t out_c = 0; out_c < output_depth; ++out_c)
                {
                  const int16_t input_val =
                      input_data[calcOffset(input_shape, batch, in_y, in_x, in_c)];
                  const int16_t filter_val =
                      filter_data[calcOffset(filter_shape, out_c, filter_y, filter_x, in_c)];
                  // int64 accumulation: int16*int16 products summed over the window.
                  scratch_data[calcOffset(output_shape, batch, out_y, out_x, out_c)] +=
                      static_cast<int64_t>(input_val) * static_cast<int64_t>(filter_val);
                }
              }
            }
          }
        }
      }
    }
    // Pass 2: finalize each output element — bias, per-channel rescale, clamp.
    // (No output zero point added — see the symmetric-quantization note above.)
    for (int32_t out_y = 0; out_y < output_height; ++out_y)
    {
      for (int32_t out_x = 0; out_x < output_width; ++out_x)
      {
        for (int32_t out_c = 0; out_c < output_depth; ++out_c)
        {
          int64_t acc = scratch_data[calcOffset(output_shape, batch, out_y, out_x, out_c)];
          if (bias_data)
          {
            acc += bias_data[out_c];
          }
          int32_t scaled_acc = tflite::MultiplyByQuantizedMultiplier(
              acc, output_multipliers[out_c].multiplier, output_multipliers[out_c].shift);

          scaled_acc = std::max(scaled_acc, activation_min);
          scaled_acc = std::min(scaled_acc, activation_max);

          output_data[calcOffset(output_shape, batch, out_y, out_x, out_c)] = scaled_acc;
        }
      }
    }
  }
}
340
341 } // namespace kernels
342 } // namespace luci_interpreter