Imported Upstream version 1.18.0
[platform/core/ml/nnfw.git] / compiler / luci-interpreter / src / kernels / TransposeConv.cpp
1 /*
2  * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
3  * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at
8  *
9  *    http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17
#include "kernels/TransposeConv.h"

#include "kernels/Utils.h"

#include <tensorflow/lite/kernels/internal/reference/transpose_conv.h>

#include <algorithm>
#include <cstdint>
#include <cstring>
#include <limits>
#include <stdexcept>
25
26 namespace luci_interpreter
27 {
28
29 namespace kernels
30 {
31
// Inputs are registered in the order {output_shape, filter, input, bias};
// outputs are {output, scratch_tensor}. The scratch tensor is used by the
// quantized paths as a buffer of wide accumulators (see evalQuantized*).
TransposeConv::TransposeConv(const Tensor *output_shape, const Tensor *filter, const Tensor *input,
                             const Tensor *bias, Tensor *output, Tensor *scratch_tensor,
                             const TransposeConvParams &params)
  : KernelWithParams<TransposeConvParams>({output_shape, filter, input, bias},
                                          {output, scratch_tensor}, params)
{
}
39
TransposeConv::~TransposeConv()
{
  // Define destructor here, to delete vector of quantized multipliers properly.
  // (Keeping it out-of-line ensures the element type is complete at the point
  // where the vector is destroyed.)
}
44
45 void TransposeConv::configure()
46 {
47   assert(output_shape()->shape().num_dims() == 1);
48   assert(input()->shape().num_dims() == 4);
49   assert(filter()->shape().num_dims() == 4);
50   assert(input()->element_type() == DataType::FLOAT32 || input()->element_type() == DataType::U8 ||
51          input()->element_type() == DataType::S16);
52   assert(input()->element_type() == output()->element_type());
53   assert(input()->shape().dim(3) == filter()->shape().dim(3));
54
55   const int num_dims = output_shape()->shape().dim(0);
56   Shape out_shape(num_dims);
57   const auto *shape_data = getTensorData<int32_t>(output_shape());
58   for (int i = 0; i < num_dims; i++)
59     out_shape.dim(i) = shape_data[i];
60   output()->resize(out_shape);
61
62   const int32_t filter_height = filter()->shape().dim(1);
63   const int32_t filter_width = filter()->shape().dim(2);
64   const int32_t output_height = out_shape.dim(1);
65   const int32_t output_width = out_shape.dim(2);
66
67   const int32_t unused_output_height =
68     computeOutputSize(params().padding, output_height, filter_height, params().stride_height, 1);
69   const int32_t unused_output_width =
70     computeOutputSize(params().padding, output_width, filter_width, params().stride_width, 1);
71
72   _padding_height =
73     computePadding(params().stride_height, 1, output_height, filter_height, unused_output_height);
74   _padding_width =
75     computePadding(params().stride_width, 1, output_width, filter_width, unused_output_width);
76
77   if (input()->element_type() == DataType::U8 || input()->element_type() == DataType::S16)
78   {
79     auto scratch_tensor = getOutputTensors()[1];
80     scratch_tensor->resize(output()->shape());
81     const std::vector<double> real_multipliers =
82       getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale());
83
84     _quant_multipliers = quantizeMultipliers(real_multipliers);
85   }
86   else
87   {
88     auto scratch_tensor = getOutputTensors()[1];
89     scratch_tensor->set_allocatable(false);
90   }
91 }
92
93 void TransposeConv::execute() const
94 {
95   switch (input()->element_type())
96   {
97     case DataType::FLOAT32:
98       evalFloat();
99       break;
100     case DataType::U8:
101       if (filter()->scales().size() == 1)
102       {
103         evalQuantized();
104       }
105       else if (filter()->scales().size() > 1)
106       {
107         LUCI_INTERPRETER_CHECK(filter()->shape().num_dims() == 4);
108         LUCI_INTERPRETER_CHECK(filter()->scales().size() ==
109                                static_cast<size_t>(filter()->shape().dim(0)));
110         evalQuantizedPerChannel();
111       }
112       break;
113     case DataType::S16:
114       evalQuantizedS16();
115       break;
116     default:
117       throw std::runtime_error("Unsupported type.");
118   }
119 }
120
121 void TransposeConv::evalFloat() const
122 {
123   tflite::ConvParams op_params{};
124   op_params.padding_type = tflite::PaddingType::kSame;
125   op_params.padding_values.height = _padding_height;
126   op_params.padding_values.width = _padding_width;
127   op_params.stride_height = params().stride_height;
128   op_params.stride_width = params().stride_width;
129   tflite::reference_ops::TransposeConv(op_params,                                                //
130                                        getTensorShape(input()), getTensorData<float>(input()),   //
131                                        getTensorShape(filter()), getTensorData<float>(filter()), //
132                                        getTensorShape(bias()), getTensorData<float>(bias()),     //
133                                        getTensorShape(output()), getTensorData<float>(output()), //
134                                        tflite::RuntimeShape(), nullptr);
135 }
136
137 void TransposeConv::evalQuantized() const
138 {
139   tflite::ConvParams op_params{};
140   op_params.padding_type = tflite::PaddingType::kSame;
141   op_params.padding_values.height = _padding_height;
142   op_params.padding_values.width = _padding_width;
143   op_params.stride_height = params().stride_height;
144   op_params.stride_width = params().stride_width;
145   // The kernel expects input and filter zero points to be negated.
146   op_params.input_offset = -input()->zero_point();    // Note the '-'.
147   op_params.weights_offset = -filter()->zero_point(); // Note the '-'.
148   op_params.output_offset = output()->zero_point();
149   op_params.output_multiplier = _quant_multipliers[0].multiplier;
150   op_params.output_shift = _quant_multipliers[0].shift;
151   op_params.quantized_activation_min = std::numeric_limits<uint8_t>::min();
152   op_params.quantized_activation_max = std::numeric_limits<uint8_t>::max();
153
154   auto scratch_tensor = getOutputTensors()[1];
155
156   tflite::reference_ops::TransposeConv(op_params,                                                //
157                                        getTensorShape(input()), getTensorData<uint8>(input()),   //
158                                        getTensorShape(filter()), getTensorData<uint8>(filter()), //
159                                        getTensorShape(bias()), getTensorData<int32_t>(bias()),   //
160                                        getTensorShape(output()), getTensorData<uint8>(output()), //
161                                        tflite::RuntimeShape(), nullptr,                          //
162                                        getTensorData<int32_t>(scratch_tensor));
163 }
164
// Per-output-channel quantized (uint8) transposed convolution, computed
// in-place rather than via the TFLite reference kernel. Each output channel
// has its own filter zero point and requantization multiplier/shift.
// Two passes per batch: (1) scatter-accumulate int32 products into the
// scratch buffer, (2) add bias, requantize, clamp, and write uint8 output.
void TransposeConv::evalQuantizedPerChannel() const
{
  const auto *input_data = getTensorData<uint8_t>(input());
  const auto *filter_data = getTensorData<uint8_t>(filter());
  const auto *bias_data = getTensorData<int32_t>(bias());
  auto *output_data = getTensorData<uint8_t>(output());

  // Scratch tensor (resized to the output shape in configure()) holds the
  // 32-bit accumulators.
  auto scratch_tensor = getOutputTensors()[1];
  auto *scratch_data = getTensorData<int32_t>(scratch_tensor);

  const Shape &input_shape = input()->shape();
  const Shape &filter_shape = filter()->shape();
  const Shape &output_shape = output()->shape();

  // Layouts: input NHWC; filter is (output_depth, h, w, input_depth).
  const int32_t batches = input_shape.dim(0);
  const int32_t input_height = input_shape.dim(1);
  const int32_t input_width = input_shape.dim(2);
  const int32_t input_depth = input_shape.dim(3);
  const int32_t output_depth = filter_shape.dim(0);
  const int32_t filter_height = filter_shape.dim(1);
  const int32_t filter_width = filter_shape.dim(2);
  const int32_t output_height = output_shape.dim(1);
  const int32_t output_width = output_shape.dim(2);

  const int32_t stride_height = _params.stride_height;
  const int32_t stride_width = _params.stride_width;

  // No fused activation: clamp range is just the quantized type's range.
  int32_t activation_min{};
  int32_t activation_max{};
  calculateActivationRangeQuantized(Activation::NONE, output(), &activation_min, &activation_max);

  // Accumulators must start at zero for every invocation.
  std::memset(scratch_data, 0, scratch_tensor->shape().num_elements() * sizeof(int32_t));

  // Wrapper broadcasts a single multiplier across channels if needed.
  BroadcastableWrapper<ChannelQuantMultipliers> output_multipliers(_quant_multipliers);
  for (int32_t batch = 0; batch < batches; ++batch)
  {
    // Pass 1: for each input element, scatter its contribution to every
    // output position covered by the (strided) filter window.
    for (int32_t in_y = 0; in_y < input_height; ++in_y)
    {
      for (int32_t in_x = 0; in_x < input_width; ++in_x)
      {
        for (int32_t in_c = 0; in_c < input_depth; ++in_c)
        {
          const int32_t out_y_origin = in_y * stride_height - _padding_height;
          const int32_t out_x_origin = in_x * stride_width - _padding_width;
          for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y)
          {
            for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x)
            {
              const int32_t out_x = out_x_origin + filter_x;
              const int32_t out_y = out_y_origin + filter_y;
              // Skip positions that fall outside the output (padding region).
              if ((out_y >= 0 && out_y < output_height) && (out_x >= 0 && out_x < output_width))
              {
                for (int32_t out_c = 0; out_c < output_depth; ++out_c)
                {
                  const uint8_t input_val =
                    input_data[calcOffset(input_shape, batch, in_y, in_x, in_c)];
                  const uint8_t filter_val =
                    filter_data[calcOffset(filter_shape, out_c, filter_y, filter_x, in_c)];
                  // Offset both operands by their zero points; the filter's
                  // zero point is per output channel.
                  scratch_data[calcOffset(output_shape, batch, out_y, out_x, out_c)] +=
                    static_cast<int32_t>(input_val - input()->zero_point()) *
                    static_cast<int32_t>(filter_val - filter()->zero_points()[out_c]);
                }
              }
            }
          }
        }
      }
    }
    // Pass 2: bias, per-channel requantization, clamp, store.
    for (int32_t out_y = 0; out_y < output_height; ++out_y)
    {
      for (int32_t out_x = 0; out_x < output_width; ++out_x)
      {
        for (int32_t out_c = 0; out_c < output_depth; ++out_c)
        {
          int32_t acc = scratch_data[calcOffset(output_shape, batch, out_y, out_x, out_c)];
          if (bias_data)
          {
            acc += bias_data[out_c];
          }

          // Scale the accumulator back to the output's quantized domain.
          int32_t scaled_acc = tflite::MultiplyByQuantizedMultiplier(
            acc, output_multipliers[out_c].multiplier, output_multipliers[out_c].shift);

          scaled_acc += output()->zero_point();
          scaled_acc = std::max(scaled_acc, activation_min);
          scaled_acc = std::min(scaled_acc, activation_max);

          output_data[calcOffset(output_shape, batch, out_y, out_x, out_c)] = scaled_acc;
        }
      }
    }
  }
}
258
// int16-quantized transposed convolution with int64 accumulators and int64
// bias. Input and filter values are multiplied without zero-point offsets
// (no zero points are applied anywhere in this routine — presumably S16
// tensors are symmetrically quantized with zero point 0; verify against the
// quantizer). Same two-pass scatter/requantize structure as the per-channel
// uint8 path.
void TransposeConv::evalQuantizedS16() const
{
  const auto *input_data = getTensorData<int16_t>(input());
  const auto *filter_data = getTensorData<int16_t>(filter());
  const auto *bias_data = getTensorData<int64_t>(bias());
  auto *output_data = getTensorData<int16_t>(output());

  // Scratch tensor (resized to the output shape in configure()) holds the
  // 64-bit accumulators.
  auto scratch_tensor = getOutputTensors()[1];
  auto *scratch_data = getTensorData<int64_t>(scratch_tensor);

  const Shape &input_shape = input()->shape();
  const Shape &filter_shape = filter()->shape();
  const Shape &output_shape = output()->shape();

  // Layouts: input NHWC; filter is (output_depth, h, w, input_depth).
  const int32_t batches = input_shape.dim(0);
  const int32_t input_height = input_shape.dim(1);
  const int32_t input_width = input_shape.dim(2);
  const int32_t input_depth = input_shape.dim(3);
  const int32_t output_depth = filter_shape.dim(0);
  const int32_t filter_height = filter_shape.dim(1);
  const int32_t filter_width = filter_shape.dim(2);
  const int32_t output_height = output_shape.dim(1);
  const int32_t output_width = output_shape.dim(2);

  const int32_t stride_height = _params.stride_height;
  const int32_t stride_width = _params.stride_width;

  // No fused activation: clamp range is just the quantized type's range.
  int32_t activation_min{};
  int32_t activation_max{};
  calculateActivationRangeQuantized(Activation::NONE, output(), &activation_min, &activation_max);

  // Accumulators must start at zero for every invocation.
  std::memset(scratch_data, 0, scratch_tensor->shape().num_elements() * sizeof(int64_t));

  // Wrapper broadcasts a single multiplier across channels if needed.
  BroadcastableWrapper<ChannelQuantMultipliers> output_multipliers(_quant_multipliers);
  for (int32_t batch = 0; batch < batches; ++batch)
  {
    // Pass 1: for each input element, scatter its contribution to every
    // output position covered by the (strided) filter window.
    for (int32_t in_y = 0; in_y < input_height; ++in_y)
    {
      for (int32_t in_x = 0; in_x < input_width; ++in_x)
      {
        for (int32_t in_c = 0; in_c < input_depth; ++in_c)
        {
          const int32_t out_y_origin = in_y * stride_height - _padding_height;
          const int32_t out_x_origin = in_x * stride_width - _padding_width;
          for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y)
          {
            for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x)
            {
              const int32_t out_x = out_x_origin + filter_x;
              const int32_t out_y = out_y_origin + filter_y;
              // Skip positions that fall outside the output (padding region).
              if ((out_y >= 0 && out_y < output_height) && (out_x >= 0 && out_x < output_width))
              {
                for (int32_t out_c = 0; out_c < output_depth; ++out_c)
                {
                  const int16_t input_val =
                    input_data[calcOffset(input_shape, batch, in_y, in_x, in_c)];
                  const int16_t filter_val =
                    filter_data[calcOffset(filter_shape, out_c, filter_y, filter_x, in_c)];
                  // Widen to int64 before multiplying to avoid overflow.
                  scratch_data[calcOffset(output_shape, batch, out_y, out_x, out_c)] +=
                    static_cast<int64_t>(input_val) * static_cast<int64_t>(filter_val);
                }
              }
            }
          }
        }
      }
    }
    // Pass 2: bias, requantization, clamp, store.
    for (int32_t out_y = 0; out_y < output_height; ++out_y)
    {
      for (int32_t out_x = 0; out_x < output_width; ++out_x)
      {
        for (int32_t out_c = 0; out_c < output_depth; ++out_c)
        {
          int64_t acc = scratch_data[calcOffset(output_shape, batch, out_y, out_x, out_c)];
          if (bias_data)
          {
            acc += bias_data[out_c];
          }
          // Scale the int64 accumulator down to the output's int16 domain;
          // no output zero point is added (cf. the uint8 path above).
          int32_t scaled_acc = tflite::MultiplyByQuantizedMultiplier(
            acc, output_multipliers[out_c].multiplier, output_multipliers[out_c].shift);

          scaled_acc = std::max(scaled_acc, activation_min);
          scaled_acc = std::min(scaled_acc, activation_max);

          output_data[calcOffset(output_shape, batch, out_y, out_x, out_c)] = scaled_acc;
        }
      }
    }
  }
}
349
350 } // namespace kernels
351 } // namespace luci_interpreter