Imported Upstream version 1.18.0
[platform/core/ml/nnfw.git] / compiler / luci-interpreter / src / kernels / TransposeConv.cpp
1 /*
2  * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
3  * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at
8  *
9  *    http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17
#include "kernels/TransposeConv.h"

#include "kernels/Utils.h"

#include <tensorflow/lite/kernels/internal/reference/transpose_conv.h>

#include <algorithm>
#include <cstdint>
#include <cstring>
#include <limits>
#include <stdexcept>
25
26 namespace luci_interpreter
27 {
28
29 namespace kernels
30 {
31
// Inputs are registered in the order {output_shape, filter, input, bias};
// outputs are {output, scratch_tensor}. The scratch tensor is used by the
// quantized paths as a buffer of wide accumulators (see evalQuantized*).
TransposeConv::TransposeConv(const Tensor *output_shape, const Tensor *filter, const Tensor *input,
                             const Tensor *bias, Tensor *output, Tensor *scratch_tensor,
                             const TransposeConvParams &params)
  : KernelWithParams<TransposeConvParams>({output_shape, filter, input, bias},
                                          {output, scratch_tensor}, params)
{
}
39
TransposeConv::~TransposeConv()
{
  // Define destructor here, to delete vector of quantized multipliers properly.
  // (Keeping it out-of-line ensures the element type is complete at the point
  // where the vector is destroyed.)
}
44
45 void TransposeConv::configure()
46 {
47   assert(output_shape()->shape().num_dims() == 1);
48   assert(input()->shape().num_dims() == 4);
49   assert(filter()->shape().num_dims() == 4);
50   assert(input()->element_type() == DataType::FLOAT32 || input()->element_type() == DataType::U8 ||
51          input()->element_type() == DataType::S16);
52   assert(input()->element_type() == output()->element_type());
53   assert(input()->shape().dim(3) == filter()->shape().dim(3));
54
55   const int num_dims = output_shape()->shape().dim(0);
56   Shape out_shape(num_dims);
57   const auto *shape_data = getTensorData<int32_t>(output_shape());
58   for (int i = 0; i < num_dims; i++)
59     out_shape.dim(i) = shape_data[i];
60   output()->resize(out_shape);
61
62   const int32_t filter_height = filter()->shape().dim(1);
63   const int32_t filter_width = filter()->shape().dim(2);
64   const int32_t output_height = out_shape.dim(1);
65   const int32_t output_width = out_shape.dim(2);
66
67   const int32_t unused_output_height =
68     computeOutputSize(params().padding, output_height, filter_height, params().stride_height, 1);
69   const int32_t unused_output_width =
70     computeOutputSize(params().padding, output_width, filter_width, params().stride_width, 1);
71
72   _padding_height =
73     computePadding(params().stride_height, 1, output_height, filter_height, unused_output_height);
74   _padding_width =
75     computePadding(params().stride_width, 1, output_width, filter_width, unused_output_width);
76
77   if (input()->element_type() == DataType::U8 || input()->element_type() == DataType::S16)
78   {
79     auto scratch_tensor = getOutputTensors()[1];
80     scratch_tensor->resize(output()->shape());
81     const std::vector<double> real_multipliers =
82       getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale());
83
84     _quant_multipliers = quantizeMultipliers(real_multipliers);
85   }
86   else
87   {
88     auto scratch_tensor = getOutputTensors()[1];
89     scratch_tensor->set_allocatable(false);
90   }
91 }
92
93 void TransposeConv::execute() const
94 {
95   switch (input()->element_type())
96   {
97     case DataType::FLOAT32:
98       evalFloat();
99       break;
100     case DataType::U8:
101       if (filter()->scales().size() == 1)
102       {
103         evalQuantized();
104       }
105       else if (filter()->scales().size() > 1)
106       {
107         LUCI_INTERPRETER_CHECK(filter()->shape().num_dims() == 4);
108         LUCI_INTERPRETER_CHECK(filter()->scales().size() ==
109                                static_cast<size_t>(filter()->shape().dim(0)));
110         evalQuantizedPerChannel();
111       }
112       break;
113     case DataType::S16:
114       evalQuantizedS16();
115       break;
116     default:
117       throw std::runtime_error("Unsupported type.");
118   }
119 }
120
121 void TransposeConv::evalFloat() const
122 {
123   tflite::ConvParams op_params{};
124   op_params.padding_type = tflite::PaddingType::kSame;
125   op_params.padding_values.height = _padding_height;
126   op_params.padding_values.width = _padding_width;
127   op_params.stride_height = params().stride_height;
128   op_params.stride_width = params().stride_width;
129   tflite::reference_ops::TransposeConv(op_params,                                                //
130                                        getTensorShape(input()), getTensorData<float>(input()),   //
131                                        getTensorShape(filter()), getTensorData<float>(filter()), //
132                                        getTensorShape(bias()), getTensorData<float>(bias()),     //
133                                        getTensorShape(output()), getTensorData<float>(output()), //
134                                        tflite::RuntimeShape(), nullptr);
135 }
136
137 void TransposeConv::evalQuantized() const
138 {
139   tflite::ConvParams op_params{};
140   op_params.padding_type = tflite::PaddingType::kSame;
141   op_params.padding_values.height = _padding_height;
142   op_params.padding_values.width = _padding_width;
143   op_params.stride_height = params().stride_height;
144   op_params.stride_width = params().stride_width;
145   // The kernel expects input and filter zero points to be negated.
146   op_params.input_offset = -input()->zero_point();    // Note the '-'.
147   op_params.weights_offset = -filter()->zero_point(); // Note the '-'.
148   op_params.output_offset = output()->zero_point();
149   op_params.output_multiplier = _quant_multipliers[0].multiplier;
150   op_params.output_shift = _quant_multipliers[0].shift;
151   op_params.quantized_activation_min = std::numeric_limits<uint8_t>::min();
152   op_params.quantized_activation_max = std::numeric_limits<uint8_t>::max();
153
154   auto scratch_tensor = getOutputTensors()[1];
155
156   tflite::reference_ops::TransposeConv(op_params,                                                //
157                                        getTensorShape(input()), getTensorData<uint8>(input()),   //
158                                        getTensorShape(filter()), getTensorData<uint8>(filter()), //
159                                        getTensorShape(bias()), getTensorData<int32_t>(bias()),   //
160                                        getTensorShape(output()), getTensorData<uint8>(output()), //
161                                        tflite::RuntimeShape(), nullptr,                          //
162                                        getTensorData<int32_t>(scratch_tensor));
163 }
164
// Per-output-channel quantized (uint8) transposed convolution, computed
// in-place rather than via the TFLite reference kernel. Each output channel
// has its own filter zero point and requantization multiplier/shift.
// Two passes per batch: (1) scatter-accumulate int32 products into the
// scratch buffer, (2) add bias, requantize, clamp, and write uint8 output.
void TransposeConv::evalQuantizedPerChannel() const
{
  const auto *input_data = getTensorData<uint8_t>(input());
  const auto *filter_data = getTensorData<uint8_t>(filter());
  const auto *bias_data = getTensorData<int32_t>(bias());
  auto *output_data = getTensorData<uint8_t>(output());

  // Scratch tensor (resized to the output shape in configure()) holds the
  // 32-bit accumulators.
  auto scratch_tensor = getOutputTensors()[1];
  auto *scratch_data = getTensorData<int32_t>(scratch_tensor);

  const Shape &input_shape = input()->shape();
  const Shape &filter_shape = filter()->shape();
  const Shape &output_shape = output()->shape();

  // Layouts: input NHWC; filter is (output_depth, h, w, input_depth).
  const int32_t batches = input_shape.dim(0);
  const int32_t input_height = input_shape.dim(1);
  const int32_t input_width = input_shape.dim(2);
  const int32_t input_depth = input_shape.dim(3);
  const int32_t output_depth = filter_shape.dim(0);
  const int32_t filter_height = filter_shape.dim(1);
  const int32_t filter_width = filter_shape.dim(2);
  const int32_t output_height = output_shape.dim(1);
  const int32_t output_width = output_shape.dim(2);

  const int32_t stride_height = _params.stride_height;
  const int32_t stride_width = _params.stride_width;

  // No fused activation: clamp range is just the quantized type's range.
  int32_t activation_min{};
  int32_t activation_max{};
  calculateActivationRangeQuantized(Activation::NONE, output(), &activation_min, &activation_max);

  // Accumulators must start at zero for every invocation.
  std::memset(scratch_data, 0, scratch_tensor->shape().num_elements() * sizeof(int32_t));

  // Wrapper broadcasts a single multiplier across channels if needed.
  BroadcastableWrapper<ChannelQuantMultipliers> output_multipliers(_quant_multipliers);
  for (int32_t batch = 0; batch < batches; ++batch)
  {
    // Pass 1: for each input element, scatter its contribution to every
    // output position covered by the (strided) filter window.
    for (int32_t in_y = 0; in_y < input_height; ++in_y)
    {
      for (int32_t in_x = 0; in_x < input_width; ++in_x)
      {
        for (int32_t in_c = 0; in_c < input_depth; ++in_c)
        {
          const int32_t out_y_origin = in_y * stride_height - _padding_height;
          const int32_t out_x_origin = in_x * stride_width - _padding_width;
          for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y)
          {
            for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x)
            {
              const int32_t out_x = out_x_origin + filter_x;
              const int32_t out_y = out_y_origin + filter_y;
              // Skip positions that fall outside the output (padding region).
              if ((out_y >= 0 && out_y < output_height) && (out_x >= 0 && out_x < output_width))
              {
                for (int32_t out_c = 0; out_c < output_depth; ++out_c)
                {
                  const uint8_t input_val =
                    input_data[calcOffset(input_shape, batch, in_y, in_x, in_c)];
                  const uint8_t filter_val =
                    filter_data[calcOffset(filter_shape, out_c, filter_y, filter_x, in_c)];
                  // Offset both operands by their zero points; the filter's
                  // zero point is per output channel.
                  scratch_data[calcOffset(output_shape, batch, out_y, out_x, out_c)] +=
                    static_cast<int32_t>(input_val - input()->zero_point()) *
                    static_cast<int32_t>(filter_val - filter()->zero_points()[out_c]);
                }
              }
            }
          }
        }
      }
    }
    // Pass 2: bias, per-channel requantization, clamp, store.
    for (int32_t out_y = 0; out_y < output_height; ++out_y)
    {
      for (int32_t out_x = 0; out_x < output_width; ++out_x)
      {
        for (int32_t out_c = 0; out_c < output_depth; ++out_c)
        {
          int32_t acc = scratch_data[calcOffset(output_shape, batch, out_y, out_x, out_c)];
          if (bias_data)
          {
            acc += bias_data[out_c];
          }

          // Scale the accumulator back to the output's quantized domain.
          int32_t scaled_acc = tflite::MultiplyByQuantizedMultiplier(
            acc, output_multipliers[out_c].multiplier, output_multipliers[out_c].shift);

          scaled_acc += output()->zero_point();
          scaled_acc = std::max(scaled_acc, activation_min);
          scaled_acc = std::min(scaled_acc, activation_max);

          output_data[calcOffset(output_shape, batch, out_y, out_x, out_c)] = scaled_acc;
        }
      }
    }
  }
}
258
// int16-quantized transposed convolution with int64 accumulators and int64
// bias. Input and filter values are multiplied without zero-point offsets
// (no zero points are applied anywhere in this routine — presumably S16
// tensors are symmetrically quantized with zero point 0; verify against the
// quantizer). Same two-pass scatter/requantize structure as the per-channel
// uint8 path.
void TransposeConv::evalQuantizedS16() const
{
  const auto *input_data = getTensorData<int16_t>(input());
  const auto *filter_data = getTensorData<int16_t>(filter());
  const auto *bias_data = getTensorData<int64_t>(bias());
  auto *output_data = getTensorData<int16_t>(output());

  // Scratch tensor (resized to the output shape in configure()) holds the
  // 64-bit accumulators.
  auto scratch_tensor = getOutputTensors()[1];
  auto *scratch_data = getTensorData<int64_t>(scratch_tensor);

  const Shape &input_shape = input()->shape();
  const Shape &filter_shape = filter()->shape();
  const Shape &output_shape = output()->shape();

  // Layouts: input NHWC; filter is (output_depth, h, w, input_depth).
  const int32_t batches = input_shape.dim(0);
  const int32_t input_height = input_shape.dim(1);
  const int32_t input_width = input_shape.dim(2);
  const int32_t input_depth = input_shape.dim(3);
  const int32_t output_depth = filter_shape.dim(0);
  const int32_t filter_height = filter_shape.dim(1);
  const int32_t filter_width = filter_shape.dim(2);
  const int32_t output_height = output_shape.dim(1);
  const int32_t output_width = output_shape.dim(2);

  const int32_t stride_height = _params.stride_height;
  const int32_t stride_width = _params.stride_width;

  // No fused activation: clamp range is just the quantized type's range.
  int32_t activation_min{};
  int32_t activation_max{};
  calculateActivationRangeQuantized(Activation::NONE, output(), &activation_min, &activation_max);

  // Accumulators must start at zero for every invocation.
  std::memset(scratch_data, 0, scratch_tensor->shape().num_elements() * sizeof(int64_t));

  // Wrapper broadcasts a single multiplier across channels if needed.
  BroadcastableWrapper<ChannelQuantMultipliers> output_multipliers(_quant_multipliers);
  for (int32_t batch = 0; batch < batches; ++batch)
  {
    // Pass 1: for each input element, scatter its contribution to every
    // output position covered by the (strided) filter window.
    for (int32_t in_y = 0; in_y < input_height; ++in_y)
    {
      for (int32_t in_x = 0; in_x < input_width; ++in_x)
      {
        for (int32_t in_c = 0; in_c < input_depth; ++in_c)
        {
          const int32_t out_y_origin = in_y * stride_height - _padding_height;
          const int32_t out_x_origin = in_x * stride_width - _padding_width;
          for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y)
          {
            for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x)
            {
              const int32_t out_x = out_x_origin + filter_x;
              const int32_t out_y = out_y_origin + filter_y;
              // Skip positions that fall outside the output (padding region).
              if ((out_y >= 0 && out_y < output_height) && (out_x >= 0 && out_x < output_width))
              {
                for (int32_t out_c = 0; out_c < output_depth; ++out_c)
                {
                  const int16_t input_val =
                    input_data[calcOffset(input_shape, batch, in_y, in_x, in_c)];
                  const int16_t filter_val =
                    filter_data[calcOffset(filter_shape, out_c, filter_y, filter_x, in_c)];
                  // Widen to int64 before multiplying to avoid overflow.
                  scratch_data[calcOffset(output_shape, batch, out_y, out_x, out_c)] +=
                    static_cast<int64_t>(input_val) * static_cast<int64_t>(filter_val);
                }
              }
            }
          }
        }
      }
    }
    // Pass 2: bias, requantization, clamp, store.
    for (int32_t out_y = 0; out_y < output_height; ++out_y)
    {
      for (int32_t out_x = 0; out_x < output_width; ++out_x)
      {
        for (int32_t out_c = 0; out_c < output_depth; ++out_c)
        {
          int64_t acc = scratch_data[calcOffset(output_shape, batch, out_y, out_x, out_c)];
          if (bias_data)
          {
            acc += bias_data[out_c];
          }
          // Scale the int64 accumulator down to the output's int16 domain;
          // no output zero point is added (cf. the uint8 path above).
          int32_t scaled_acc = tflite::MultiplyByQuantizedMultiplier(
            acc, output_multipliers[out_c].multiplier, output_multipliers[out_c].shift);

          scaled_acc = std::max(scaled_acc, activation_min);
          scaled_acc = std::min(scaled_acc, activation_max);

          output_data[calcOffset(output_shape, batch, out_y, out_x, out_c)] = scaled_acc;
        }
      }
    }
  }
}
349
350 } // namespace kernels
351 } // namespace luci_interpreter