Imported Upstream version 1.12.0
[platform/core/ml/nnfw.git] / compiler / luci-interpreter / src / kernels / TransposeConv.cpp
1 /*
2  * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
3  * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at
8  *
9  *    http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17
#include "kernels/TransposeConv.h"

#include "kernels/Utils.h"

#include <tensorflow/lite/kernels/internal/reference/reference_ops.h>

#include <algorithm>
#include <cstring>
#include <limits>
#include <memory>
#include <stdexcept>
25
26 namespace luci_interpreter
27 {
28
29 namespace kernels
30 {
31
// Kernel inputs are registered in the order {output_shape, filter, input, bias};
// the base-class accessors used below (output_shape(), filter(), ...) rely on this order.
TransposeConv::TransposeConv(const Tensor *output_shape, const Tensor *filter, const Tensor *input,
                             const Tensor *bias, Tensor *output, const TransposeConvParams &params)
    : KernelWithParams<TransposeConvParams>({output_shape, filter, input, bias}, {output}, params)
{
}
37
TransposeConv::~TransposeConv()
{
  // Define destructor here, to delete vector of quantized multipliers properly
}
42
// Validates tensor ranks/types, resizes the output from the output_shape tensor's
// contents, derives SAME/VALID padding, and prepares quantization state (scratch
// accumulator tensor + rescaling multipliers) for the quantized paths.
void TransposeConv::configure()
{
  // output_shape is a 1-D tensor whose elements are the requested output dimensions.
  assert(output_shape()->shape().num_dims() == 1);
  assert(input()->shape().num_dims() == 4);
  assert(filter()->shape().num_dims() == 4);
  assert(input()->element_type() == DataType::FLOAT32 || input()->element_type() == DataType::U8 ||
         input()->element_type() == DataType::S16);
  assert(input()->element_type() == output()->element_type());
  // Input depth must match filter depth (dim 3 in both layouts).
  assert(input()->shape().dim(3) == filter()->shape().dim(3));

  // Materialize the output shape from the tensor's int32 data and resize the output.
  const int num_dims = output_shape()->shape().dim(0);
  Shape out_shape(num_dims);
  const auto *shape_data = getTensorData<int32_t>(output_shape());
  for (int i = 0; i < num_dims; i++)
    out_shape.dim(i) = shape_data[i];
  output()->resize(out_shape);

  const int32_t filter_height = filter()->shape().dim(1);
  const int32_t filter_width = filter()->shape().dim(2);
  const int32_t output_height = out_shape.dim(1);
  const int32_t output_width = out_shape.dim(2);

  // Transpose conv pads like the forward conv that maps output back to input, so the
  // size computation runs on the output side; dilation is fixed at 1. The results are
  // only needed as inputs to computePadding (hence "unused_").
  const int32_t unused_output_height =
      computeOutputSize(params().padding, output_height, filter_height, params().stride_height, 1);
  const int32_t unused_output_width =
      computeOutputSize(params().padding, output_width, filter_width, params().stride_width, 1);

  _padding_height =
      computePadding(params().stride_height, 1, output_height, filter_height, unused_output_height);
  _padding_width =
      computePadding(params().stride_width, 1, output_width, filter_width, unused_output_width);

  if (input()->element_type() == DataType::U8 || input()->element_type() == DataType::S16)
  {
    // Quantized paths accumulate into a scratch tensor shaped like the output.
    // S16 inputs accumulate in 64 bits (see evalQuantizedS16); U8 uses 32 bits.
    DataType scratch_data_type =
        input()->element_type() == DataType::S16 ? DataType::S64 : DataType::S32;
    _scratch_tensor =
        std::make_unique<Tensor>(scratch_data_type, output()->shape(), AffineQuantization{}, "");
    // One real multiplier per filter scale: a single entry for per-tensor quantization,
    // one per output channel for per-channel quantization.
    const std::vector<double> real_multipliers =
        getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale());

    _quant_multipliers = quantizeMultipliers(real_multipliers);
  }
}
87
88 void TransposeConv::execute() const
89 {
90   switch (input()->element_type())
91   {
92     case DataType::FLOAT32:
93       evalFloat();
94       break;
95     case DataType::U8:
96       if (filter()->scales().size() == 1)
97       {
98         evalQuantized();
99       }
100       else if (filter()->scales().size() > 1)
101       {
102         LUCI_INTERPRETER_CHECK(filter()->shape().num_dims() == 4);
103         LUCI_INTERPRETER_CHECK(filter()->scales().size() ==
104                                static_cast<size_t>(filter()->shape().dim(0)));
105         evalQuantizedPerChannel();
106       }
107       break;
108     case DataType::S16:
109       evalQuantizedS16();
110       break;
111     default:
112       throw std::runtime_error("Unsupported type.");
113   }
114   if (!!_scratch_tensor)
115     _scratch_tensor->deallocate();
116 }
117
118 void TransposeConv::evalFloat() const
119 {
120   tflite::ConvParams op_params{};
121   op_params.padding_type = tflite::PaddingType::kSame;
122   op_params.padding_values.height = _padding_height;
123   op_params.padding_values.width = _padding_width;
124   op_params.stride_height = params().stride_height;
125   op_params.stride_width = params().stride_width;
126   tflite::reference_ops::TransposeConv(op_params,                                                //
127                                        getTensorShape(input()), getTensorData<float>(input()),   //
128                                        getTensorShape(filter()), getTensorData<float>(filter()), //
129                                        getTensorShape(bias()), getTensorData<float>(bias()),     //
130                                        getTensorShape(output()), getTensorData<float>(output()), //
131                                        tflite::RuntimeShape(), nullptr);
132 }
133
134 void TransposeConv::evalQuantized() const
135 {
136   tflite::ConvParams op_params{};
137   op_params.padding_type = tflite::PaddingType::kSame;
138   op_params.padding_values.height = _padding_height;
139   op_params.padding_values.width = _padding_width;
140   op_params.stride_height = params().stride_height;
141   op_params.stride_width = params().stride_width;
142   // The kernel expects input and filter zero points to be negated.
143   op_params.input_offset = -input()->zero_point();    // Note the '-'.
144   op_params.weights_offset = -filter()->zero_point(); // Note the '-'.
145   op_params.output_offset = output()->zero_point();
146   op_params.output_multiplier = _quant_multipliers[0].multiplier;
147   op_params.output_shift = _quant_multipliers[0].shift;
148   op_params.quantized_activation_min = std::numeric_limits<uint8_t>::min();
149   op_params.quantized_activation_max = std::numeric_limits<uint8_t>::max();
150
151   tflite::reference_ops::TransposeConv(op_params,                                                //
152                                        getTensorShape(input()), getTensorData<uint8>(input()),   //
153                                        getTensorShape(filter()), getTensorData<uint8>(filter()), //
154                                        getTensorShape(bias()), getTensorData<int32_t>(bias()),   //
155                                        getTensorShape(output()), getTensorData<uint8>(output()), //
156                                        tflite::RuntimeShape(), nullptr,                          //
157                                        getTensorData<int32_t>(_scratch_tensor.get()));
158 }
159
// Per-channel quantized (uint8) path, hand-rolled in two passes per batch:
// (1) scatter-accumulate raw int32 products into the scratch tensor, subtracting the
//     input zero point and the per-channel filter zero points;
// (2) add bias, rescale with the per-channel multiplier/shift, add the output zero
//     point, clamp, and store.
void TransposeConv::evalQuantizedPerChannel() const
{
  const auto *input_data = getTensorData<uint8_t>(input());
  const auto *filter_data = getTensorData<uint8_t>(filter());
  const auto *bias_data = getTensorData<int32_t>(bias());
  auto *output_data = getTensorData<uint8_t>(output());
  auto *scratch_data = getTensorData<int32_t>(_scratch_tensor.get());

  const Shape &input_shape = input()->shape();
  const Shape &filter_shape = filter()->shape();
  const Shape &output_shape = output()->shape();

  const int32_t batches = input_shape.dim(0);
  const int32_t input_height = input_shape.dim(1);
  const int32_t input_width = input_shape.dim(2);
  const int32_t input_depth = input_shape.dim(3);
  const int32_t output_depth = filter_shape.dim(0);
  const int32_t filter_height = filter_shape.dim(1);
  const int32_t filter_width = filter_shape.dim(2);
  const int32_t output_height = output_shape.dim(1);
  const int32_t output_width = output_shape.dim(2);

  const int32_t stride_height = _params.stride_height;
  const int32_t stride_width = _params.stride_width;

  int32_t activation_min{};
  int32_t activation_max{};
  // Activation::NONE => clamp to the full quantized range of the output type.
  calculateActivationRangeQuantized(Activation::NONE, output(), &activation_min, &activation_max);

  // Scratch holds partial sums across the whole scatter pass; zero it first.
  std::memset(scratch_data, 0, _scratch_tensor->shape().num_elements() * sizeof(int32_t));

  // Wrapper broadcasts a single multiplier across channels if only one was prepared.
  BroadcastableWrapper<ChannelQuantMultipliers> output_multipliers(_quant_multipliers);
  for (int32_t batch = 0; batch < batches; ++batch)
  {
    // Pass 1: for each input element, scatter its contribution to every output
    // position its filter window touches (the transpose of a forward conv gather).
    for (int32_t in_y = 0; in_y < input_height; ++in_y)
    {
      for (int32_t in_x = 0; in_x < input_width; ++in_x)
      {
        for (int32_t in_c = 0; in_c < input_depth; ++in_c)
        {
          // Top-left corner of the output region this input element influences.
          const int32_t out_y_origin = in_y * stride_height - _padding_height;
          const int32_t out_x_origin = in_x * stride_width - _padding_width;
          for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y)
          {
            for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x)
            {
              const int32_t out_x = out_x_origin + filter_x;
              const int32_t out_y = out_y_origin + filter_y;
              // Skip contributions that fall outside the output (padding region).
              if ((out_y >= 0 && out_y < output_height) && (out_x >= 0 && out_x < output_width))
              {
                for (int32_t out_c = 0; out_c < output_depth; ++out_c)
                {
                  const uint8_t input_val =
                      input_data[calcOffset(input_shape, batch, in_y, in_x, in_c)];
                  const uint8_t filter_val =
                      filter_data[calcOffset(filter_shape, out_c, filter_y, filter_x, in_c)];
                  // Zero points removed before multiplying; filter zero point is per channel.
                  scratch_data[calcOffset(output_shape, batch, out_y, out_x, out_c)] +=
                      static_cast<int32_t>(input_val - input()->zero_point()) *
                      static_cast<int32_t>(filter_val - filter()->zero_points()[out_c]);
                }
              }
            }
          }
        }
      }
    }
    // Pass 2: finalize each output element — bias, per-channel rescale, offset, clamp.
    for (int32_t out_y = 0; out_y < output_height; ++out_y)
    {
      for (int32_t out_x = 0; out_x < output_width; ++out_x)
      {
        for (int32_t out_c = 0; out_c < output_depth; ++out_c)
        {
          int32_t acc = scratch_data[calcOffset(output_shape, batch, out_y, out_x, out_c)];
          if (bias_data)
          {
            acc += bias_data[out_c];
          }

          int32_t scaled_acc = tflite::MultiplyByQuantizedMultiplier(
              acc, output_multipliers[out_c].multiplier, output_multipliers[out_c].shift);

          scaled_acc += output()->zero_point();
          scaled_acc = std::max(scaled_acc, activation_min);
          scaled_acc = std::min(scaled_acc, activation_max);

          output_data[calcOffset(output_shape, batch, out_y, out_x, out_c)] = scaled_acc;
        }
      }
    }
  }
}
251
// S16 quantized path. Same two-pass scatter/finalize structure as the per-channel
// uint8 path, but: values are multiplied without zero-point adjustment (NOTE(review):
// this relies on S16 quantization being symmetric, i.e. zero points of 0 — confirm
// against the importer), accumulation is in int64 to avoid overflow, and bias is int64.
void TransposeConv::evalQuantizedS16() const
{
  const auto *input_data = getTensorData<int16_t>(input());
  const auto *filter_data = getTensorData<int16_t>(filter());
  const auto *bias_data = getTensorData<int64_t>(bias());
  auto *output_data = getTensorData<int16_t>(output());
  auto *scratch_data = getTensorData<int64_t>(_scratch_tensor.get());

  const Shape &input_shape = input()->shape();
  const Shape &filter_shape = filter()->shape();
  const Shape &output_shape = output()->shape();

  const int32_t batches = input_shape.dim(0);
  const int32_t input_height = input_shape.dim(1);
  const int32_t input_width = input_shape.dim(2);
  const int32_t input_depth = input_shape.dim(3);
  const int32_t output_depth = filter_shape.dim(0);
  const int32_t filter_height = filter_shape.dim(1);
  const int32_t filter_width = filter_shape.dim(2);
  const int32_t output_height = output_shape.dim(1);
  const int32_t output_width = output_shape.dim(2);

  const int32_t stride_height = _params.stride_height;
  const int32_t stride_width = _params.stride_width;

  int32_t activation_min{};
  int32_t activation_max{};
  // Activation::NONE => clamp to the full quantized range of the output type.
  calculateActivationRangeQuantized(Activation::NONE, output(), &activation_min, &activation_max);

  // Scratch holds int64 partial sums across the whole scatter pass; zero it first.
  std::memset(scratch_data, 0, _scratch_tensor->shape().num_elements() * sizeof(int64_t));

  // Wrapper broadcasts a single multiplier across channels if only one was prepared.
  BroadcastableWrapper<ChannelQuantMultipliers> output_multipliers(_quant_multipliers);
  for (int32_t batch = 0; batch < batches; ++batch)
  {
    // Pass 1: scatter each input element's contribution to the output positions its
    // filter window touches.
    for (int32_t in_y = 0; in_y < input_height; ++in_y)
    {
      for (int32_t in_x = 0; in_x < input_width; ++in_x)
      {
        for (int32_t in_c = 0; in_c < input_depth; ++in_c)
        {
          // Top-left corner of the output region this input element influences.
          const int32_t out_y_origin = in_y * stride_height - _padding_height;
          const int32_t out_x_origin = in_x * stride_width - _padding_width;
          for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y)
          {
            for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x)
            {
              const int32_t out_x = out_x_origin + filter_x;
              const int32_t out_y = out_y_origin + filter_y;
              // Skip contributions that fall outside the output (padding region).
              if ((out_y >= 0 && out_y < output_height) && (out_x >= 0 && out_x < output_width))
              {
                for (int32_t out_c = 0; out_c < output_depth; ++out_c)
                {
                  const int16_t input_val =
                      input_data[calcOffset(input_shape, batch, in_y, in_x, in_c)];
                  const int16_t filter_val =
                      filter_data[calcOffset(filter_shape, out_c, filter_y, filter_x, in_c)];
                  // int64 accumulation: int16*int16 products summed over the window.
                  scratch_data[calcOffset(output_shape, batch, out_y, out_x, out_c)] +=
                      static_cast<int64_t>(input_val) * static_cast<int64_t>(filter_val);
                }
              }
            }
          }
        }
      }
    }
    // Pass 2: finalize each output element — bias, per-channel rescale, clamp.
    // (No output zero point added — see the symmetric-quantization note above.)
    for (int32_t out_y = 0; out_y < output_height; ++out_y)
    {
      for (int32_t out_x = 0; out_x < output_width; ++out_x)
      {
        for (int32_t out_c = 0; out_c < output_depth; ++out_c)
        {
          int64_t acc = scratch_data[calcOffset(output_shape, batch, out_y, out_x, out_c)];
          if (bias_data)
          {
            acc += bias_data[out_c];
          }
          int32_t scaled_acc = tflite::MultiplyByQuantizedMultiplier(
              acc, output_multipliers[out_c].multiplier, output_multipliers[out_c].shift);

          scaled_acc = std::max(scaled_acc, activation_min);
          scaled_acc = std::min(scaled_acc, activation_max);

          output_data[calcOffset(output_shape, batch, out_y, out_x, out_c)] = scaled_acc;
        }
      }
    }
  }
}
340
341 } // namespace kernels
342 } // namespace luci_interpreter