/*
 * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * Copyright (c) 2017-2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h"

#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "support/MemorySupport.h"

#include <algorithm>

using namespace arm_compute;
using namespace arm_compute::misc::shape_calculator;

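// A minimal usage sketch (illustrative only, not part of this translation unit). Tensor names,
// shapes and the weight scale below are assumptions; the weights must already be symmetrically
// quantized to QASYMM8_SIGNED, while input, biases and output stay in floating point:
//
//   CLTensor input, weights, biases, output;
//   input.allocator()->init(TensorInfo(TensorShape(128U, 4U), 1, DataType::F32));
//   weights.allocator()->init(
//       TensorInfo(TensorShape(128U, 64U), 1, DataType::QASYMM8_SIGNED, QuantizationInfo(0.05f)));
//   biases.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::F32));
//   output.allocator()->init(TensorInfo(TensorShape(64U, 4U), 1, DataType::F32));
//
//   CLFullyConnectedHybridLayer fc;
//   fc.configure(&input, &weights, &biases, &output, FullyConnectedLayerInfo{});
//   // ... allocate the tensors and fill input/weights/biases ...
//   fc.run();
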
namespace
{
Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output)
{
  ARM_COMPUTE_UNUSED(input);
  ARM_COMPUTE_UNUSED(weights);
  ARM_COMPUTE_UNUSED(output);
  ARM_COMPUTE_RETURN_ON_ERROR(
      CLGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, &output));

  return Status{};
}
} // namespace

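// Weight-reshape helper for the hybrid fully connected layer: a thin wrapper around a single
// CLTransposeKernel that transposes the 2D weight matrix.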
void CLFullyConnectedHybridLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output)
{
  auto k = support::cpp14::make_unique<CLTransposeKernel>();
  k->configure(input, output);
  _kernel = std::move(k);
}

Status CLFullyConnectedHybridLayerReshapeWeights::validate(const ITensorInfo *input,
                                                           const ITensorInfo *output)
{
  return CLTransposeKernel::validate(input, output);
}

CLFullyConnectedHybridLayer::CLFullyConnectedHybridLayer(
    std::shared_ptr<IMemoryManager> memory_manager)
    : _memory_group(memory_manager), _reshape_weights_kernel(), _quant_input_kernel(),
      _mm_gemmlowp(memory_manager), _multiply_scale_kernel(), _accumulate_biases_kernel(),
      _reshape_weights_output(), _quantized_input(), _scale_factor(), _gemmlowp_output(),
      _are_weights_reshaped(true), _accumulate_biases(false), _is_prepared(false),
      _original_weights(nullptr)
{
}
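
// Wires the already-quantized input and the quantized weights into the low-precision GEMM
// (CLGEMMLowpMatrixMultiplyCore). Biases are handled by a separate kernel, so no third operand
// is passed here.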
void CLFullyConnectedHybridLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights,
                                               ICLTensor *output, bool retain_internal_weights)
{
  ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));

  ARM_COMPUTE_UNUSED(output);
  ARM_COMPUTE_UNUSED(retain_internal_weights);
  // Configure gemmlowp function
  _mm_gemmlowp.configure(input, weights, nullptr, output);
}

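// Hybrid fully connected layer: the input and output stay in floating point while the weights
// are symmetrically quantized to QASYMM8_SIGNED. configure() builds the following pipeline:
//   1) optionally transpose the weights (CLFullyConnectedHybridLayerReshapeWeights),
//   2) extract a per-batch scale factor from the float input (CLScaleFactorSymm8Kernel),
//   3) quantize the input symmetrically to QASYMM8_SIGNED (CLQuantizationSymmetricKernel),
//   4) run the integer matrix multiply into an S32 accumulator (CLGEMMLowpMatrixMultiplyCore),
//   5) rescale the S32 result back to float using the input scale factors and the weights'
//      quantization scale (CLMultiplyScaleFactorKernel),
//   6) optionally accumulate the float biases (CLGEMMMatrixAccumulateBiasesKernel).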
void CLFullyConnectedHybridLayer::configure(const ICLTensor *input, const ICLTensor *weights,
                                            const ICLTensor *biases, ICLTensor *output,
                                            FullyConnectedLayerInfo fc_info)
{
  ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);

  // Perform validate step
  ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedHybridLayer::validate(
      input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
      fc_info));

  _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
  _accumulate_biases = false;
  _is_prepared = fc_info.retain_internal_weights;
  _original_weights = weights;

  // Configure accumulate biases kernel for non quantized asymmetric types
  if (biases != nullptr)
  {
    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);

    _accumulate_biases = true;

    // Configure accumulate biases kernel
    _accumulate_biases_kernel.set_target(CLScheduler::get().target());
    _accumulate_biases_kernel.configure(output, biases);
  }

  const ICLTensor *weights_to_use = weights;

  // With the Fully Connected layer we can have 4 different cases:
  //  1) Convolution layer -> Fully Connected layer without batches
  //  2) Fully Connected layer -> Fully Connected layer without batches
  //  3) Convolution layer -> Fully Connected layer with batches
  //  4) Fully Connected layer -> Fully Connected layer with batches

  // Check if we have a fully connected layer with batches
  const bool is_batched_fc_layer = output->info()->dimension(1) > 1;
  bool is_fc_after_conv = false;
  if (is_batched_fc_layer)
  {
    is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
                       (std::equal(input->info()->tensor_shape().cbegin() + 3,
                                   input->info()->tensor_shape().cend(),
                                   output->info()->tensor_shape().cbegin() + 1));
  }
  else
  {
    is_fc_after_conv = input->info()->num_dimensions() > 1 && input->info()->dimension(1) > 1;
  }
  ARM_COMPUTE_ERROR_ON_MSG(is_fc_after_conv,
                           "CLFullyConnectedHybridLayer does not support after conv");
  ARM_COMPUTE_UNUSED(is_fc_after_conv);

  // Reshape weights if needed
  if (!_are_weights_reshaped)
  {
    // Reshape the weights
    _reshape_weights_output.allocator()->init(
        weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
            compute_transposed_shape(*weights->info())));
    _reshape_weights_kernel.configure(weights_to_use, &_reshape_weights_output);
    weights_to_use = &_reshape_weights_output;
  }

  // Extract scale factor
  _scale_factor.allocator()->init(
      TensorInfo(TensorShape{output->info()->dimension(1)}, 1, input->info()->data_type()));
  _memory_group.manage(&_scale_factor);
  _scale_factor_kernel.configure(input, &_scale_factor);

  // Quantize input
  _quantized_input.allocator()->init(
      input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(
          DataType::QASYMM8_SIGNED));
  _memory_group.manage(&_quantized_input);
  _quant_input_kernel.configure(input, &_scale_factor, &_quantized_input);

  // GEMMLowp
  _gemmlowp_output.allocator()->init(
      output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
  _memory_group.manage(&_gemmlowp_output);
  configure_mm(&_quantized_input, weights_to_use, &_gemmlowp_output,
               fc_info.retain_internal_weights);
  _quantized_input.allocator()->allocate();

  // Multiply scale
  _multiply_scale_kernel.configure(&_gemmlowp_output, &_scale_factor, output,
                                   weights->info()->quantization_info().uniform().scale);
  _gemmlowp_output.allocator()->allocate();
  _scale_factor.allocator()->allocate();

  _are_weights_reshaped = _are_weights_reshaped || fc_info.retain_internal_weights;
}

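// Static validation: mirrors the configure() pipeline on TensorInfo clones, checking each
// kernel's requirements without allocating any OpenCL resources.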
Status CLFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITensorInfo *weights,
                                             const ITensorInfo *biases, const ITensorInfo *output,
                                             FullyConnectedLayerInfo fc_info)
{
  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8_SIGNED);
  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
  ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);

  bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
  bool is_fc_after_conv = true;
  const GPUTarget gpu_target = CLScheduler::get().target();

  const ITensorInfo &reshaped_weights =
      TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
          compute_transposed_shape(*weights)));

  // Validate accumulate biases kernel for non quantized asymmetric types
  if (biases != nullptr)
  {
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
    ARM_COMPUTE_RETURN_ON_ERROR(
        CLGEMMMatrixAccumulateBiasesKernel::validate(output, biases, gpu_target));
  }

  // With the Fully Connected layer we can have 4 different cases:
  //  1) Convolution layer -> Fully Connected layer without batches
  //  2) Fully Connected layer -> Fully Connected layer without batches
  //  3) Convolution layer -> Fully Connected layer with batches
  //  4) Fully Connected layer -> Fully Connected layer with batches

  const ITensorInfo *weights_to_use = weights;

  // Check if we have a fully connected layer with batches
  const bool is_batched_fc_layer = output->dimension(1) > 1;
  if (is_batched_fc_layer)
  {
    is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
                       (std::equal(input->tensor_shape().cbegin() + 3, input->tensor_shape().cend(),
                                   output->tensor_shape().cbegin() + 1));
  }
  else
  {
    is_fc_after_conv = input->num_dimensions() > 1 && input->dimension(1) > 1;
  }
  ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_fc_after_conv,
                                  "CLFullyConnectedHybridLayer does not support after conv");

  if (!weights_reshaped)
  {
    // Validate reshape weights kernel
    ARM_COMPUTE_RETURN_ON_ERROR(
        CLFullyConnectedHybridLayerReshapeWeights::validate(weights_to_use, &reshaped_weights));
    weights_to_use = &reshaped_weights;
  }

  // Validate scale factor kernel
  const ITensorInfo &scale_factor =
      TensorInfo(TensorShape{output->dimension(1)}, 1, input->data_type());
  ARM_COMPUTE_RETURN_ON_ERROR(CLScaleFactorSymm8Kernel::validate(input, &scale_factor));

  // Validate quantization symm8 kernel
  const ITensorInfo &quantized_input =
      TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_data_type(
          DataType::QASYMM8_SIGNED));
  ARM_COMPUTE_RETURN_ON_ERROR(
      CLQuantizationSymmetricKernel::validate(input, &scale_factor, &quantized_input));

  // Fully Connected layer after a Fully Connected Layer without batches
  ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1));

  // Validate matrix multiply kernel
  const ITensorInfo &gemmlowp_output = TensorInfo(
      output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
  ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(quantized_input, *weights_to_use, gemmlowp_output));

  // Validate multiply scale factor kernel
  ARM_COMPUTE_RETURN_ON_ERROR(
      CLMultiplyScaleFactorKernel::validate(&gemmlowp_output, &scale_factor, output));

  return Status{};
}

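// Executes the prepared pipeline: scale factor extraction, input quantization, integer GEMM,
// rescaling back to float and, if configured, bias accumulation.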
void CLFullyConnectedHybridLayer::run()
{
  prepare();

  MemoryGroupResourceScope scope_mg(_memory_group);

  // Extract scale_factor
  CLScheduler::get().enqueue(_scale_factor_kernel);

  // Quantize input
  CLScheduler::get().enqueue(_quant_input_kernel);

  // Run matrix multiply
  _mm_gemmlowp.run();

  // Multiply scale factor
  CLScheduler::get().enqueue(_multiply_scale_kernel);

  // Accumulate biases if provided
  if (_accumulate_biases)
  {
    CLScheduler::get().enqueue(_accumulate_biases_kernel);
  }
}

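// One-time preparation: transposes the weights if requested, lets the GEMM prepare its internal
// resources, and releases the reshaped weights buffer once it is no longer used.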
void CLFullyConnectedHybridLayer::prepare()
{
  if (!_is_prepared)
  {
    ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());

    auto release_unused = [](CLTensor *w) {
      if (!w->is_used())
      {
        CLScheduler::get().queue().finish();
        w->allocator()->free();
      }
    };

    // Reshape the weights if needed (happens only once)
    if (!_are_weights_reshaped)
    {
      // Run reshape weights kernel
      _reshape_weights_output.allocator()->allocate();
      _reshape_weights_kernel.run();

      _are_weights_reshaped = true;
      // Do not release _original_weights here; they may still be used by other nodes
    }

    // Prepare GEMM and release unused weights
    _mm_gemmlowp.prepare();

    // Release reshaped weights if unused
    release_unused(&_reshape_weights_output);

    _is_prepared = true;
  }
}