arm_compute v18.02
src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
/*
 * Copyright (c) 2017-2018 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h"

#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IAccessWindow.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "support/ToolchainSupport.h"

using namespace arm_compute;

namespace
{
/** Calculates the expected output shape of a direct convolution
 *
 * @param[in] input_shape   Input tensor shape
 * @param[in] weights_shape Weights tensor shape
 * @param[in] conv_info     Padding and stride information
 *
 * @return Expected output shape
 */
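// For instance (illustrative numbers only, not from this file): a 32x32x16
// input convolved with eight 3x3x16 kernels at stride 1 and zero padding gives
// (32 - 3) / 1 + 1 = 30 per spatial dimension, i.e. a 30x30x8 output shape.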
TensorShape get_output_shape(TensorShape input_shape, TensorShape weights_shape, PadStrideInfo conv_info)
{
    unsigned int output_width  = 0;
    unsigned int output_height = 0;
    std::tie(output_width, output_height) = scaled_dimensions(input_shape.x(), input_shape.y(), weights_shape.x(), weights_shape.y(), conv_info);

    TensorShape output_shape = input_shape;
    output_shape.set(0, output_width);
    output_shape.set(1, output_height);
    output_shape.set(2, weights_shape[3]);

    return output_shape;
}

Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
{
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) != weights->dimension(1),
                                    "Weights should have the same width and height (only square kernels are supported)");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) != 1 && weights->dimension(0) != 3 && weights->dimension(0) != 5,
                                    "Kernel sizes other than 1x1, 3x3 or 5x5 are not supported");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(2) != input->dimension(2),
                                    "Weights feature map dimension should match the respective input's one");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4,
                                    "Weights can be at most 4 dimensional");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(0) == 1) && std::get<0>(conv_info.stride()) > 3,
                                    "Strides larger than 3 not supported for 1x1 convolution.");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(0) == 3 || weights->dimension(0) == 5) && std::get<0>(conv_info.stride()) > 2,
                                    "Strides larger than 2 not supported for 3x3 and 5x5 convolutions.");

    if(biases != nullptr)
    {
        if(is_data_type_quantized_asymmetric(input->data_type()))
        {
            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
        }
        else
        {
            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
        }
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->dimension(0) != weights->dimension(3),
                                        "Biases size and number of output feature maps should match");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1,
                                        "Biases should be one dimensional");
    }

    // Checks performed when output is configured
    if(output->total_size() != 0)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
                                                           get_output_shape(input->tensor_shape(), weights->tensor_shape(), conv_info));
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
    }

    return Status{};
}

std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *output, const PadStrideInfo &conv_info, const GPUTarget target)
{
    const unsigned int kernel_size = weights->dimension(0);
    const DataType     data_type   = input->data_type();

    // Get convolved dimensions
    TensorShape output_shape = get_output_shape(input->tensor_shape(), weights->tensor_shape(), conv_info);

    // Output auto-initialization if not yet initialized
    auto_init_if_empty(*output, output_shape,
                       1,
                       input->data_type(),
                       input->fixed_point_position(),
                       input->quantization_info());

    unsigned int conv_stride_x = std::get<0>(conv_info.stride());
    unsigned int conv_stride_y = std::get<1>(conv_info.stride());
    unsigned int conv_pad_left = conv_info.pad_left();
    unsigned int conv_pad_top  = conv_info.pad_top();

    unsigned int num_elems_read_per_iteration_x    = 0;
    unsigned int num_elems_read_per_iteration_y    = 0;
    unsigned int num_elems_written_per_iteration_x = 0;
    unsigned int num_elems_written_per_iteration_y = 0;

    if((target == GPUTarget::BIFROST) && (kernel_size <= 5) && (conv_stride_x == 1) && (conv_stride_y == 1) && (data_type == DataType::F32))
    {
        // Configure kernel window

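        // Each work-item computes a tile of num_elems_written_per_iteration_x by
        // num_elems_written_per_iteration_y outputs; at stride 1, producing such
        // a tile with a KxK kernel needs a (written_x + K - 1) x (written_y + K - 1)
        // input block, which is what the per-kernel-size values below encode
        // (e.g. a 6x5 read for a 4x3 output tile when K = 3).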
        switch(kernel_size)
        {
            case 1:
            {
                num_elems_read_per_iteration_x    = 4;
                num_elems_read_per_iteration_y    = 4;
                num_elems_written_per_iteration_x = 4;
                num_elems_written_per_iteration_y = 4;
                break;
            }
            case 3:
            {
                num_elems_read_per_iteration_x    = 6;
                num_elems_read_per_iteration_y    = 5;
                num_elems_written_per_iteration_x = 4;
                num_elems_written_per_iteration_y = 3;
                break;
            }
            case 5:
            {
                num_elems_read_per_iteration_x    = 8;
                num_elems_read_per_iteration_y    = 6;
                num_elems_written_per_iteration_x = 4;
                num_elems_written_per_iteration_y = 2;
                break;
            }
            default:
            {
                ARM_COMPUTE_ERROR("Kernel size not optimized for Bifrost");
            }
        }
    }
    else
    {
        num_elems_read_per_iteration_y    = kernel_size;
        num_elems_written_per_iteration_x = 8;
        num_elems_written_per_iteration_y = 1;
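        // Each work-item writes 8 output elements along x, so it has to read at
        // least (8 - 1) * stride_x + kernel_size input elements per row; some of
        // the strided cases below sit slightly above that minimum, presumably to
        // keep the vector loads used by the CL kernels aligned.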
        switch(kernel_size)
        {
            case 1:
                switch(conv_stride_x)
                {
                    case 1:
                        num_elems_read_per_iteration_x = 8;
                        break;
                    case 2:
                        num_elems_read_per_iteration_x = 16;
                        break;
                    case 3:
                        switch(input->element_size())
                        {
                            case 1:
                                num_elems_read_per_iteration_x = 28;
                                break;
                            case 2:
                                num_elems_read_per_iteration_x = 24;
                                break;
                            case 4:
                                num_elems_read_per_iteration_x = 22;
                                break;
                            default:
                                ARM_COMPUTE_ERROR("Invalid data size");
                        }
                        break;
                    default:
                        ARM_COMPUTE_ERROR("Invalid convolution stride X");
                }
                break;
            case 3:
                switch(conv_stride_x)
                {
                    case 1:
                        num_elems_read_per_iteration_x = 10;
                        break;
                    case 2:
                        num_elems_read_per_iteration_x = 17;
                        break;
                    default:
                        ARM_COMPUTE_ERROR("Invalid convolution stride X");
                }
                break;
            case 5:
                switch(conv_stride_x)
                {
                    case 1:
                        num_elems_read_per_iteration_x = 12;
                        break;
                    case 2:
                        num_elems_read_per_iteration_x = 20;
                        break;
                    default:
                        ARM_COMPUTE_ERROR("Invalid convolution stride X");
                }
                break;
            default:
                ARM_COMPUTE_ERROR("Invalid direct convolution size");
        }
    }

    // Create window and update padding
    bool   window_changed = false;
    Window win            = calculate_max_window(*output, Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y));

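    // Describe how each iteration accesses the input, weights and output, so
    // that update_window_and_padding() can adjust the tensor paddings (and
    // report whether it had to change anything) to satisfy those accesses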
    AccessWindowRectangle input_access(input, -conv_pad_left, -conv_pad_top,
                                       num_elems_read_per_iteration_x, num_elems_read_per_iteration_y,
                                       conv_stride_x, conv_stride_y);
    AccessWindowStatic    weights_access(weights, 0, 0, kernel_size, kernel_size);
    AccessWindowRectangle output_access(output, 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y);

    window_changed = update_window_and_padding(win, input_access, weights_access, output_access);

    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));

    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
    return std::make_pair(err, win);
}
} // namespace

CLDirectConvolutionLayerKernel::CLDirectConvolutionLayerKernel()
    : _input(nullptr), _biases(nullptr), _weights(nullptr), _output(nullptr), _border_size(0), _conv_stride_x(0), _conv_stride_y(0)
{
}

BorderSize CLDirectConvolutionLayerKernel::border_size() const
{
    return _border_size;
}

void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);

    const unsigned int kernel_size = weights->info()->dimension(0);
    const DataType     data_type   = input->info()->data_type();

    // Get convolved dimensions
    TensorShape output_shape = get_output_shape(input->info()->tensor_shape(), weights->info()->tensor_shape(), conv_info);

    // Output auto-initialization if not yet initialized
    auto_init_if_empty(*output->info(),
                       output_shape,
                       1,
                       input->info()->data_type(),
                       input->info()->fixed_point_position(),
                       input->info()->quantization_info());

    // Perform validation step
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(),
                                                  weights->info(),
                                                  (biases != nullptr) ? biases->info() : nullptr,
                                                  output->info(),
                                                  conv_info));

    _conv_stride_x = std::get<0>(conv_info.stride());
    _conv_stride_y = std::get<1>(conv_info.stride());
    _border_size   = BorderSize(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left());

    _input   = input;
    _weights = weights;
    _output  = output;
    _biases  = biases;

    const GPUTarget gpu_target = get_arch_from_target(get_target());

    std::stringstream kernel_name;
    kernel_name << "direct_convolution" << kernel_size << "x" << kernel_size;

    CLBuildOptions build_options;
    build_options.add_option_if(_biases != nullptr, std::string("-DHAS_BIAS"));

    if((gpu_target == GPUTarget::BIFROST) && (kernel_size <= 5) && (_conv_stride_x == 1) && (_conv_stride_y == 1) && (data_type == DataType::F32))
    {
        build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(2))));

        kernel_name << "_f32_bifrost";
        _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name.str(), build_options.options()));

        // Through extensive experimentation with over 30 representative tensor
        // shapes, we found a small number of local work size configurations
        // that result in nearly optimal execution times. Selecting the right
        // lws for a given shape, however, required a complex decision tree,
        // until we constructed a simple feature as described below.
        //
        // We started from the number of multiply-accumulate operations for a
        // convolution layer, which is equal to the product of the input
        // dimensions 0..2 and the weights dimensions 0..2. Unfortunately,
        // this resulted in ties between distinct shapes that required distinct
        // lws configurations. Replacing the height of the input with the kernel
        // size, however, resulted in nearly optimal predictions. We use underscores
        const size_t product_of_weights_dimensions = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2);
        const size_t product_of_input_dimensions_  = input->info()->dimension(0) * weights->info()->dimension(1) * input->info()->dimension(2);
        const float  mega_ops_                     = 1e-6 * product_of_weights_dimensions * product_of_input_dimensions_;

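        // Worked example (illustrative numbers only): a 56x56x64 input with
        // 3x3x64 weights gives mega_ops_ = 1e-6 * (3 * 3 * 64) * (56 * 3 * 64)
        // ~= 6.2, which selects cl::NDRange(2, 1, 4) in the kernel_size == 3
        // branch below.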
        switch(kernel_size)
        {
            case 1:
            {
                if(mega_ops_ < 1.f)
                {
                    _lws_hint = cl::NDRange(1, 1, 8);
                }
                else if(mega_ops_ < 7.f)
                {
                    _lws_hint = cl::NDRange(1, 1, 4);
                }
                else
                {
                    _lws_hint = cl::NDRange(1, 1, 2);
                }
                break;
            }
            case 3:
            {
                if(mega_ops_ < 1.f)
                {
                    _lws_hint = cl::NDRange(1, 1, 8);
                }
                else if(mega_ops_ < 13.f)
                {
                    _lws_hint = cl::NDRange(2, 1, 4);
                }
                else if(mega_ops_ < 50.f)
                {
                    _lws_hint = cl::NDRange(3, 1, 4);
                }
                else
                {
                    _lws_hint = cl::NDRange(2, 1, 6);
                }
                break;
            }
            case 5:
            {
                if(mega_ops_ < 2.f || mega_ops_ > 80.f)
                {
                    _lws_hint = cl::NDRange(2, 1, 4);
                }
                else
                {
                    _lws_hint = cl::NDRange(2, 1, 8);
                }
                break;
            }
            default:
            {
                ARM_COMPUTE_ERROR("Kernel size not optimized for Bifrost");
            }
        }
    }
    else
    {
        bool     is_quantized_fixed_point = is_data_type_fixed_point(data_type);
        bool     is_quantized_asymm       = is_data_type_quantized_asymmetric(data_type);
        DataType promoted_type            = (is_quantized_fixed_point) ? get_promoted_data_type(data_type) : data_type;

        build_options.add_option_if(is_quantized_asymm, std::string("-DKERNEL_SIZE=" + support::cpp11::to_string(kernel_size)));
        build_options.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)));
        build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type)));
        build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(2))));
        build_options.add_option(std::string("-DSTRIDE_X=" + support::cpp11::to_string(_conv_stride_x)));
        build_options.add_option_if(is_quantized_fixed_point,
                                    std::string("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position())));
        build_options.add_option(std::string("-DDATA_TYPE_PROMOTED=" + get_cl_type_from_data_type(promoted_type)));

        // Create kernel
        _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(is_quantized_asymm ? "direct_convolution_1x1_3x3_5x5_quantized" : kernel_name.str(),
                                                                               build_options.options()));
    }

    // Configure kernel window
    auto win_config = validate_and_configure_window(input->info(), weights->info(), output->info(), conv_info, gpu_target);
    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
    ICLKernel::configure(win_config.second);

    // Set static kernel arguments
    if(is_data_type_quantized_asymmetric(data_type))
    {
        int output_multiplier = 0;
        int output_shift      = 0;

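        // Requantization: the int32 accumulator is scaled back to the output's
        // quantized domain with multiplier = (input_scale * weights_scale) /
        // output_scale, decomposed into a normalized integer multiplier and a
        // right shift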
        float multiplier = _input->info()->quantization_info().scale * _weights->info()->quantization_info().scale / _output->info()->quantization_info().scale;
        ARM_COMPUTE_THROW_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift));

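        // The static arguments follow the input/weights/output 3D tensor
        // arguments, the optional 1D bias arguments and the weights-stride
        // argument that run() sets; note that the input and weights zero-point
        // offsets are passed negated, so the kernel can apply them as additive
        // corrections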
        unsigned int idx = 3 * num_arguments_per_3D_tensor() + ((_biases != nullptr) ? num_arguments_per_1D_tensor() : 0) + 1;
        _kernel.setArg(idx++, -_input->info()->quantization_info().offset);
        _kernel.setArg(idx++, -_weights->info()->quantization_info().offset);
        _kernel.setArg(idx++, _output->info()->quantization_info().offset);
        _kernel.setArg(idx++, output_multiplier);
        _kernel.setArg(idx++, output_shift);
    }

    // Set config_id for enabling LWS tuning
    _config_id = "direct_convolution_";
    _config_id += lower_string(string_from_data_type(data_type));
    _config_id += "_";
    _config_id += support::cpp11::to_string(kernel_size);
    _config_id += "_";
    _config_id += support::cpp11::to_string(border_size().left);
    _config_id += "_";
    _config_id += support::cpp11::to_string(border_size().top);
    _config_id += "_";
    _config_id += support::cpp11::to_string(border_size().right);
    _config_id += "_";
    _config_id += support::cpp11::to_string(border_size().bottom);
    _config_id += "_";
    _config_id += support::cpp11::to_string(_conv_stride_x);
    _config_id += "_";
    _config_id += support::cpp11::to_string(_conv_stride_y);
    _config_id += "_";
    _config_id += support::cpp11::to_string(output->info()->dimension(0));
    _config_id += "_";
    _config_id += support::cpp11::to_string(output->info()->dimension(1));
}

Status CLDirectConvolutionLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
                                                const GPUTarget target)
{
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info));
    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), output->clone().get(), conv_info, target).first);

    return Status{};
}

void CLDirectConvolutionLayerKernel::run(const Window &window, cl::CommandQueue &queue)
{
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);

    // Get initial windows
    Window slice  = window.first_slice_window_3D();
    Window win_in = window;

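    // The input window is anchored at the top-left padding offset and advances
    // by stride_x/stride_y input elements for every output element produced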
    win_in.adjust(Window::DimX, -_border_size.left, true);
    win_in.adjust(Window::DimY, -_border_size.top, true);
    win_in.set_dimension_step(Window::DimX, window.x().step() * _conv_stride_x);
    win_in.set_dimension_step(Window::DimY, window.y().step() * _conv_stride_y);

    Window slice_in = win_in.first_slice_window_3D();

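    // Arguments 0..(2 * num_arguments_per_3D_tensor() - 1) belong to the input
    // and output tensors and are refreshed per slice in the loop below; the
    // weights, optional biases and weights stride are set once here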
    unsigned int idx1 = 2 * num_arguments_per_3D_tensor();
    add_3D_tensor_argument(idx1, _weights, slice);

    if(_biases != nullptr)
    {
        Window slice_biases;
        slice_biases.use_tensor_dimensions(_biases->info()->tensor_shape());
        add_1D_tensor_argument(idx1, _biases, slice_biases);
    }

    _kernel.setArg(idx1++, static_cast<unsigned int>(_weights->info()->strides_in_bytes()[3]));

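    // Enqueue one 3D slice at a time; for 4D tensors this walks over batches,
    // sliding the input window in lock-step with the output window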
    do
    {
        unsigned int idx = 0;
        add_3D_tensor_argument(idx, _input, slice_in);
        add_3D_tensor_argument(idx, _output, slice);

        enqueue(queue, *this, slice, _lws_hint);
    }
    while(window.slide_window_slice_3D(slice) && win_in.slide_window_slice_3D(slice_in));
}