arm_compute v18.02
src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
/*
 * Copyright (c) 2017-2018 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h"

#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IAccessWindow.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "support/ToolchainSupport.h"

using namespace arm_compute;

namespace
{
/** Calculates the expected output shape of a direct convolution
 *
 * @param[in] input_shape   Input tensor shape
 * @param[in] weights_shape Weights tensor shape
 * @param[in] conv_info     Padding and stride information
 *
 * @return Expected output shape
 */
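// For instance (illustrative numbers only, not from this file): a 32x32x16
// input convolved with eight 3x3x16 kernels at stride 1 and zero padding gives
// (32 - 3) / 1 + 1 = 30 per spatial dimension, i.e. a 30x30x8 output shape.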
TensorShape get_output_shape(TensorShape input_shape, TensorShape weights_shape, PadStrideInfo conv_info)
{
    unsigned int output_width  = 0;
    unsigned int output_height = 0;
    std::tie(output_width, output_height) = scaled_dimensions(input_shape.x(), input_shape.y(), weights_shape.x(), weights_shape.y(), conv_info);

    TensorShape output_shape = input_shape;
    output_shape.set(0, output_width);
    output_shape.set(1, output_height);
    output_shape.set(2, weights_shape[3]);

    return output_shape;
}

Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
{
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) != weights->dimension(1),
                                    "Weights should have the same width and height (only square kernels are supported)");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) != 1 && weights->dimension(0) != 3 && weights->dimension(0) != 5,
                                    "Kernel sizes other than 1x1, 3x3 or 5x5 are not supported");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(2) != input->dimension(2),
                                    "Weights feature map dimension should match the respective input's one");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4,
                                    "Weights can be at most 4 dimensional");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(0) == 1) && std::get<0>(conv_info.stride()) > 3,
                                    "Strides larger than 3 not supported for 1x1 convolution.");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(0) == 3 || weights->dimension(0) == 5) && std::get<0>(conv_info.stride()) > 2,
                                    "Strides larger than 2 not supported for 3x3 and 5x5 convolutions.");

    if(biases != nullptr)
    {
        if(is_data_type_quantized_asymmetric(input->data_type()))
        {
            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
        }
        else
        {
            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
        }
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->dimension(0) != weights->dimension(3),
                                        "Biases size and number of output feature maps should match");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1,
                                        "Biases should be one dimensional");
    }

    // Checks performed when output is configured
    if(output->total_size() != 0)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
                                                           get_output_shape(input->tensor_shape(), weights->tensor_shape(), conv_info));
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
    }

    return Status{};
}

std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *output, const PadStrideInfo &conv_info, const GPUTarget target)
{
    const unsigned int kernel_size = weights->dimension(0);
    const DataType     data_type   = input->data_type();

    // Get convolved dimensions
    TensorShape output_shape = get_output_shape(input->tensor_shape(), weights->tensor_shape(), conv_info);

    // Output auto-initialization if not yet initialized
    auto_init_if_empty(*output, output_shape,
                       1,
                       input->data_type(),
                       input->fixed_point_position(),
                       input->quantization_info());

    unsigned int conv_stride_x = std::get<0>(conv_info.stride());
    unsigned int conv_stride_y = std::get<1>(conv_info.stride());
    unsigned int conv_pad_left = conv_info.pad_left();
    unsigned int conv_pad_top  = conv_info.pad_top();

    unsigned int num_elems_read_per_iteration_x    = 0;
    unsigned int num_elems_read_per_iteration_y    = 0;
    unsigned int num_elems_written_per_iteration_x = 0;
    unsigned int num_elems_written_per_iteration_y = 0;

    if((target == GPUTarget::BIFROST) && (kernel_size <= 5) && (conv_stride_x == 1) && (conv_stride_y == 1) && (data_type == DataType::F32))
    {
        // Configure kernel window

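        // Each work-item computes a tile of num_elems_written_per_iteration_x by
        // num_elems_written_per_iteration_y outputs; at stride 1, producing such
        // a tile with a KxK kernel needs a (written_x + K - 1) x (written_y + K - 1)
        // input block, which is what the per-kernel-size values below encode
        // (e.g. a 6x5 read for a 4x3 output tile when K = 3).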
        switch(kernel_size)
        {
            case 1:
            {
                num_elems_read_per_iteration_x    = 4;
                num_elems_read_per_iteration_y    = 4;
                num_elems_written_per_iteration_x = 4;
                num_elems_written_per_iteration_y = 4;
                break;
            }
            case 3:
            {
                num_elems_read_per_iteration_x    = 6;
                num_elems_read_per_iteration_y    = 5;
                num_elems_written_per_iteration_x = 4;
                num_elems_written_per_iteration_y = 3;
                break;
            }
            case 5:
            {
                num_elems_read_per_iteration_x    = 8;
                num_elems_read_per_iteration_y    = 6;
                num_elems_written_per_iteration_x = 4;
                num_elems_written_per_iteration_y = 2;
                break;
            }
            default:
            {
                ARM_COMPUTE_ERROR("Kernel size not optimized for Bifrost");
            }
        }
    }
    else
    {
        num_elems_read_per_iteration_y    = kernel_size;
        num_elems_written_per_iteration_x = 8;
        num_elems_written_per_iteration_y = 1;
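        // Each work-item writes 8 output elements along x, so it has to read at
        // least (8 - 1) * stride_x + kernel_size input elements per row; some of
        // the strided cases below sit slightly above that minimum, presumably to
        // keep the vector loads used by the CL kernels aligned.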
        switch(kernel_size)
        {
            case 1:
                switch(conv_stride_x)
                {
                    case 1:
                        num_elems_read_per_iteration_x = 8;
                        break;
                    case 2:
                        num_elems_read_per_iteration_x = 16;
                        break;
                    case 3:
                        switch(input->element_size())
                        {
                            case 1:
                                num_elems_read_per_iteration_x = 28;
                                break;
                            case 2:
                                num_elems_read_per_iteration_x = 24;
                                break;
                            case 4:
                                num_elems_read_per_iteration_x = 22;
                                break;
                            default:
                                ARM_COMPUTE_ERROR("Invalid data size");
                        }
                        break;
                    default:
                        ARM_COMPUTE_ERROR("Invalid convolution stride X");
                }
                break;
            case 3:
                switch(conv_stride_x)
                {
                    case 1:
                        num_elems_read_per_iteration_x = 10;
                        break;
                    case 2:
                        num_elems_read_per_iteration_x = 17;
                        break;
                    default:
                        ARM_COMPUTE_ERROR("Invalid convolution stride X");
                }
                break;
            case 5:
                switch(conv_stride_x)
                {
                    case 1:
                        num_elems_read_per_iteration_x = 12;
                        break;
                    case 2:
                        num_elems_read_per_iteration_x = 20;
                        break;
                    default:
                        ARM_COMPUTE_ERROR("Invalid convolution stride X");
                }
                break;
            default:
                ARM_COMPUTE_ERROR("Invalid direct convolution size");
        }
    }

    // Create window and update padding
    bool   window_changed = false;
    Window win            = calculate_max_window(*output, Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y));

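    // Describe how each iteration accesses the input, weights and output, so
    // that update_window_and_padding() can adjust the tensor paddings (and
    // report whether it had to change anything) to satisfy those accesses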
    AccessWindowRectangle input_access(input, -conv_pad_left, -conv_pad_top,
                                       num_elems_read_per_iteration_x, num_elems_read_per_iteration_y,
                                       conv_stride_x, conv_stride_y);
    AccessWindowStatic    weights_access(weights, 0, 0, kernel_size, kernel_size);
    AccessWindowRectangle output_access(output, 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y);

    window_changed = update_window_and_padding(win, input_access, weights_access, output_access);

    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));

    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
    return std::make_pair(err, win);
}
} // namespace

CLDirectConvolutionLayerKernel::CLDirectConvolutionLayerKernel()
    : _input(nullptr), _biases(nullptr), _weights(nullptr), _output(nullptr), _border_size(0), _conv_stride_x(0), _conv_stride_y(0)
{
}

BorderSize CLDirectConvolutionLayerKernel::border_size() const
{
    return _border_size;
}

void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);

    const unsigned int kernel_size = weights->info()->dimension(0);
    const DataType     data_type   = input->info()->data_type();

    // Get convolved dimensions
    TensorShape output_shape = get_output_shape(input->info()->tensor_shape(), weights->info()->tensor_shape(), conv_info);

    // Output auto-initialization if not yet initialized
    auto_init_if_empty(*output->info(),
                       output_shape,
                       1,
                       input->info()->data_type(),
                       input->info()->fixed_point_position(),
                       input->info()->quantization_info());

    // Perform validation step
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(),
                                                  weights->info(),
                                                  (biases != nullptr) ? biases->info() : nullptr,
                                                  output->info(),
                                                  conv_info));

    _conv_stride_x = std::get<0>(conv_info.stride());
    _conv_stride_y = std::get<1>(conv_info.stride());
    _border_size   = BorderSize(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left());

    _input   = input;
    _weights = weights;
    _output  = output;
    _biases  = biases;

    const GPUTarget gpu_target = get_arch_from_target(get_target());

    std::stringstream kernel_name;
    kernel_name << "direct_convolution" << kernel_size << "x" << kernel_size;

    CLBuildOptions build_options;
    build_options.add_option_if(_biases != nullptr, std::string("-DHAS_BIAS"));

    if((gpu_target == GPUTarget::BIFROST) && (kernel_size <= 5) && (_conv_stride_x == 1) && (_conv_stride_y == 1) && (data_type == DataType::F32))
    {
        build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(2))));

        kernel_name << "_f32_bifrost";
        _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name.str(), build_options.options()));

        // Through extensive experimentation with over 30 representative tensor
        // shapes, we found a small number of local work size configurations
        // that result in nearly optimal execution times. Selecting the right
        // lws for a given shape, however, required a complex decision tree,
        // until we constructed a simple feature as described below.
        //
        // We started from the number of multiply-accumulate operations for a
        // convolution layer, which is equal to the product of the input
        // dimensions 0..2 and the weights dimensions 0..2. Unfortunately,
        // this resulted in ties between distinct shapes that required distinct
        // lws configurations. Replacing the height of the input with the kernel
        // size, however, resulted in nearly optimal predictions. We use underscores
        const size_t product_of_weights_dimensions = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2);
        const size_t product_of_input_dimensions_  = input->info()->dimension(0) * weights->info()->dimension(1) * input->info()->dimension(2);
        const float  mega_ops_                     = 1e-6 * product_of_weights_dimensions * product_of_input_dimensions_;

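        // Worked example (illustrative numbers only): a 56x56x64 input with
        // 3x3x64 weights gives mega_ops_ = 1e-6 * (3 * 3 * 64) * (56 * 3 * 64)
        // ~= 6.2, which selects cl::NDRange(2, 1, 4) in the kernel_size == 3
        // branch below.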
        switch(kernel_size)
        {
            case 1:
            {
                if(mega_ops_ < 1.f)
                {
                    _lws_hint = cl::NDRange(1, 1, 8);
                }
                else if(mega_ops_ < 7.f)
                {
                    _lws_hint = cl::NDRange(1, 1, 4);
                }
                else
                {
                    _lws_hint = cl::NDRange(1, 1, 2);
                }
                break;
            }
            case 3:
            {
                if(mega_ops_ < 1.f)
                {
                    _lws_hint = cl::NDRange(1, 1, 8);
                }
                else if(mega_ops_ < 13.f)
                {
                    _lws_hint = cl::NDRange(2, 1, 4);
                }
                else if(mega_ops_ < 50.f)
                {
                    _lws_hint = cl::NDRange(3, 1, 4);
                }
                else
                {
                    _lws_hint = cl::NDRange(2, 1, 6);
                }
                break;
            }
            case 5:
            {
                if(mega_ops_ < 2.f || mega_ops_ > 80.f)
                {
                    _lws_hint = cl::NDRange(2, 1, 4);
                }
                else
                {
                    _lws_hint = cl::NDRange(2, 1, 8);
                }
                break;
            }
            default:
            {
                ARM_COMPUTE_ERROR("Kernel size not optimized for Bifrost");
            }
        }
    }
    else
    {
        bool     is_quantized_fixed_point = is_data_type_fixed_point(data_type);
        bool     is_quantized_asymm       = is_data_type_quantized_asymmetric(data_type);
        DataType promoted_type            = (is_quantized_fixed_point) ? get_promoted_data_type(data_type) : data_type;

        build_options.add_option_if(is_quantized_asymm, std::string("-DKERNEL_SIZE=" + support::cpp11::to_string(kernel_size)));
        build_options.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)));
        build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type)));
        build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(2))));
        build_options.add_option(std::string("-DSTRIDE_X=" + support::cpp11::to_string(_conv_stride_x)));
        build_options.add_option_if(is_quantized_fixed_point,
                                    std::string("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position())));
        build_options.add_option(std::string("-DDATA_TYPE_PROMOTED=" + get_cl_type_from_data_type(promoted_type)));

        // Create kernel
        _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(is_quantized_asymm ? "direct_convolution_1x1_3x3_5x5_quantized" : kernel_name.str(),
                                                                               build_options.options()));
    }

    // Configure kernel window
    auto win_config = validate_and_configure_window(input->info(), weights->info(), output->info(), conv_info, gpu_target);
    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
    ICLKernel::configure(win_config.second);

    // Set static kernel arguments
    if(is_data_type_quantized_asymmetric(data_type))
    {
        int output_multiplier = 0;
        int output_shift      = 0;

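        // Requantization: the int32 accumulator is scaled back to the output's
        // quantized domain with multiplier = (input_scale * weights_scale) /
        // output_scale, decomposed into a normalized integer multiplier and a
        // right shift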
        float multiplier = _input->info()->quantization_info().scale * _weights->info()->quantization_info().scale / _output->info()->quantization_info().scale;
        ARM_COMPUTE_THROW_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift));

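        // The static arguments follow the input/weights/output 3D tensor
        // arguments, the optional 1D bias arguments and the weights-stride
        // argument that run() sets; note that the input and weights zero-point
        // offsets are passed negated, so the kernel can apply them as additive
        // corrections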
        unsigned int idx = 3 * num_arguments_per_3D_tensor() + ((_biases != nullptr) ? num_arguments_per_1D_tensor() : 0) + 1;
        _kernel.setArg(idx++, -_input->info()->quantization_info().offset);
        _kernel.setArg(idx++, -_weights->info()->quantization_info().offset);
        _kernel.setArg(idx++, _output->info()->quantization_info().offset);
        _kernel.setArg(idx++, output_multiplier);
        _kernel.setArg(idx++, output_shift);
    }

    // Set config_id for enabling LWS tuning
    _config_id = "direct_convolution_";
    _config_id += lower_string(string_from_data_type(data_type));
    _config_id += "_";
    _config_id += support::cpp11::to_string(kernel_size);
    _config_id += "_";
    _config_id += support::cpp11::to_string(border_size().left);
    _config_id += "_";
    _config_id += support::cpp11::to_string(border_size().top);
    _config_id += "_";
    _config_id += support::cpp11::to_string(border_size().right);
    _config_id += "_";
    _config_id += support::cpp11::to_string(border_size().bottom);
    _config_id += "_";
    _config_id += support::cpp11::to_string(_conv_stride_x);
    _config_id += "_";
    _config_id += support::cpp11::to_string(_conv_stride_y);
    _config_id += "_";
    _config_id += support::cpp11::to_string(output->info()->dimension(0));
    _config_id += "_";
    _config_id += support::cpp11::to_string(output->info()->dimension(1));
}

Status CLDirectConvolutionLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
                                                const GPUTarget target)
{
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info));
    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), output->clone().get(), conv_info, target).first);

    return Status{};
}

void CLDirectConvolutionLayerKernel::run(const Window &window, cl::CommandQueue &queue)
{
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);

    // Get initial windows
    Window slice  = window.first_slice_window_3D();
    Window win_in = window;

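    // The input window is anchored at the top-left padding offset and advances
    // by stride_x/stride_y input elements for every output element produced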
    win_in.adjust(Window::DimX, -_border_size.left, true);
    win_in.adjust(Window::DimY, -_border_size.top, true);
    win_in.set_dimension_step(Window::DimX, window.x().step() * _conv_stride_x);
    win_in.set_dimension_step(Window::DimY, window.y().step() * _conv_stride_y);

    Window slice_in = win_in.first_slice_window_3D();

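    // Arguments 0..(2 * num_arguments_per_3D_tensor() - 1) belong to the input
    // and output tensors and are refreshed per slice in the loop below; the
    // weights, optional biases and weights stride are set once here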
    unsigned int idx1 = 2 * num_arguments_per_3D_tensor();
    add_3D_tensor_argument(idx1, _weights, slice);

    if(_biases != nullptr)
    {
        Window slice_biases;
        slice_biases.use_tensor_dimensions(_biases->info()->tensor_shape());
        add_1D_tensor_argument(idx1, _biases, slice_biases);
    }

    _kernel.setArg(idx1++, static_cast<unsigned int>(_weights->info()->strides_in_bytes()[3]));

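    // Enqueue one 3D slice at a time; for 4D tensors this walks over batches,
    // sliding the input window in lock-step with the output window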
    do
    {
        unsigned int idx = 0;
        add_3D_tensor_argument(idx, _input, slice_in);
        add_3D_tensor_argument(idx, _output, slice);

        enqueue(queue, *this, slice, _lws_hint);
    }
    while(window.slide_window_slice_3D(slice) && win_in.slide_window_slice_3D(slice_in));
}