arm_compute v18.05

[platform/upstream/armcl.git] / src / core / CL / kernels / CLGEMMMatrixMultiplyKernel.cpp
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp

index 6c31e37..cc9ae27 100644 (file)
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -32,6 +32,7 @@
  #include "arm_compute/core/Error.h"
  #include "arm_compute/core/FixedPoint.h"
  #include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
  #include "arm_compute/core/Types.h"
  #include "arm_compute/core/Utils.h"
  #include "arm_compute/core/Validate.h"
@@ -54,6 +55,7 @@ inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *i
      ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
      ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
      ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the matrix B must be <= 3");
  
      if(!is_interleaved_transposed)
      {
@@ -105,7 +107,7 @@ inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *i
  }
  
  inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output,
-                                                               bool is_interleaved_transposed, GPUTarget gpu_target,
+                                                               bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target,
                                                                 ElementsProcessed &num_elements_processed)
  {
      bool   window_changed = false;
@@ -115,6 +117,9 @@ inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *inpu
      unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
      unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
  
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, is_interleaved_transposed, reshape_info)));
+
      if(is_interleaved_transposed)
      {
          // Configure kernel window
@@ -124,7 +129,9 @@ inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *inpu
          win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
  
          AccessWindowRectangle input0_access(input0, 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f);
-        AccessWindowTranspose input1_access(input1, 0, 0, num_elems_processed_per_iteration_x, 1, 0.f, 0.25f);
+        AccessWindowStatic    input1_access(input1, 0, 0,
+                                            ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x),
+                                            ceil_to_multiple(input1->dimension(1), num_elems_processed_per_iteration_y));
          AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
  
          window_changed = update_window_and_padding(win, input0_access, input1_access, output_access);
@@ -138,7 +145,8 @@ inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *inpu
          num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), 4);
  
          // Create kernels according to the architecture, data type and input size.
-        if(gpu_target == GPUTarget::BIFROST && data_type == DataType::F32)
+        GPUTarget arch_target = get_arch_from_target(gpu_target);
+        if(arch_target == GPUTarget::BIFROST && data_type == DataType::F32)
          {
              num_elems_processed_per_iteration_x = (input1->dimension(0) <= 1000 && input0->num_dimensions() == 1) ? 2 : 4;
          }
@@ -157,13 +165,19 @@ inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *inpu
          output_access.set_valid_region(win, ValidRegion(coord, output->tensor_shape()));
      }
  
+    // Collapse along the Z direction
+    // This collapse needs to be here in order to tune the Z dimension of LWS
+    Window             collapsed             = win;
+    const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
+    collapsed                                = win.collapse(win, dimension_to_collapse);
+
      Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
+    return std::make_pair(err, collapsed);
  }
  } // namespace
  
  CLGEMMMatrixMultiplyKernel::CLGEMMMatrixMultiplyKernel()
-    : _input0(nullptr), _input1(nullptr), _output(nullptr)
+    : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true)
  {
  }
  
@@ -171,45 +185,64 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
  {
      ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
  
-    // Output tensor auto inizialitation if not yet initialized
-    TensorShape tensor_shape{ input0->info()->tensor_shape() };
-    tensor_shape.set(0, is_interleaved_transposed ? reshape_info.n() : input1->info()->dimension(0));
-    tensor_shape.set(1, is_interleaved_transposed ? reshape_info.m() : input0->info()->dimension(1));
-
-    auto_init_if_empty(*output->info(), input0->info()->clone()->set_tensor_shape(tensor_shape));
-
      // Perform validate step
      ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info));
  
-    _input0 = input0;
-    _input1 = input1;
-    _output = output;
+    _input0         = input0;
+    _input1         = input1;
+    _output         = output;
+    _slide_matrix_b = _input1->info()->num_dimensions() >= _input0->info()->num_dimensions();
  
      const DataType data_type = input0->info()->data_type();
      const int      fp_pos    = input0->info()->fixed_point_position();
  
      // Get target architecture
-    GPUTarget arch_target = get_arch_from_target(get_target());
+    GPUTarget gpu_target = get_target();
  
      // Configure LWS hint
-    if(arch_target == GPUTarget::BIFROST && input1->info()->dimension(1) == 24)
-    {
-        // LWS optimized for the 11x11 AlexNet convolution on Bifrost.
-        _lws_hint = cl::NDRange(2, 2);
-    }
-    else if(output->info()->dimension(1) == 196)
-    {
-        _lws_hint = cl::NDRange(1, 7);
-    }
-    else
+    switch(gpu_target)
      {
-        _lws_hint = cl::NDRange(8, 8);
+        case GPUTarget::MIDGARD:
+        case GPUTarget::T600:
+        case GPUTarget::T700:
+        case GPUTarget::T800:
+            if(output->info()->dimension(1) == 196)
+            {
+                _lws_hint = cl::NDRange(1, 7);
+            }
+            else
+            {
+                _lws_hint = cl::NDRange(8, 8);
+            }
+            break;
+        case GPUTarget::G71:
+        case GPUTarget::G72:
+        case GPUTarget::G51:
+        case GPUTarget::G51BIG:
+        case GPUTarget::G51LIT:
+        case GPUTarget::TNOX:
+            if(input1->info()->dimension(1) == 24)
+            {
+                // LWS optimized for the 11x11 AlexNet convolution on Bifrost.
+                _lws_hint = cl::NDRange(2, 2);
+            }
+            else if(output->info()->dimension(1) == 196)
+            {
+                _lws_hint = cl::NDRange(1, 7);
+            }
+            else
+            {
+                _lws_hint = cl::NDRange(8, 8);
+            }
+            break;
+        default:
+            _lws_hint = cl::NullRange;
      }
  
      ElementsProcessed num_elements_processed{};
  
      // Configure kernel window
-    auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), is_interleaved_transposed, arch_target, num_elements_processed);
+    auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info, gpu_target, num_elements_processed);
      ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
      ICLKernel::configure(win_config.second);
  
@@ -225,6 +258,11 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
                                        "-DALPHA=" + float_to_string_with_full_precision(alpha));
      }
  
+    // Do not slide matrix B if _slide_matrix_b = false
+    build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
+
+    const bool is_bifrost = get_arch_from_target(gpu_target) == GPUTarget::BIFROST;
+
      std::string kernel_name;
      if(is_interleaved_transposed)
      {
@@ -235,9 +273,9 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
          build_opts.add_option("-DMULT_TRANSPOSE1XW_WIDTH=" + support::cpp11::to_string(mult_transpose1xW_width));
          build_opts.add_option("-DMULT_INTERLEAVE4X4_HEIGHT=" + support::cpp11::to_string(mult_interleave4x4_height));
  
-        if(data_type == DataType::F32)
+        if(is_data_type_float(data_type) && is_bifrost)
          {
-            kernel_name = "gemm_mm_interleaved_transposed_f32_" + string_from_target(arch_target);
+            kernel_name = "gemm_mm_interleaved_transposed_" + lower_string(string_from_data_type(data_type)) + "_bifrost";
          }
          else
          {
@@ -247,14 +285,24 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
      else // The input tensors have not been reshaped
      {
          build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0)));
+        build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
  
          // Create kernels according to the architecture, data type and input size.
-        if(arch_target == GPUTarget::BIFROST && data_type == DataType::F32)
+        if(is_data_type_float(data_type) && is_bifrost)
          {
-            // The first kernel is optimized for the case of 1000 or less output elements (e.g. FC8 of AlexNet and VGG-16, and
-            // FC1 of Inception v3). The second kernel is optimized for the case of greater than 1000 output elements (e.g.
-            // FC6 and FC7 of AlexNet and VGG-16).
-            kernel_name = (input1->info()->dimension(0) <= 1000 && input0->info()->num_dimensions() == 1) ? "gemm_mm_floating_point_f32_bifrost_1000" : "gemm_mm_floating_point_f32_bifrost";
+            kernel_name = "gemm_mm_floating_point";
+
+            if(input0->info()->num_dimensions() != 1)
+            {
+                kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost";
+            }
+            else if(input1->info()->dimension(0) <= 1000 && data_type == DataType::F32)
+            {
+                // The first kernel is optimized for the case of 1000 or less output elements (e.g. FC8 of AlexNet and VGG-16, and
+                // FC1 of Inception v3). The second kernel is optimized for the case of greater than 1000 output elements (e.g.
+                // FC6 and FC7 of AlexNet and VGG-16).
+                kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost_1000";
+            }
  
              // The work-group size equal to the Bifrost quad size has been proved to be optimal for these kernels
              // via exhaustive autotuning over a range of representative layer configurations.
@@ -266,7 +314,6 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
          }
          else // (MIDGARD and F32) or (F16)
          {
-            build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
              kernel_name = "gemm_mm_floating_point";
          }
          build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + support::cpp11::to_string(num_elements_processed.y()));
@@ -285,6 +332,10 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
      _config_id += "_";
      _config_id += support::cpp11::to_string(output->info()->dimension(0));
      _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(2));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(3));
+    _config_id += "_";
      _config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1)));
  }
  
@@ -299,6 +350,7 @@ Status CLGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITe
                                                                input1->clone().get(),
                                                                output->clone().get(),
                                                                is_interleaved_transposed,
+                                                              reshape_info,
                                                                gpu_target,
                                                                num_elements_processed)
                                  .first);
@@ -311,7 +363,13 @@ void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &que
      ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
      ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
  
-    Window slice          = window.first_slice_window_2D();
+    if(_input1->info()->num_dimensions() < 3)
+    {
+        // The stride_z for matrix B must be zero if we do not slice
+        ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
+    }
+
+    Window slice          = window.first_slice_window_3D();
      Window slice_matrix_b = slice;
  
      slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
@@ -321,8 +379,8 @@ void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &que
      {
          Window slice_b = slice;
          // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
-        // This scenario can happen when the the matrix multiplication is used to perform a convolution operation
-        if(_input1->info()->num_dimensions() < 3)
+        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+        if(!_slide_matrix_b)
          {
              slice_b = slice_matrix_b;
          }
@@ -331,7 +389,10 @@ void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &que
          add_2D_tensor_argument(idx, _input0, slice);
          add_2D_tensor_argument(idx, _input1, slice_b);
          add_2D_tensor_argument(idx, _output, slice);
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
          enqueue(queue, *this, slice, _lws_hint);
      }
-    while(window.slide_window_slice_2D(slice));
+    while(window.slide_window_slice_3D(slice));
  }