arm_compute v18.05
[platform/upstream/armcl.git] / src / core / CL / kernels / CLGEMMMatrixMultiplyKernel.cpp
index 6c31e37..cc9ae27 100644 (file)
@@ -32,6 +32,7 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/FixedPoint.h"
 #include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
@@ -54,6 +55,7 @@ inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *i
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the matrix B must be <= 3");
 
     if(!is_interleaved_transposed)
     {
@@ -105,7 +107,7 @@ inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *i
 }
 
 inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output,
-                                                               bool is_interleaved_transposed, GPUTarget gpu_target,
+                                                               bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target,
                                                                ElementsProcessed &num_elements_processed)
 {
     bool   window_changed = false;
@@ -115,6 +117,9 @@ inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *inpu
     unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
     unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
 
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, is_interleaved_transposed, reshape_info)));
+
     if(is_interleaved_transposed)
     {
         // Configure kernel window
@@ -124,7 +129,9 @@ inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *inpu
         win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
 
         AccessWindowRectangle input0_access(input0, 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f);
-        AccessWindowTranspose input1_access(input1, 0, 0, num_elems_processed_per_iteration_x, 1, 0.f, 0.25f);
+        AccessWindowStatic    input1_access(input1, 0, 0,
+                                            ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x),
+                                            ceil_to_multiple(input1->dimension(1), num_elems_processed_per_iteration_y));
         AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
 
         window_changed = update_window_and_padding(win, input0_access, input1_access, output_access);
@@ -138,7 +145,8 @@ inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *inpu
         num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), 4);
 
         // Create kernels according to the architecture, data type and input size.
-        if(gpu_target == GPUTarget::BIFROST && data_type == DataType::F32)
+        GPUTarget arch_target = get_arch_from_target(gpu_target);
+        if(arch_target == GPUTarget::BIFROST && data_type == DataType::F32)
         {
             num_elems_processed_per_iteration_x = (input1->dimension(0) <= 1000 && input0->num_dimensions() == 1) ? 2 : 4;
         }
@@ -157,13 +165,19 @@ inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *inpu
         output_access.set_valid_region(win, ValidRegion(coord, output->tensor_shape()));
     }
 
+    // Collapse along the Z direction
+    // This collapse needs to be here in order to tune the Z dimension of LWS
+    Window             collapsed             = win;
+    const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
+    collapsed                                = win.collapse(win, dimension_to_collapse);
+
     Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
+    return std::make_pair(err, collapsed);
 }
 } // namespace
 
 CLGEMMMatrixMultiplyKernel::CLGEMMMatrixMultiplyKernel()
-    : _input0(nullptr), _input1(nullptr), _output(nullptr)
+    : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true)
 {
 }
 
@@ -171,45 +185,64 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
 
-    // Output tensor auto inizialitation if not yet initialized
-    TensorShape tensor_shape{ input0->info()->tensor_shape() };
-    tensor_shape.set(0, is_interleaved_transposed ? reshape_info.n() : input1->info()->dimension(0));
-    tensor_shape.set(1, is_interleaved_transposed ? reshape_info.m() : input0->info()->dimension(1));
-
-    auto_init_if_empty(*output->info(), input0->info()->clone()->set_tensor_shape(tensor_shape));
-
     // Perform validate step
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info));
 
-    _input0 = input0;
-    _input1 = input1;
-    _output = output;
+    _input0         = input0;
+    _input1         = input1;
+    _output         = output;
+    _slide_matrix_b = _input1->info()->num_dimensions() >= _input0->info()->num_dimensions();
 
     const DataType data_type = input0->info()->data_type();
     const int      fp_pos    = input0->info()->fixed_point_position();
 
     // Get target architecture
-    GPUTarget arch_target = get_arch_from_target(get_target());
+    GPUTarget gpu_target = get_target();
 
     // Configure LWS hint
-    if(arch_target == GPUTarget::BIFROST && input1->info()->dimension(1) == 24)
-    {
-        // LWS optimized for the 11x11 AlexNet convolution on Bifrost.
-        _lws_hint = cl::NDRange(2, 2);
-    }
-    else if(output->info()->dimension(1) == 196)
-    {
-        _lws_hint = cl::NDRange(1, 7);
-    }
-    else
+    switch(gpu_target)
     {
-        _lws_hint = cl::NDRange(8, 8);
+        case GPUTarget::MIDGARD:
+        case GPUTarget::T600:
+        case GPUTarget::T700:
+        case GPUTarget::T800:
+            if(output->info()->dimension(1) == 196)
+            {
+                _lws_hint = cl::NDRange(1, 7);
+            }
+            else
+            {
+                _lws_hint = cl::NDRange(8, 8);
+            }
+            break;
+        case GPUTarget::G71:
+        case GPUTarget::G72:
+        case GPUTarget::G51:
+        case GPUTarget::G51BIG:
+        case GPUTarget::G51LIT:
+        case GPUTarget::TNOX:
+            if(input1->info()->dimension(1) == 24)
+            {
+                // LWS optimized for the 11x11 AlexNet convolution on Bifrost.
+                _lws_hint = cl::NDRange(2, 2);
+            }
+            else if(output->info()->dimension(1) == 196)
+            {
+                _lws_hint = cl::NDRange(1, 7);
+            }
+            else
+            {
+                _lws_hint = cl::NDRange(8, 8);
+            }
+            break;
+        default:
+            _lws_hint = cl::NullRange;
     }
 
     ElementsProcessed num_elements_processed{};
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), is_interleaved_transposed, arch_target, num_elements_processed);
+    auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info, gpu_target, num_elements_processed);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     ICLKernel::configure(win_config.second);
 
@@ -225,6 +258,11 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
                                       "-DALPHA=" + float_to_string_with_full_precision(alpha));
     }
 
+    // Do not slide matrix B if _slide_matrix_b = false
+    build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
+
+    const bool is_bifrost = get_arch_from_target(gpu_target) == GPUTarget::BIFROST;
+
     std::string kernel_name;
     if(is_interleaved_transposed)
     {
@@ -235,9 +273,9 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
         build_opts.add_option("-DMULT_TRANSPOSE1XW_WIDTH=" + support::cpp11::to_string(mult_transpose1xW_width));
         build_opts.add_option("-DMULT_INTERLEAVE4X4_HEIGHT=" + support::cpp11::to_string(mult_interleave4x4_height));
 
-        if(data_type == DataType::F32)
+        if(is_data_type_float(data_type) && is_bifrost)
         {
-            kernel_name = "gemm_mm_interleaved_transposed_f32_" + string_from_target(arch_target);
+            kernel_name = "gemm_mm_interleaved_transposed_" + lower_string(string_from_data_type(data_type)) + "_bifrost";
         }
         else
         {
@@ -247,14 +285,24 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
     else // The input tensors have not been reshaped
     {
         build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0)));
+        build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
 
         // Create kernels according to the architecture, data type and input size.
-        if(arch_target == GPUTarget::BIFROST && data_type == DataType::F32)
+        if(is_data_type_float(data_type) && is_bifrost)
         {
-            // The first kernel is optimized for the case of 1000 or less output elements (e.g. FC8 of AlexNet and VGG-16, and
-            // FC1 of Inception v3). The second kernel is optimized for the case of greater than 1000 output elements (e.g.
-            // FC6 and FC7 of AlexNet and VGG-16).
-            kernel_name = (input1->info()->dimension(0) <= 1000 && input0->info()->num_dimensions() == 1) ? "gemm_mm_floating_point_f32_bifrost_1000" : "gemm_mm_floating_point_f32_bifrost";
+            kernel_name = "gemm_mm_floating_point";
+
+            if(input0->info()->num_dimensions() != 1)
+            {
+                kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost";
+            }
+            else if(input1->info()->dimension(0) <= 1000 && data_type == DataType::F32)
+            {
+                // The first kernel is optimized for the case of 1000 or less output elements (e.g. FC8 of AlexNet and VGG-16, and
+                // FC1 of Inception v3). The second kernel is optimized for the case of greater than 1000 output elements (e.g.
+                // FC6 and FC7 of AlexNet and VGG-16).
+                kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost_1000";
+            }
 
             // The work-group size equal to the Bifrost quad size has been proved to be optimal for these kernels
             // via exhaustive autotuning over a range of representative layer configurations.
@@ -266,7 +314,6 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
         }
         else // (MIDGARD and F32) or (F16)
         {
-            build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
             kernel_name = "gemm_mm_floating_point";
         }
         build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + support::cpp11::to_string(num_elements_processed.y()));
@@ -285,6 +332,10 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
     _config_id += "_";
     _config_id += support::cpp11::to_string(output->info()->dimension(0));
     _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(2));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(3));
+    _config_id += "_";
     _config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1)));
 }
 
@@ -299,6 +350,7 @@ Status CLGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITe
                                                               input1->clone().get(),
                                                               output->clone().get(),
                                                               is_interleaved_transposed,
+                                                              reshape_info,
                                                               gpu_target,
                                                               num_elements_processed)
                                 .first);
@@ -311,7 +363,13 @@ void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &que
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    Window slice          = window.first_slice_window_2D();
+    if(_input1->info()->num_dimensions() < 3)
+    {
+        // The stride_z for matrix B must be zero if we do not slice
+        ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
+    }
+
+    Window slice          = window.first_slice_window_3D();
     Window slice_matrix_b = slice;
 
     slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
@@ -321,8 +379,8 @@ void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &que
     {
         Window slice_b = slice;
         // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
-        // This scenario can happen when the the matrix multiplication is used to perform a convolution operation
-        if(_input1->info()->num_dimensions() < 3)
+        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+        if(!_slide_matrix_b)
         {
             slice_b = slice_matrix_b;
         }
@@ -331,7 +389,10 @@ void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &que
         add_2D_tensor_argument(idx, _input0, slice);
         add_2D_tensor_argument(idx, _input1, slice_b);
         add_2D_tensor_argument(idx, _output, slice);
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
         enqueue(queue, *this, slice, _lws_hint);
     }
-    while(window.slide_window_slice_2D(slice));
+    while(window.slide_window_slice_3D(slice));
 }