src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp

   1 /*
   2  * Copyright (c) 2017-2018 ARM Limited.
   3  *
   4  * SPDX-License-Identifier: MIT
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to
   8  * deal in the Software without restriction, including without limitation the
   9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  10  * sell copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in all
  14  * copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24 #include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
  25
  26 #include "arm_compute/core/CL/ICLTensor.h"
  27 #include "arm_compute/core/Error.h"
  28 #include "arm_compute/core/Helpers.h"
  29 #include "arm_compute/core/TensorInfo.h"
  30 #include "arm_compute/core/Types.h"
  31 #include "arm_compute/core/Validate.h"
  32 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
  33 #include "arm_compute/runtime/CL/CLScheduler.h"
  34
  35 using namespace arm_compute;
  36 using namespace arm_compute::misc::shape_calculator;
  37
  38 namespace
  39 {
  40 inline bool is_interleaved_transposed(int m, int n, int k, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
  41 {
  42     bool flag = true;
  43
  44     if(gpu_target == GPUTarget::BIFROST)
  45     {
  46         // COMPMID-852
  47         if(k > 256 && m > 4 && reshape_b_only_on_first_run)
  48         {
  49             flag = ((0.72f + n * 0.10766f) < (n * 0.1284f));
  50         }
  51         else
  52         {
  53             flag = false;
  54         }
  55     }
  56
  57     return flag;
  58 }
  59 } // namespace
  60
  61 CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
  62     : _memory_group(std::move(memory_manager)), _mm_kernel(), _mtx_a_reshape_kernel(), _mtx_b_reshape_kernel(), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(), _offset_contribution_kernel(),
  63       _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _a_offset(0), _b_offset(0), _is_interleaved_transposed(true), _is_first_run(true), _reshape_b_only_on_first_run(false)
  64 {
  65 }
  66
  67 void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *output, const GEMMInfo &gemm_info)
  68 {
  69     ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
  70     ARM_COMPUTE_UNUSED(gemm_info);
  71     ARM_COMPUTE_ERROR_THROW_ON(CLGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), output->info(), gemm_info));
  72
  73     _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
  74     _a_offset                    = a->info()->quantization_info().offset;
  75     _b_offset                    = b->info()->quantization_info().offset;
  76
  77     // Get the GPU target
  78     const GPUTarget gpu_target = CLScheduler::get().target();
  79
  80     // Set the target for the kernels
  81     _mtx_a_reshape_kernel.set_target(gpu_target);
  82     _mm_kernel.set_target(gpu_target);
  83
  84     const ICLTensor *matrix_a = a;
  85     const ICLTensor *matrix_b = b;
  86
  87     // Arguments used by GEMMReshapeInfo
  88     // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo
  89     // in order to know how the matrices have been reshaped
  90     const int     m                         = a->info()->dimension(1);
  91     const int     n                         = b->info()->dimension(0);
  92     const int     k                         = a->info()->dimension(0);
  93     constexpr int mult_transpose1xW_width   = 1;
  94     constexpr int mult_interleave4x4_height = 1;
  95
  96     // Check if we need to reshape the matrix A and matrix B
  97     _is_interleaved_transposed = is_interleaved_transposed(m, n, k, _reshape_b_only_on_first_run, gpu_target);
  98
  99     if(_is_interleaved_transposed)
 100     {
 101         matrix_a = &_tmp_a;
 102         matrix_b = &_tmp_b;
 103
 104         _memory_group.manage(&_tmp_a);
 105         _memory_group.manage(&_tmp_b);
 106
 107         // Configure interleave kernel
 108         _mtx_a_reshape_kernel.configure(a, &_tmp_a, mult_interleave4x4_height);
 109
 110         // Configure transpose kernel
 111         _mtx_b_reshape_kernel.configure(b, &_tmp_b, mult_transpose1xW_width);
 112     }
 113
 114     // Configure matrix multiply kernel
 115     _mm_kernel.configure(matrix_a, matrix_b, output, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height));
 116
 117     // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
 118     if(_a_offset != 0)
 119     {
 120         TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32);
 121         _vector_sum_col.allocator()->init(info_vector_sum_col);
 122         _memory_group.manage(&_vector_sum_col);
 123
 124         // Configure Matrix B reduction kernel
 125         _mtx_b_reduction_kernel.configure(b, &_vector_sum_col);
 126     }
 127
 128     // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
 129     if(_b_offset != 0)
 130     {
 131         TensorInfo info_vector_sum_row(compute_reductionB_shape(*a->info()), 1, DataType::S32);
 132         _vector_sum_row.allocator()->init(info_vector_sum_row);
 133         _memory_group.manage(&_vector_sum_row);
 134
 135         // Configure matrix A reduction kernel
 136         _mtx_a_reduction_kernel.configure(a, &_vector_sum_row);
 137     }
 138
 139     // Configure offset contribution kernel
 140     _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a->info()->dimension(0), _a_offset, _b_offset);
 141
 142     // Allocate tensors
 143     if(_is_interleaved_transposed)
 144     {
 145         _tmp_a.allocator()->allocate();
 146         _tmp_b.allocator()->allocate();
 147     }
 148
 149     if(_a_offset != 0)
 150     {
 151         _vector_sum_col.allocator()->allocate();
 152     }
 153
 154     if(_b_offset != 0)
 155     {
 156         _vector_sum_row.allocator()->allocate();
 157     }
 158 }
 159
 160 Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output, const GEMMInfo &gemm_info)
 161 {
 162     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
 163     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
 164     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
 165     ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
 166                                     "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
 167     ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(1) != (output)->dimension(1),
 168                                     "The output matrix must have the same number of rows as the matrix A");
 169     ARM_COMPUTE_RETURN_ERROR_ON_MSG((b)->dimension(0) != (output)->dimension(0),
 170                                     "The output matrix must have the same number of columns as the matrix B");
 171     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
 172     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
 173
 174     int32_t a_offset = a->quantization_info().offset;
 175     int32_t b_offset = b->quantization_info().offset;
 176
 177     const int             m                         = a->dimension(1);
 178     const int             n                         = b->dimension(0);
 179     const int             k                         = a->dimension(0);
 180     constexpr int         mult_transpose1xW_width   = 1;
 181     constexpr int         mult_interleave4x4_height = 1;
 182     const GEMMReshapeInfo reshape_info(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height);
 183
 184     bool reshape_matrices = is_interleaved_transposed(m, n, k, gemm_info.reshape_b_only_on_first_run(), CLScheduler::get().target());
 185
 186     if(reshape_matrices)
 187     {
 188         TensorInfo info_a(compute_interleaved_shape(*a, mult_interleave4x4_height), 1, a->data_type());
 189         TensorInfo info_b(compute_transpose1xW_with_element_size_shape(*b, mult_transpose1xW_width), 1, b->data_type());
 190
 191         ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMInterleave4x4Kernel::validate(a, &info_a, mult_interleave4x4_height));
 192         ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMTranspose1xWKernel::validate(b, &info_b, mult_transpose1xW_width));
 193         ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output, reshape_matrices, reshape_info));
 194     }
 195     else
 196     {
 197         ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(a, b, output, reshape_matrices, reshape_info));
 198     }
 199
 200     TensorInfo info_vector_sum_col, info_vector_sum_row;
 201
 202     // Validate matrix B reduction kernel only if _a_offset is not equal to 0
 203     if(a_offset != 0)
 204     {
 205         info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
 206
 207         // Configure Matrix B reduction kernel
 208         ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col));
 209     }
 210
 211     // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
 212     if(b_offset != 0)
 213     {
 214         info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
 215
 216         // Configure matrix A reduction kernel
 217         ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row));
 218     }
 219
 220     // Validate offset contribution kernel
 221     ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionKernel::validate(output,
 222                                                                              a_offset == 0 ? nullptr : &info_vector_sum_col,
 223                                                                              b_offset == 0 ? nullptr : &info_vector_sum_row,
 224                                                                              a_offset, b_offset));
 225
 226     return Status{};
 227 }
 228
 229 void CLGEMMLowpMatrixMultiplyCore::run()
 230 {
 231     _memory_group.acquire();
 232
 233     if(_is_interleaved_transposed)
 234     {
 235         // Run reshape matrix A
 236         CLScheduler::get().enqueue(_mtx_a_reshape_kernel, false);
 237
 238         if(_is_first_run || !_reshape_b_only_on_first_run)
 239         {
 240             // Run reshape matrix B
 241             CLScheduler::get().enqueue(_mtx_b_reshape_kernel, false);
 242         }
 243     }
 244
 245     // Note: if _reshape_b_only_on_first_run = true, the reduction kernel can be executed only once
 246     if(_is_first_run || !_reshape_b_only_on_first_run)
 247     {
 248         // Run matrix B reduction kernel only if _a_offset is not equal to 0
 249         if(_a_offset != 0)
 250         {
 251             CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false);
 252         }
 253     }
 254
 255     // Run matrix multiply
 256     CLScheduler::get().enqueue(_mm_kernel, false);
 257
 258     // Run matrix A reduction kernel only if _b_offset is not equal to 0
 259     if(_b_offset != 0)
 260     {
 261         CLScheduler::get().enqueue(_mtx_a_reduction_kernel, false);
 262     }
 263
 264     // Run offset contribution kernel
 265     CLScheduler::get().enqueue(_offset_contribution_kernel, true);
 266
 267     _memory_group.release();
 268
 269     _is_first_run = false;
 270 }