src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp

   1 /*
   2  * Copyright (c) 2016, 2017 ARM Limited.
   3  *
   4  * SPDX-License-Identifier: MIT
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to
   8  * deal in the Software without restriction, including without limitation the
   9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  10  * sell copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in all
  14  * copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24 #include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
  25
  26 #include "arm_compute/core/AccessWindowTranspose.h"
  27 #include "arm_compute/core/Coordinates.h"
  28 #include "arm_compute/core/Error.h"
  29 #include "arm_compute/core/Helpers.h"
  30 #include "arm_compute/core/ITensor.h"
  31 #include "arm_compute/core/NEON/INEKernel.h"
  32 #include "arm_compute/core/TensorInfo.h"
  33 #include "arm_compute/core/Types.h"
  34 #include "arm_compute/core/Validate.h"
  35 #include "arm_compute/core/Window.h"
  36
  37 #include <arm_neon.h>
  38 #include <cstddef>
  39 #include <cstring>
  40
  41 using namespace arm_compute;
  42
  43 void NEGEMMTranspose1xWKernel::configure(const ITensor *input, ITensor *output)
  44 {
  45     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::F16, DataType::F32);
  46     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::F16, DataType::F32);
  47     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
  48     ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) != std::ceil(input->info()->dimension(0) / 8.0f)) && (input->info()->data_type() == DataType::F16));
  49     ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) != std::ceil(input->info()->dimension(0) / 4.0f)) && (input->info()->data_type() == DataType::F32));
  50     ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) != std::ceil(input->info()->dimension(0) / 4.0f)) && (input->info()->data_type() == DataType::U32));
  51
  52     unsigned int num_elems_processed_per_iteration = 0;
  53     switch(input->info()->data_type())
  54     {
  55         case DataType::F32:
  56         case DataType::U8:
  57             num_elems_processed_per_iteration = 4;
  58             break;
  59         case DataType::F16:
  60 #ifdef ARM_COMPUTE_ENABLE_FP16
  61             num_elems_processed_per_iteration = 8;
  62             break;
  63 #endif
  64         default:
  65             ARM_COMPUTE_ERROR("Data type not supported");
  66             break;
  67     }
  68
  69     _input  = input;
  70     _output = output;
  71
  72     // Configure kernel window
  73     Window                win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
  74     AccessWindowTranspose output_access(output->info(), 0, 0, num_elems_processed_per_iteration, 1);
  75
  76     update_window_and_padding(win,
  77                               AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
  78                               output_access);
  79
  80     output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
  81
  82     INEKernel::configure(win);
  83 }
  84
  85 void NEGEMMTranspose1xWKernel::run(const Window &window)
  86 {
  87     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
  88     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
  89
  90     /*
  91      * Following an example of how the transposition1xW works when the input data type is F32
  92      *
  93      *         |a00 a01 a02 a03|
  94      *         |a10 a11 a12 a13|
  95      *         |a20 a21 a22 a23| = | a00 a01 a02 a03 || a10 a11 a12 a13 || a20 a21 a22 a23 || a30 a31 a32 a33 |
  96      *         |a30 a31 a32 a33|
  97      *
  98      * If the input data type is F32, the output matrix will have the following shape: [ height * 4, width / 4 ]
  99      * If the input data type is F16, the output matrix will have the following shape: [ height * 8, width / 8 ]
 100      */
 101
 102     /* Set window for output tensor. Set to 0 the X and Y dimensions in order to allow multi-threading implementation and future batched matrix multiplications. */
 103     Window win_out(window);
 104     win_out.set(Window::DimX, Window::Dimension(0, 0, 0));
 105     win_out.set(Window::DimY, Window::Dimension(0, 0, 0));
 106
 107     Iterator in(_input, window);
 108     Iterator out(_output, win_out);
 109
 110     switch(_input->info()->data_type())
 111     {
 112         case DataType::F32:
 113         {
 114             const size_t out_stride = _output->info()->strides_in_bytes()[1] / sizeof(float);
 115
 116             execute_window_loop(window, [&](const Coordinates & id)
 117             {
 118                 const auto        in_ptr = reinterpret_cast<const float *>(in.ptr());
 119                 const float32x4_t data   = vld1q_f32(in_ptr);
 120                 /* Output address = base addr + (y * 4) + (x / 4 ) * stride */
 121                 const auto out_ptr = reinterpret_cast<float *>(out.ptr()) + (id.y() << 2) + (id.x() >> 2) * out_stride;
 122                 vst1q_f32(out_ptr, data);
 123             },
 124             in, out);
 125             break;
 126         }
 127         case DataType::U8:
 128         {
 129             const size_t out_stride = _output->info()->strides_in_bytes()[1] / sizeof(uint8_t);
 130             execute_window_loop(window, [&](const Coordinates & id)
 131             {
 132                 const auto in_ptr = reinterpret_cast<const uint8_t *>(in.ptr());
 133                 /* Output address = base addr + (y * 4) + (x / 4 ) * stride */
 134                 const auto out_ptr = reinterpret_cast<uint8_t *>(out.ptr()) + (id.y() << 2) + (id.x() >> 2) * out_stride;
 135                 std::copy_n(in_ptr, 4, out_ptr);
 136             },
 137             in, out);
 138             break;
 139         }
 140
 141         case DataType::F16:
 142 #ifdef ARM_COMPUTE_ENABLE_FP16
 143             {
 144                 const size_t out_stride = _output->info()->strides_in_bytes()[1] / sizeof(float16_t);
 145
 146                 execute_window_loop(window, [&](const Coordinates & id)
 147                 {
 148                     const auto in_ptr = reinterpret_cast<const float16_t *>(in.ptr());
 149                     // Output address = base addr + (y * 8) + (x / 8 ) * stride
 150                     float16_t *out_ptr = reinterpret_cast<float16_t *>(out.ptr()) + (id.y() << 3) + (id.x() >> 3) * out_stride;
 151                     vst1q_f16(out_ptr, vld1q_f16(in_ptr));
 152                 },
 153                 in, out);
 154                 break;
 155             }
 156 #endif
 157         default:
 158         {
 159             ARM_COMPUTE_ERROR("Data type not supported");
 160             break;
 161         }
 162     }
 163 }