24 #ifndef __ARM_ASSEMBLY_HELPER_H__ 25 #define __ARM_ASSEMBLY_HELPER_H__ 33 #include "arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapper.h" 34 #include "arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp" 44 template <
typename TypeInput,
typename TypeOutput>
92 const int batch_stride_a = stride_in_bytes_a /
sizeof(TypeInput);
99 const auto in0_ptr =
reinterpret_cast<const TypeInput *
>(_a->
buffer());
100 const auto in1_ptr =
reinterpret_cast<const TypeInput *
>(_b->
buffer());
101 auto out_ptr =
reinterpret_cast<TypeOutput *
>(_d->
buffer());
103 _gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a, in1_ptr, ldb, multi_stride_b, out_ptr, ldd, batch_stride_d, multi_stride_d);
104 if(_gemm_kernel_asm->B_pretranspose_required())
107 const unsigned int alignment = 128;
108 void *raw_ptr =
reinterpret_cast<void *
>(_pretranspose->
buffer());
110 void *aligned_ptr =
support::cpp11::align(alignment, _gemm_kernel_asm->get_B_pretransposed_array_size(), raw_ptr, space);
112 _gemm_kernel_asm->pretranspose_B_array(aligned_ptr, in1_ptr, ldb, multi_stride_b);
158 template <
typename T>
171 std::unique_ptr<typename T::AssemblyGemm>
172 asm_gemm(arm_gemm::gemm<typename T::TypeOperator, typename T::TypeResult>(ci, M, N, K, batches, multis,
false,
false, alpha, beta, num_threads, pretranspose_hint));
174 std::unique_ptr<NEGEMMAssemblyWrapper<typename T::AssemblyGemm>>
175 acl_gemm_wrapper = support::cpp14::make_unique<NEGEMMAssemblyWrapper<typename T::AssemblyGemm>>();
176 if(acl_gemm_wrapper !=
nullptr && asm_gemm !=
nullptr)
178 acl_gemm_wrapper->configure(asm_gemm.get());
179 const size_t workspace_size = asm_gemm->get_working_size();
183 const unsigned int alignment = 4096;
186 asm_gemm->set_working_space(reinterpret_cast<typename T::TypeResult *>(workspace.
buffer()));
192 const unsigned int window_size = asm_gemm->get_window_size();
193 if(window_size < num_threads)
195 num_threads = window_size;
196 asm_gemm->set_nthreads(num_threads);
201 if(asm_gemm->B_pretranspose_required())
204 const unsigned int alignment = 128;
205 const size_t B_pretranspose_size = asm_gemm->get_B_pretransposed_array_size();
208 asm_glue._pretranspose = &B_pretranspose;
211 asm_glue._gemm_kernel_asm = std::move(asm_gemm);
212 asm_glue._optimised_kernel = std::move(acl_gemm_wrapper);
ITensor * _pretranspose
Pre-transpose tensor.
int8_t TypeOperator
Operator type.
T z() const
Alias to access the size of the third dimension.
const AssemblyKernelGlue< TypeInput, TypeOutput > & operator=(const AssemblyKernelGlue< TypeInput, TypeOutput > &)=delete
Prevent instances of this class from being copy assigned (deleted copy-assignment operator).
arm_gemm::GemmCommon< int8_t, int32_t > AssemblyGemm
Assembly Gemm.
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
const ITensor * _a
Input A.
uint8_t * buffer() const override
Interface to be implemented by the child class to return a pointer to CPU memory.
const ITensor * _b
Input B.
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Interface for NEON tensor.
This file contains all available output stages for GEMMLowp on OpenCL.
TensorAllocator * allocator()
Return a pointer to the tensor's allocator.
T x() const
Alias to access the size of the first dimension.
AssemblyKernelGlue()
Default constructor.
size_t total_size_upper(size_t dimension) const
Collapses given dimension and above.
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
virtual const TensorShape & tensor_shape() const =0
Size for each dimension of the tensor.
virtual uint8_t * buffer() const =0
Interface to be implemented by the child class to return a pointer to CPU memory. ...
void allocate() override
Allocate CPU memory of the size specified by the tensor's TensorInfo.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
Basic implementation of the tensor interface.
void run()
Configures the array pointers and strides in the assembly kernel and executes the assembly kernel.
void * align(std::size_t alignment, std::size_t size, void *&ptr, std::size_t &space)
bool setup_assembly_kernel(const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint, Tensor &workspace, Tensor &B_pretranspose, MemoryGroup &memory_group, T &asm_glue)
Create a wrapper kernel.
int32_t TypeResult
Result type.
void allocate_workspace(size_t workspace_size, Tensor &workspace, MemoryGroup *memory_group, size_t alignment, unsigned int num_threads)
Allocate a workspace tensor.
void mark_as_unused() const
Marks a tensor as unused.
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
CPUInfo & cpu_info()
Get CPU info.
virtual size_t total_size() const =0
Returns the total size of the tensor in bytes.
T y() const
Alias to access the size of the second dimension.
Num samples, height, width, channels.
std::unique_ptr< INEKernel > _optimised_kernel
Optimised NEON kernel.
void init(const TensorAllocator &allocator, const Coordinates &coords, TensorInfo sub_info)
Shares the same backing memory with another tensor allocator, while the tensor info might be different.
Store the tensor's metadata.
virtual const Strides & strides_in_bytes() const =0
The strides in bytes for accessing each dimension of the tensor.
virtual unsigned int num_threads() const =0
Returns the number of threads that the SingleThreadScheduler has in its pool.
std::unique_ptr< AssemblyGemm > _gemm_kernel_asm
Assembly Gemm kernel.
static IScheduler & get()
Access the scheduler singleton.
virtual void schedule(ICPPKernel *kernel, unsigned int split_dimension)=0
Runs the kernel in the same thread as the caller synchronously.
virtual DataLayout data_layout() const =0
Get the data layout of the tensor.
#define ARM_COMPUTE_ERROR_ON_MSG(cond,...)