24 #ifndef __ARM_ASSEMBLY_HELPER_H__ 25 #define __ARM_ASSEMBLY_HELPER_H__ 33 #include "arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapper.h" 34 #include "arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp" 44 template <
typename TypeInput,
typename TypeOutput>
92 const int batch_stride_a = stride_in_bytes_a /
sizeof(TypeInput);
99 const auto in0_ptr =
reinterpret_cast<const TypeInput *
>(_a->
buffer());
100 const auto in1_ptr =
reinterpret_cast<const TypeInput *
>(_b->
buffer());
101 auto out_ptr =
reinterpret_cast<TypeOutput *
>(_d->
buffer());
103 _gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a, in1_ptr, ldb, multi_stride_b, out_ptr, ldd, batch_stride_d, multi_stride_d);
104 if(_gemm_kernel_asm->B_pretranspose_required())
107 const unsigned int alignment = 128;
108 void *raw_ptr =
reinterpret_cast<void *
>(_pretranspose->
buffer());
110 void *aligned_ptr =
support::cpp11::align(alignment, _gemm_kernel_asm->get_B_pretransposed_array_size(), raw_ptr, space);
112 _gemm_kernel_asm->pretranspose_B_array(aligned_ptr, in1_ptr, ldb, multi_stride_b);
158 template <
typename T>
171 std::unique_ptr<typename T::AssemblyGemm>
172 asm_gemm(arm_gemm::gemm<typename T::TypeOperator, typename T::TypeResult>(ci, M, N, K, batches, multis,
false,
false, alpha, beta, num_threads, pretranspose_hint));
174 std::unique_ptr<NEGEMMAssemblyWrapper<typename T::AssemblyGemm>>
175 acl_gemm_wrapper = support::cpp14::make_unique<NEGEMMAssemblyWrapper<typename T::AssemblyGemm>>();
176 if(acl_gemm_wrapper !=
nullptr && asm_gemm !=
nullptr)
178 acl_gemm_wrapper->configure(asm_gemm.get());
179 const size_t workspace_size = asm_gemm->get_working_size();
183 const unsigned int alignment = 4096;
186 asm_gemm->set_working_space(reinterpret_cast<typename T::TypeResult *>(workspace.
buffer()));
192 const unsigned int window_size = asm_gemm->get_window_size();
193 if(window_size < num_threads)
195 num_threads = window_size;
196 asm_gemm->set_nthreads(num_threads);
201 if(asm_gemm->B_pretranspose_required())
204 const unsigned int alignment = 128;
205 const size_t B_pretranspose_size = asm_gemm->get_B_pretransposed_array_size();
208 asm_glue._pretranspose = &B_pretranspose;
211 asm_glue._gemm_kernel_asm = std::move(asm_gemm);
212 asm_glue._optimised_kernel = std::move(acl_gemm_wrapper);
ITensor * _pretranspose
Pre-transpose tensor.
int8_t TypeOperator
Operator type.
T z() const
Alias to access the size of the third dimension.
const AssemblyKernelGlue< TypeInput, TypeOutput > & operator=(const AssemblyKernelGlue< TypeInput, TypeOutput > &)=delete
Prevent instances of this class from being copy assigned (deleted copy-assignment operator).
arm_gemm::GemmCommon< int8_t, int32_t > AssemblyGemm
Assembly Gemm.
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
const ITensor * _a
Input A.
uint8_t * buffer() const override
Interface to be implemented by the child class to return a pointer to CPU memory.
const ITensor * _b
Input B.
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Interface for NEON tensor.
This file contains all available output stages for GEMMLowp on OpenCL.
TensorAllocator * allocator()
Return a pointer to the tensor's allocator.
T x() const
Alias to access the size of the first dimension.
AssemblyKernelGlue()
Default constructor.
size_t total_size_upper(size_t dimension) const
Collapses given dimension and above.
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
virtual const TensorShape & tensor_shape() const =0
Size for each dimension of the tensor.
virtual uint8_t * buffer() const =0
Interface to be implemented by the child class to return a pointer to CPU memory. ...
void allocate() override
Allocate CPU memory of the size specified by the tensor's TensorInfo.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
Basic implementation of the tensor interface.
void run()
Configures the array pointers and strides in the assembly kernel and executes the assembly kernel.
void * align(std::size_t alignment, std::size_t size, void *&ptr, std::size_t &space)
bool setup_assembly_kernel(const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint, Tensor &workspace, Tensor &B_pretranspose, MemoryGroup &memory_group, T &asm_glue)
Create a wrapper kernel.
int32_t TypeResult
Result type.
void allocate_workspace(size_t workspace_size, Tensor &workspace, MemoryGroup *memory_group, size_t alignment, unsigned int num_threads)
Allocate a workspace tensor.
void mark_as_unused() const
Marks a tensor as unused.
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
CPUInfo & cpu_info()
Get CPU info.
virtual size_t total_size() const =0
Returns the total size of the tensor in bytes.
T y() const
Alias to access the size of the second dimension.
Num samples, height, width, channels.
std::unique_ptr< INEKernel > _optimised_kernel
Optimised NEON kernel.
void init(const TensorAllocator &allocator, const Coordinates &coords, TensorInfo sub_info)
Shares the same backing memory with another tensor allocator, while the tensor info might be different.
Store the tensor's metadata.
virtual const Strides & strides_in_bytes() const =0
The strides in bytes for accessing each dimension of the tensor.
virtual unsigned int num_threads() const =0
Returns the number of threads that the SingleThreadScheduler has in its pool.
std::unique_ptr< AssemblyGemm > _gemm_kernel_asm
Assembly Gemm kernel.
static IScheduler & get()
Access the scheduler singleton.
virtual void schedule(ICPPKernel *kernel, unsigned int split_dimension)=0
Runs the kernel in the same thread as the caller synchronously.
virtual DataLayout data_layout() const =0
Get the data layout of the tensor.
#define ARM_COMPUTE_ERROR_ON_MSG(cond,...)