From: Anthony Barbier Date: Wed, 12 Apr 2017 14:12:46 +0000 (+0100) Subject: arm_compute v17.04 X-Git-Tag: submit/tizen/20180223.063230~17 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=a437638028c216a03572593b9d4e3532df2a308d;p=platform%2Fupstream%2Farmcl.git arm_compute v17.04 --- diff --git a/arm_compute/core/AccessWindowAutoPadding.h b/arm_compute/core/AccessWindowAutoPadding.h index cf6555296..ef058bc3e 100644 --- a/arm_compute/core/AccessWindowAutoPadding.h +++ b/arm_compute/core/AccessWindowAutoPadding.h @@ -58,12 +58,16 @@ public: AccessWindowAutoPadding &operator=(AccessWindowAutoPadding &&) = default; ~AccessWindowAutoPadding() = default; + /** Set the valid region to match the entire tensor. */ void set_valid_region(); + /** Return a valid region that spans across the entire tensor. */ + ValidRegion compute_valid_region() const; + // Inherited methods overridden: bool update_window_if_needed(Window &window) const override; bool update_padding_if_needed(const Window &window) const override; - void set_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) override; + ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override; private: TensorInfo *_info; diff --git a/arm_compute/core/AccessWindowStatic.h b/arm_compute/core/AccessWindowStatic.h index 3898eb219..9c269983f 100644 --- a/arm_compute/core/AccessWindowStatic.h +++ b/arm_compute/core/AccessWindowStatic.h @@ -62,11 +62,25 @@ public: AccessWindowStatic &operator=(AccessWindowStatic &&) = default; ~AccessWindowStatic() = default; + /** Set the valid region based on the static access pattern and valid + * region of the inputs. + * + * @param[in] window Execution window of the kernel. + * @param[in] input_valid_region Combined valid region of all inputs. + */ + void set_valid_region(const Window &window, const ValidRegion &input_valid_region); + + /** Compute the valid region based on the static access pattern and valid region of the inputs. + * + * @param[in] window Execution window of the kernel. + * @param[in] input_valid_region Combined valid region of all inputs. 
+ */ + ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region) const; + // Inherited methods overriden: bool update_window_if_needed(Window &window) const override; bool update_padding_if_needed(const Window &window) const override; - void set_valid_region(const Window &window, ValidRegion input_valid_region); - void set_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) override; + ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override; TensorInfo *_info; int _start_x; diff --git a/arm_compute/core/AccessWindowTranspose.h b/arm_compute/core/AccessWindowTranspose.h index d3803aad5..42765032a 100644 --- a/arm_compute/core/AccessWindowTranspose.h +++ b/arm_compute/core/AccessWindowTranspose.h @@ -41,8 +41,8 @@ public: using AccessWindowRectangle::AccessWindowRectangle; bool update_window_if_needed(Window &window) const override; bool update_padding_if_needed(const Window &window) const override; - using AccessWindowRectangle::set_valid_region; - void set_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) override; + using AccessWindowRectangle::compute_valid_region; + ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override; }; } // namespace arm_compute #endif /*__ARM_COMPUTE_IACCESS_WINDOW_TRANSPOSE_H__*/ diff --git a/arm_compute/core/CL/CLKernels.h b/arm_compute/core/CL/CLKernels.h index 7da1bb5bf..70789b209 100644 --- a/arm_compute/core/CL/CLKernels.h +++ b/arm_compute/core/CL/CLKernels.h @@ -25,7 +25,6 @@ #define __ARM_COMPUTE_CLKERNELS_H__ /* Header regrouping all the CL kernels */ - #include "arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h" #include "arm_compute/core/CL/kernels/CLAccumulateKernel.h" #include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h" @@ -50,6 +49,7 @@ #include "arm_compute/core/CL/kernels/CLFastCornersKernel.h" #include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" #include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h" +#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h" #include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h" #include "arm_compute/core/CL/kernels/CLGEMMMatrixAdditionKernel.h" #include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h" diff --git a/arm_compute/core/CL/ICLSimpleKernel.h b/arm_compute/core/CL/ICLSimpleKernel.h index 986c86fcb..e9fdb7fb8 100644 --- a/arm_compute/core/CL/ICLSimpleKernel.h +++ b/arm_compute/core/CL/ICLSimpleKernel.h @@ -49,13 +49,13 @@ public: /** Configure the kernel * - * @param[in] input Source tensor. - * @param[out] output Destination tensor. - * @param[in] processed_elements Number of processed elements per iteration. - * @param[in] border_undefined (Optional) True if the border mode is undefined. False if it's replicate or constant. - * @param[in] border_size (Optional) Size of the border. + * @param[in] input Source tensor. + * @param[out] output Destination tensor. + * @param[in] num_elems_processed_per_iteration Number of processed elements per iteration. + * @param[in] border_undefined (Optional) True if the border mode is undefined. False if it's replicate or constant. + * @param[in] border_size (Optional) Size of the border. 
*/ - void configure(const ICLTensor *input, ICLTensor *output, unsigned int processed_elements, bool border_undefined = false, const BorderSize &border_size = BorderSize()); + void configure(const ICLTensor *input, ICLTensor *output, unsigned int num_elems_processed_per_iteration, bool border_undefined = false, const BorderSize &border_size = BorderSize()); protected: const ICLTensor *_input; diff --git a/arm_compute/core/CL/kernels/CLCannyEdgeKernel.h b/arm_compute/core/CL/kernels/CLCannyEdgeKernel.h index ea27d264c..5ca3e0341 100644 --- a/arm_compute/core/CL/kernels/CLCannyEdgeKernel.h +++ b/arm_compute/core/CL/kernels/CLCannyEdgeKernel.h @@ -47,32 +47,28 @@ public: * * @note gx, gy and mag must all be the same size (either 16 or 32). * - * @param[in] gx Source tensor - Gx component. Data types supported: S16/S32. - * @param[in] gy Source tensor - Gy component. Data types supported: Same as gx. - * @param[out] magnitude Destination tensor - Magnitude. Data types supported: U16/U32. Must match the pixel size of gx, gy. - * @param[out] phase Destination tensor - Quantized phase. Data types supported: U8. - * @param[in] norm_type Normalization type. if 1, L1-Norm otherwise L2-Norm. - * @param[in] num_pixel_to_skip_prev Number of pixels to skip of previous stage if border_mode = UNDEFINED - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. + * @param[in] gx Source tensor - Gx component. Data types supported: S16/S32. + * @param[in] gy Source tensor - Gy component. Data types supported: Same as gx. + * @param[out] magnitude Destination tensor - Magnitude. Data types supported: U16/U32. Must match the pixel size of gx, gy. + * @param[out] phase Destination tensor - Quantized phase. Data types supported: U8. + * @param[in] norm_type Normalization type. if 1, L1-Norm otherwise L2-Norm. */ - void configure(const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase, int32_t norm_type, int32_t num_pixel_to_skip_prev, bool border_undefined); + void configure(const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase, int32_t norm_type); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; - BorderSize border_size() const override; private: - const ICLTensor *_gx; /**< Source tensor - Gx component */ - const ICLTensor *_gy; /**< Source tensor - Gy component */ - ICLTensor *_magnitude; /**< Destination tensor - Magnitude */ - ICLTensor *_phase; /**< Destination tensor - Quantized phase */ - unsigned int _pixels_to_skip; /**< Pixels to skip around the border. */ + const ICLTensor *_gx; /**< Source tensor - Gx component */ + const ICLTensor *_gy; /**< Source tensor - Gy component */ + ICLTensor *_magnitude; /**< Destination tensor - Magnitude */ + ICLTensor *_phase; /**< Destination tensor - Quantized phase */ }; /** OpenCL kernel to perform Non-Maxima suppression for Canny Edge. * * @note This kernel is meant to be used alongside CannyEdge and performs a non-maxima suppression using magnitude and phase of input - * to characterize points as possible edges. + * to characterize points as possible edges. The output buffer needs to be cleared before this kernel is executed. * * @note Hysteresis is computed in @ref CLEdgeTraceKernel */ @@ -87,24 +83,22 @@ public: CLEdgeNonMaxSuppressionKernel &operator=(const CLEdgeNonMaxSuppressionKernel &) = delete; /** Initialise the kernel's sources, destination and border mode. 
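For readers of the Canny hunk above: the gradient stage no longer takes border parameters; in this release the surrounding CLCannyEdge function pre-fills the magnitude border with a CLFillBorderKernel instead (see the CLCannyEdge.h hunk further down). A minimal configure sketch, with the kernel class name inferred from the CLCannyEdge members and all tensor names illustrative:

    // norm_type: 1 selects the L1 norm, any other value the L2 norm.
    arm_compute::CLGradientKernel gradient;
    gradient.configure(&gx, &gy, &magnitude, &phase, 1);
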
* - * @param[in] magnitude Source tensor - Magnitude. Data types supported: U16/U32. - * @param[in] phase Source tensor - Quantized phase. Data types supported: U8. - * @param[out] output Destination tensor - * @param[in] lower_thr Lower threshold. - * @param[in] num_pixel_to_skip_prev Number of pixels to skip of previous stage if border_mode = UNDEFINED - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. + * @param[in] magnitude Source tensor - Magnitude. Data types supported: U16/U32. + * @param[in] phase Source tensor - Quantized phase. Data types supported: U8. + * @param[out] output Destination tensor. Data types supported: U16/U32. + * @param[in] lower_thr Lower threshold. + * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. */ - void configure(const ICLTensor *magnitude, const ICLTensor *phase, ICLTensor *output, int32_t lower_thr, int32_t num_pixel_to_skip_prev, bool border_undefined); + void configure(const ICLTensor *magnitude, const ICLTensor *phase, ICLTensor *output, int32_t lower_thr, bool border_undefined); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; BorderSize border_size() const override; private: - const ICLTensor *_magnitude; /**< Source tensor - Magnitude. */ - const ICLTensor *_phase; /**< Source tensor - Quantized phase. */ - ICLTensor *_output; /**< Destination tensor. */ - unsigned int _pixels_to_skip; /**< Pixels to skip around the border. */ + const ICLTensor *_magnitude; /**< Source tensor - Magnitude. */ + const ICLTensor *_phase; /**< Source tensor - Quantized phase. */ + ICLTensor *_output; /**< Destination tensor. */ }; /** OpenCL kernel to perform Edge tracing. @@ -120,28 +114,24 @@ public: CLEdgeTraceKernel &operator=(const CLEdgeTraceKernel &) = delete; /** Initialise the kernel's source, destination and border mode. * - * @param[in] input Source tensor. Data types supported: U8. - * @param[out] output Destination tensor. Data types supported: U8. - * @param[in] upper_thr Upper threshold used for the hysteresis - * @param[in] lower_thr Lower threshold used for the hysteresis - * @param[in,out] visited Tensor for keeping the visited pixels. Data types supported: U32. - * Expected to be initialized to 0 before each run. - * @param[in,out] recorded Tensor for keeping the recorded pixels. Data types supported: U32 - * Expected to be initialized to 0 before each run. - * @param[in,out] l1_stack Tensor with the L1 stack for each pixel. Data types supported: S32. - * Expected to be initialized to 0 before each run. - * @param[in,out] l1_stack_counter Tensor for counting the elements in the L1 stack of each pixel. Data types supported: U8. - * Expected to be initialized to 0 before each run. - * @param[in] num_pixel_to_skip_prev Number of pixels to skip of previous stage if border_mode = UNDEFINED. - public: * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. + * @param[in] input Source tensor. Data types supported: U8. + * @param[out] output Destination tensor. Data types supported: U8. + * @param[in] upper_thr Upper threshold used for the hysteresis + * @param[in] lower_thr Lower threshold used for the hysteresis + * @param[in,out] visited Tensor for keeping the visited pixels. Data types supported: U32. + * Expected to be initialized to 0 before each run. + * @param[in,out] recorded Tensor for keeping the recorded pixels. 
Data types supported: U32 + * Expected to be initialized to 0 before each run. + * @param[in,out] l1_stack Tensor with the L1 stack for each pixel. Data types supported: S32. + * Expected to be initialized to 0 before each run. + * @param[in,out] l1_stack_counter Tensor for counting the elements in the L1 stack of each pixel. Data types supported: U8. + * Expected to be initialized to 0 before each run. */ void configure(const ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, - ICLTensor *visited, ICLTensor *recorded, ICLTensor *l1_stack, ICLTensor *l1_stack_counter, - int32_t num_pixel_to_skip_prev, bool border_undefined); + ICLTensor *visited, ICLTensor *recorded, ICLTensor *l1_stack, ICLTensor *l1_stack_counter); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; - BorderSize border_size() const override; private: const ICLTensor *_input; /**< Source tensor. */ @@ -152,7 +142,6 @@ private: ICLTensor *_recorded; /**< Marks recorded elements */ ICLTensor *_l1_stack; /**< L1 hysteris stack */ ICLTensor *_l1_stack_counter; /**< L1 hysteris stack counter */ - unsigned int _pixels_to_skip; /**< Pixels to skip */ }; } #endif /* __ARM_COMPUTE_CLCANNYEDGEKERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h b/arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h index 5bb8051ad..6d79d0e71 100644 --- a/arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h +++ b/arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h @@ -24,14 +24,14 @@ #ifndef __ARM_COMPUTE_CLGAUSSIANPYRAMIDKERNEL_H__ #define __ARM_COMPUTE_CLGAUSSIANPYRAMIDKERNEL_H__ -#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/CL/ICLSimpleKernel.h" namespace arm_compute { class ICLTensor; /** OpenCL kernel to perform a Gaussian filter and half scaling across width (horizontal pass) */ -class CLGaussianPyramidHorKernel : public ICLKernel +class CLGaussianPyramidHorKernel : public ICLSimpleKernel { public: /** Default constructor */ @@ -60,12 +60,12 @@ public: BorderSize border_size() const override; private: - const ICLTensor *_input; - ICLTensor *_output; + BorderSize _border_size; + int _l2_load_offset; }; /** OpenCL kernel to perform a Gaussian filter and half scaling across height (vertical pass) */ -class CLGaussianPyramidVertKernel : public ICLKernel +class CLGaussianPyramidVertKernel : public ICLSimpleKernel { public: /** Default constructor */ @@ -94,8 +94,7 @@ public: BorderSize border_size() const override; private: - const ICLTensor *_input; - ICLTensor *_output; + int _t2_load_offset; }; } #endif /*__ARM_COMPUTE_CLGAUSSIANPYRAMIDKERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLLKTrackerKernel.h b/arm_compute/core/CL/kernels/CLLKTrackerKernel.h index ef21694d5..4d0dbed55 100644 --- a/arm_compute/core/CL/kernels/CLLKTrackerKernel.h +++ b/arm_compute/core/CL/kernels/CLLKTrackerKernel.h @@ -108,6 +108,16 @@ public: class CLLKTrackerStage0Kernel : public ICLKernel { public: + /** Default constructor */ + CLLKTrackerStage0Kernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLLKTrackerStage0Kernel(const CLLKTrackerStage0Kernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLLKTrackerStage0Kernel &operator=(const CLLKTrackerStage0Kernel &) = delete; + /** Allow instances of this class to be moved */ + CLLKTrackerStage0Kernel(CLLKTrackerStage0Kernel &&) = default; + /** Allow instances of this class 
to be moved */ + CLLKTrackerStage0Kernel &operator=(CLLKTrackerStage0Kernel &&) = default; /** Initialise the kernel input and output * * @param[in] old_input Pointer to the input old tensor. Data types supported: U8 @@ -119,21 +129,35 @@ public: * @param[out] old_ival Pointer to the array holding internal values * @param[in] window_dimension The size of the window on which to perform the algorithm * @param[in] level The pyramid level - * @param[in] border_offset The offset used to define the boundary of the tracked pixels in different border modes */ void configure(const ICLTensor *old_input, const ICLTensor *old_scharr_gx, const ICLTensor *old_scharr_gy, ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal, ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival, - size_t window_dimension, size_t level, int32_t border_offset); + size_t window_dimension, size_t level); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_old_input; + const ICLTensor *_old_scharr_gx; + const ICLTensor *_old_scharr_gy; }; /** Interface to run the second stage of LKTracker, where the motion vectors of the given points are computed */ class CLLKTrackerStage1Kernel : public ICLKernel { public: + /** Default constructor */ + CLLKTrackerStage1Kernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLLKTrackerStage1Kernel(const CLLKTrackerStage1Kernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLLKTrackerStage1Kernel &operator=(const CLLKTrackerStage1Kernel &) = delete; + /** Allow instances of this class to be moved */ + CLLKTrackerStage1Kernel(CLLKTrackerStage1Kernel &&) = default; + /** Allow instances of this class to be moved */ + CLLKTrackerStage1Kernel &operator=(CLLKTrackerStage1Kernel &&) = default; /** Initialise the kernel input and output * * @param[in] new_input Pointer to the input new tensor. Data types supported: U8 @@ -145,13 +169,15 @@ public: * @param[in] num_iterations The maximum number of iterations before terminating the algorithm * @param[in] window_dimension The size of the window on which to perform the algorithm * @param[in] level The pyramid level - * @param[in] border_offset The offset used to define the boundary of the tracked pixels in different border modes */ void configure(const ICLTensor *new_input, ICLLKInternalKeypointArray *new_points_internal, ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival, - Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, size_t level, int32_t border_offset); + Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, size_t level); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_new_input; }; } #endif /*__ARM_COMPUTE_CLLKTRACKERKERNEL_H__ */ diff --git a/arm_compute/core/CPP/CPPKernels.h b/arm_compute/core/CPP/CPPKernels.h new file mode 100644 index 000000000..213a9e6b3 --- /dev/null +++ b/arm_compute/core/CPP/CPPKernels.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CPPKERNELS_H__ +#define __ARM_COMPUTE_CPPKERNELS_H__ + +/* Header regrouping all the CPP kernels */ +#include "arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h" +#include "arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h" + +#endif /* __ARM_COMPUTE_CPPKERNELS_H__ */ diff --git a/arm_compute/core/CPP/ICPPSimpleKernel.h b/arm_compute/core/CPP/ICPPSimpleKernel.h index 3c33c4d37..105de397a 100644 --- a/arm_compute/core/CPP/ICPPSimpleKernel.h +++ b/arm_compute/core/CPP/ICPPSimpleKernel.h @@ -50,13 +50,13 @@ public: protected: /** Configure the kernel * - * @param[in] input Source tensor. - * @param[out] output Destination tensor. - * @param[in] processed_elements Number of processed elements per iteration. - * @param[in] border_undefined (Optional) True if the border mode is undefined. False if it's replicate or constant. - * @param[in] border_size (Optional) Size of the border. + * @param[in] input Source tensor. + * @param[out] output Destination tensor. + * @param[in] num_elems_processed_per_iteration Number of processed elements per iteration. + * @param[in] border_undefined (Optional) True if the border mode is undefined. False if it's replicate or constant. + * @param[in] border_size (Optional) Size of the border. */ - void configure(const ITensor *input, ITensor *output, unsigned int processed_elements, bool border_undefined = false, const BorderSize &border_size = BorderSize()); + void configure(const ITensor *input, ITensor *output, unsigned int num_elems_processed_per_iteration, bool border_undefined = false, const BorderSize &border_size = BorderSize()); protected: const ITensor *_input; diff --git a/arm_compute/core/Dimensions.h b/arm_compute/core/Dimensions.h index 87050d221..d627517b2 100644 --- a/arm_compute/core/Dimensions.h +++ b/arm_compute/core/Dimensions.h @@ -116,6 +116,37 @@ public: _num_dimensions = num_dimensions; } + /** Returns a read/write iterator that points to the first element in the dimension array. */ + typename std::array::iterator begin() + { + return _id.begin(); + } + /** Returns a read-only (constant) iterator that points to the first element in the dimension array. */ + typename std::array::const_iterator begin() const + { + return _id.begin(); + } + /** Returns a read-only (constant) iterator that points to the first element in the dimension array. 
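One practical consequence of the Dimensions iterators introduced in the hunk above: TensorShape, which derives from Dimensions<size_t>, now composes with standard algorithms. A small sketch, not taken from the library itself:

    #include "arm_compute/core/TensorShape.h"
    #include <functional>
    #include <numeric>

    // Multiply the active dimensions together via the new cbegin() iterator.
    size_t element_count(const arm_compute::TensorShape &shape)
    {
        return std::accumulate(shape.cbegin(), shape.cbegin() + shape.num_dimensions(),
                               size_t{ 1 }, std::multiplies<size_t>());
    }
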
*/ + typename std::array::const_iterator cbegin() const + { + return begin(); + } + /** Returns a read/write iterator that points one past the last element in the dimension array. */ + typename std::array::iterator end() + { + return _id.end(); + } + /** Returns a read-only (constant) iterator that points one past the last element in the dimension array. */ + typename std::array::const_iterator end() const + { + return _id.end(); + } + /** Returns a read-only (constant) iterator that points one past the last element in the dimension array. */ + typename std::array::const_iterator cend() const + { + return end(); + } + protected: std::array _id; size_t _num_dimensions{ 0 }; diff --git a/arm_compute/core/Helpers.h b/arm_compute/core/Helpers.h index 44cf30c76..75684ce40 100644 --- a/arm_compute/core/Helpers.h +++ b/arm_compute/core/Helpers.h @@ -108,11 +108,10 @@ inline uint8_t delta_bilinear_c1u8(const uint8_t *pixel_ptr, size_t stride, floa * * @warning Only works if the iterator was created with an IImage * - * @param[in[ first_pixel_ptr Pointer to the first pixel of a single channel U8 image. - * @param[in[ stride Stride in bytes of the image; - * - * @param[in] x X position of the wanted pixel - * @param[in] y Y position of the wanted pixel + * @param[in] first_pixel_ptr Pointer to the first pixel of a single channel U8 image. + * @param[in] stride Stride in bytes of the image; + * @param[in] x X position of the wanted pixel + * @param[in] y Y position of the wanted pixel * * @return The pixel at (x, y) using bilinear interpolation. */ diff --git a/arm_compute/core/IAccessWindow.h b/arm_compute/core/IAccessWindow.h index 8d5c455a6..3b905edec 100644 --- a/arm_compute/core/IAccessWindow.h +++ b/arm_compute/core/IAccessWindow.h @@ -86,14 +86,16 @@ public: * @return True if the padding has been changed. */ virtual bool update_padding_if_needed(const Window &window) const = 0; - /** Set the valid region based on access pattern, valid region of the inputs and border mode. + /** Compute the valid region based on access pattern and valid region of the inputs. + * + * @note This method assumes that there is no border. * * @param[in] window Execution window of the kernel. * @param[in] input_valid_region Combined valid region of all inputs. * @param[in] border_undefined Undefined borders are excluded from the valid region. * @param[in] border_size Size of the border around the XY-plane of the tensor. */ - virtual void set_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) = 0; + virtual ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const = 0; }; /** Implementation of a rectangular access pattern. */ @@ -142,20 +144,28 @@ public: AccessWindowRectangle &operator=(AccessWindowRectangle &&) = default; ~AccessWindowRectangle() = default; - /** Set the valid region based on access pattern and valid region of the inputs. + /** Set the valid region based on access pattern, valid region of the inputs and border mode. + * + * @param[in] window Execution window of the kernel. + * @param[in] input_valid_region Combined valid region of all inputs. + * @param[in] border_undefined (Optional) Undefined borders are excluded from the valid region. + * @param[in] border_size (Optional) Size of the border around the XY-plane of the tensor. 
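The net effect of this IAccessWindow refactoring: the virtual, mutating set_valid_region() override becomes a const compute_valid_region() that returns the region, while a plain non-virtual set_valid_region() stores it. A hedged sketch of how a kernel's configure() might use the rectangle specialisation (AccessWindowHorizontal; the window and variable names are illustrative):

    using namespace arm_compute;

    // Inside SomeKernel::configure(input, output, ...), once the window 'win' exists:
    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);

    // The override is now a pure computation with no side effects...
    const ValidRegion region = output_access.compute_valid_region(
        win, input->info()->valid_region(), border_undefined, border_size());

    // ...while the non-virtual set_valid_region() computes and stores in one call.
    output_access.set_valid_region(win, input->info()->valid_region(),
                                   border_undefined, border_size());
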
+ */ + void set_valid_region(const Window &window, const ValidRegion &input_valid_region, bool border_undefined = false, const BorderSize &border_size = BorderSize(0)); + + /** Compute the valid region based on access pattern, valid region of the inputs and border mode. * * @note This method assumes that there is no border. - * @note This method assumes that all elements written by the kernel are valid. * * @param[in] window Execution window of the kernel. * @param[in] input_valid_region Combined valid region of all inputs. */ - void set_valid_region(const Window &window, ValidRegion input_valid_region); + ValidRegion compute_valid_region(const Window &window, const ValidRegion &input_valid_region) const; // Inherited methods overridden: /** @note This method assumes that all elements written by the kernel are valid. */ - void set_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) override; + ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override; bool update_window_if_needed(Window &window) const override; bool update_padding_if_needed(const Window &window) const override; diff --git a/arm_compute/core/ITensor.h b/arm_compute/core/ITensor.h index ef4ea7bbc..55464a744 100644 --- a/arm_compute/core/ITensor.h +++ b/arm_compute/core/ITensor.h @@ -76,6 +76,13 @@ public: * @param[in] src Source tensor to copy from. */ void copy_from(const ITensor &src); + + /** Print a tensor to a given stream using user defined formatting information + * + * @param s Output stream + * @param io_fmt Format information + */ + void print(std::ostream &s, IOFormatInfo io_fmt = IOFormatInfo()) const; }; using IImage = ITensor; diff --git a/arm_compute/core/NEON/NEKernels.h b/arm_compute/core/NEON/NEKernels.h index 2b5596477..55f54dd0b 100644 --- a/arm_compute/core/NEON/NEKernels.h +++ b/arm_compute/core/NEON/NEKernels.h @@ -52,6 +52,8 @@ #include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" #include "arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" diff --git a/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h b/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h index ab5787c88..7790cf1be 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h @@ -47,20 +47,33 @@ class ITensor; * \end{array} \right) * @f] * - * After this operation, the output matrix will have the following shape: [ height * 4, width / 4 ] + * After this operation, the output matrix will have the following shape: [ height * 4, ceil(width / 4.0f) ] */ class NEGEMMInterleave4x4Kernel : public INESimpleKernel { public: + /* Constructor */ + NEGEMMInterleave4x4Kernel(); /** Initialise the kernel's input and output. * - * @param[in] input Input tensor (Matrix A). Data types supported: F32, F16. - * @param[out] output Output tensor (Matrix A interleaved). Data type supported: same as @p input. + * @param[in] input Input tensor. 
Data types supported: U8/S8/U16/S16/F16/U32/S32/F32 + * @param[out] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input. */ void configure(const ITensor *input, ITensor *output); // Inherited methods overridden: void run(const Window &window) override; + +private: + /** Common signature for all the transpose functions + * + * @param[in] input An input tensor. Data types supported: U8/S8/U16/S16/F16/U32/S32/F32 + * @param[out] output The output tensor. Data type supported: same as @p input + * @param[in] window Region on which to execute the kernel. + */ + using GEMMInterleaveFunction = void(const ITensor *input, ITensor *output, const Window &window); + + GEMMInterleaveFunction *_func; /**< GEMM interleave function to use for the particular tensor types passed to configure() */ }; } #endif /*__ARM_COMPUTE_NEGEMMINTERLEAVE4x4KERNEL_H__*/ diff --git a/arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h b/arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h index decbb2024..40a6aa737 100644 --- a/arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h +++ b/arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h @@ -24,14 +24,14 @@ #ifndef __ARM_COMPUTE_NEGAUSSIANPYRAMIDKERNEL_H__ #define __ARM_COMPUTE_NEGAUSSIANPYRAMIDKERNEL_H__ -#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/NEON/INESimpleKernel.h" namespace arm_compute { class ITensor; /** NEON kernel to perform a GaussianPyramid (horizontal pass) */ -class NEGaussianPyramidHorKernel : public INEKernel +class NEGaussianPyramidHorKernel : public INESimpleKernel { public: /** Default constructor */ @@ -60,11 +60,12 @@ public: BorderSize border_size() const override; private: - const ITensor *_input; - ITensor *_output; + BorderSize _border_size; + int _l2_load_offset; }; + /** NEON kernel to perform a GaussianPyramid (vertical pass) */ -class NEGaussianPyramidVertKernel : public INEKernel +class NEGaussianPyramidVertKernel : public INESimpleKernel { public: /** Default constructor */ @@ -93,8 +94,7 @@ public: BorderSize border_size() const override; private: - const ITensor *_input; - ITensor *_output; + int _t2_load_offset; }; } #endif /*__ARM_COMPUTE_NEGAUSSIANPYRAMIDKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h b/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h index 34e45886a..3bcd686e8 100644 --- a/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h +++ b/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h @@ -74,6 +74,7 @@ protected: float _sensitivity; /**< Sensitivity value */ float _strength_thresh; /**< Threshold value */ float _norm_factor; /**< Normalization factor */ + BorderSize _border_size; /**< Border size */ }; /** Template NEON kernel to perform Harris Score. diff --git a/arm_compute/core/NEON/kernels/NEHistogramKernel.h b/arm_compute/core/NEON/kernels/NEHistogramKernel.h index e11b41f1d..2da3a0846 100644 --- a/arm_compute/core/NEON/kernels/NEHistogramKernel.h +++ b/arm_compute/core/NEON/kernels/NEHistogramKernel.h @@ -125,7 +125,7 @@ private: uint32_t *_local_hist; uint32_t *_window_lut; std::mutex _hist_mtx; - static constexpr unsigned int _max_range_size{ 256 }; //< 256 possible pixel values as we handle only U8 images + static constexpr unsigned int _max_range_size{ 256 }; ///< 256 possible pixel values as we handle only U8 images }; /** Interface for the histogram border handling kernel. 
@@ -195,7 +195,7 @@ private: const IImage *_input; IDistribution1D *_output; uint32_t *_window_lut; - static constexpr unsigned int _max_range_size{ 256 }; //< 256 possible pixel values as we handle only U8 images + static constexpr unsigned int _max_range_size{ 256 }; ///< 256 possible pixel values as we handle only U8 images }; } diff --git a/arm_compute/core/NEON/kernels/NELKTrackerKernel.h b/arm_compute/core/NEON/kernels/NELKTrackerKernel.h index e578a6afd..70d79a1dc 100644 --- a/arm_compute/core/NEON/kernels/NELKTrackerKernel.h +++ b/arm_compute/core/NEON/kernels/NELKTrackerKernel.h @@ -104,9 +104,9 @@ private: void init_keypoints(int start, int end); /** Compute the structure tensor A^T * A based on the scharr gradients I_x and I_y * - * @param[in] keypoint Keypoint for which gradients are computed - * @param[out] bilinear_x Intermediate interpolated data for X gradient - * @param[out] bilinear_y Intermediate interpolated data for Y gradient + * @param[in] keypoint Keypoint for which gradients are computed + * @param[out] bilinear_ix Intermediate interpolated data for X gradient + * @param[out] bilinear_iy Intermediate interpolated data for Y gradient * * @return Values A11, A12, A22 */ @@ -115,8 +115,8 @@ private: * * @param[in] old_keypoint Old keypoint for which gradient is computed * @param[in] new_keypoint New keypoint for which gradient is computed - * @param[in] bilinear_x Intermediate interpolated data for X gradient - * @param[in] bilinear_y Intermediate interpolated data for Y gradient + * @param[in] bilinear_ix Intermediate interpolated data for X gradient + * @param[in] bilinear_iy Intermediate interpolated data for Y gradient * * @return Values b1, b2 */ diff --git a/arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h b/arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h index b18dc001b..e405ea5ae 100644 --- a/arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h +++ b/arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h @@ -148,14 +148,14 @@ private: template struct create_func_table; - const IImage *_input; /**< Input image. */ - int32_t *_min; /**< Minimum value. */ - int32_t *_max; /**< Maximum value. */ - uint32_t *_min_count; /**< Count of minimum value encounters. */ - uint32_t *_max_count; /**< Count of maximum value encounters. */ - ICoordinates2DArray *_min_loc; /**< Locations of minimum values. */ - ICoordinates2DArray *_max_loc; /**< Locations of maximum values. */ - unsigned int _processed_elements; /**< Elements processed per iteration. */ + const IImage *_input; /**< Input image. */ + int32_t *_min; /**< Minimum value. */ + int32_t *_max; /**< Maximum value. */ + uint32_t *_min_count; /**< Count of minimum value encounters. */ + uint32_t *_max_count; /**< Count of maximum value encounters. */ + ICoordinates2DArray *_min_loc; /**< Locations of minimum values. */ + ICoordinates2DArray *_max_loc; /**< Locations of maximum values. */ + unsigned int _num_elems_processed_per_iteration; /**< Elements processed per iteration. */ }; } #endif /*__ARM_COMPUTE_NEMINMAXLOCATIONKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h b/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h index e74f9c207..cf74cac6c 100644 --- a/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h +++ b/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h @@ -54,7 +54,7 @@ public: /** Initialise the kernel's sources, destinations and border mode. * - * @param[in] input Source tensor. Data types supported: U8, F32. 
(Must be the same as the output tensor) + * @param[in] input Source tensor. Data types supported: U8, F32 * @param[out] output Destination tensor. Data types supported: same as @p input * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. */ @@ -72,11 +72,10 @@ protected: * @param[in] input_stride Stride of the input tensor */ using NonMaxSuppr3x3Function = void(const void *__restrict input_ptr, void *__restrict output_ptr, const uint32_t input_stride); - /** Non-Maxima suppression function to use for the particular tensor types passed to configure() */ - NonMaxSuppr3x3Function *_func; - const ITensor *_input; /**< Source tensor */ - ITensor *_output; /**< Destination tensor */ + NonMaxSuppr3x3Function *_func; /**< Non-Maxima suppression function to use for the particular tensor types passed to configure() */ + const ITensor *_input; /**< Source tensor */ + ITensor *_output; /**< Destination tensor */ }; #ifdef ARM_COMPUTE_ENABLE_FP16 @@ -87,8 +86,8 @@ class NENonMaximaSuppression3x3FP16Kernel : public NENonMaximaSuppression3x3Kern public: /** Initialise the kernel's sources, destinations and border mode. * - * @param[in] input Source tensor. Data types supported: U8, F32. (Must be the same as the output tensor) - * @param[out] output Destination tensor. Data types supported: U8, F32. (Must be the same as the input tensor) + * @param[in] input Source tensor. Data types supported: U8, F32. + * @param[out] output Destination tensor. Data types supported: same as @p input * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. */ void configure(const ITensor *input, ITensor *output, bool border_undefined); diff --git a/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h b/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h index 7df85581d..83d55d312 100644 --- a/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h @@ -36,7 +36,7 @@ class NELogits1DMaxKernel : public INESimpleKernel { public: /** Default constructor */ - NELogits1DMaxKernel() = default; + NELogits1DMaxKernel(); /** Set the input and output tensors. * * @param[in] input Source tensor. Data types supported: F32. @@ -46,6 +46,10 @@ public: // Inherited methods overridden: void run(const Window &window) override; + BorderSize border_size() const override; + +private: + BorderSize _border_size; }; /** Interface for shifting the logits values around the max value and exponentiating the result */ @@ -76,12 +80,14 @@ public: // Inherited methods overridden: void run(const Window &window) override; + BorderSize border_size() const override; private: const ITensor *_input; const ITensor *_max; ITensor *_output; ITensor *_sum; + BorderSize _border_size; }; /** Interface for calculating the final step of the Softmax Layer where each logit value is multiplied by the inverse of the sum of the logits. */ diff --git a/arm_compute/core/PyramidInfo.h b/arm_compute/core/PyramidInfo.h index 917a14d50..76b3852bb 100644 --- a/arm_compute/core/PyramidInfo.h +++ b/arm_compute/core/PyramidInfo.h @@ -47,6 +47,29 @@ public: PyramidInfo(PyramidInfo &&) = default; /** Allow instances of this class to be moved */ PyramidInfo &operator=(PyramidInfo &&) = default; + + /** Create pyramid info for 2D tensors + * + * @param[in] num_levels The number of pyramid levels. This is required to be a non-zero value + * @param[in] scale Used to indicate the scale between the pyramid levels. 
+ * This is required to be a non-zero positive value. + * @param[in] width The width of the 2D tensor at 0th pyramid level + * @param[in] height The height of the 2D tensor at 0th pyramid level + * @param[in] format The format of all 2D tensors in the pyramid + * NV12, NV21, IYUV, UYVY and YUYV formats are not supported. + */ + PyramidInfo(size_t num_levels, float scale, size_t width, size_t height, Format format); + + /** Create pyramid info using TensorShape + * + * @param[in] num_levels The number of pyramid levels. This is required to be a non-zero value + * @param[in] scale Used to indicate the scale between the pyramid levels. + * This is required to be a non-zero positive value. + * @param[in] tensor_shape It specifies the size for each dimension of the tensor 0th pyramid level in number of elements + * @param[in] format The format of all tensors in the pyramid + */ + PyramidInfo(size_t num_levels, float scale, const TensorShape &tensor_shape, Format format); + /** Initialize pyramid's metadata for 2D tensors * * @param[in] num_levels The number of pyramid levels. This is required to be a non-zero value diff --git a/arm_compute/core/TensorInfo.h b/arm_compute/core/TensorInfo.h index c0fbc2aca..b1f7db0ad 100644 --- a/arm_compute/core/TensorInfo.h +++ b/arm_compute/core/TensorInfo.h @@ -280,19 +280,27 @@ public: { return _total_size; } + /** Padding of tensor. + * + * @return Padding. + */ + PaddingSize padding() const + { + return _padding; + } /** Checks if the tensor has been allocated with padding or not. * * @return True if padding is allocated in the tensor, otherwise false. */ - bool has_padding() + bool has_padding() const { - return (this->total_size() != (this->tensor_shape().total_size() * this->element_size())); + return !_padding.empty(); } /** Flag indicating whether the size of the tensor can be changed. * * @return True if the tensor size can be changed. */ - bool is_resizable() + bool is_resizable() const { return _is_resizable; } @@ -332,6 +340,7 @@ private: Format _format; bool _is_resizable; ValidRegion _valid_region; + PaddingSize _padding; }; } #endif /*__ARM_COMPUTE_TENSORINFO_H__ */ diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h index a6a74a82a..a4fa43513 100644 --- a/arm_compute/core/Types.h +++ b/arm_compute/core/Types.h @@ -448,7 +448,6 @@ public: LINEAR /**< Linear */ }; -public: /** Default Constructor * * @param[in] f The activation function to use. @@ -456,7 +455,7 @@ public: * (@ref ActivationFunction::BOUNDED_RELU, @ref ActivationFunction::LINEAR, @ref ActivationFunction::TANH). * @param[in] b (Optional) The beta parameter used by some activation functions (@ref ActivationFunction::LINEAR, @ref ActivationFunction::TANH). */ - ActivationLayerInfo(ActivationFunction f, uint32_t a = 0, uint32_t b = 0) + ActivationLayerInfo(ActivationFunction f, float a = 0.0f, float b = 0.0f) : _act(f), _a(a), _b(b) { } @@ -464,19 +463,19 @@ public: { return _act; } - uint32_t a() const + float a() const { return _a; } - uint32_t b() const + float b() const { return _b; } private: ActivationFunction _act; - uint32_t _a; - uint32_t _b; + float _a; + float _b; }; /** Normalization Layer Information class */ @@ -491,7 +490,7 @@ public: * @param[in] beta Beta parameter used by normalization equation. Defaults to 0.5. * @param[in] kappa Kappa parameter used by [Krichevksy 2012] Across Channel Local Brightness Normalization equation. 
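Worth noting in the Types.h hunk above: the a and b activation parameters changed from uint32_t to float, so fractional values no longer truncate. For example:

    // LINEAR activation f(x) = a*x + b with a fractional slope, representable now:
    arm_compute::ActivationLayerInfo act_info(
        arm_compute::ActivationLayerInfo::ActivationFunction::LINEAR, 0.5f, 1.0f);
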
*/ - NormalizationLayerInfo(NormType type, uint32_t norm_size = 5, float alpha = 0.0001, float beta = 0.5, uint32_t kappa = 1.f) + NormalizationLayerInfo(NormType type, uint32_t norm_size = 5, float alpha = 0.0001f, float beta = 0.5f, float kappa = 1.f) : _type(type), _norm_size(norm_size), _alpha(alpha), _beta(beta), _kappa(kappa) { } @@ -511,7 +510,7 @@ public: { return _beta; } - uint32_t kappa() const + float kappa() const { return _kappa; } @@ -530,5 +529,47 @@ private: float _beta; float _kappa; }; + +/** IO formatting information class*/ +struct IOFormatInfo +{ + /** Precision type used when printing floating point numbers */ + enum class PrecisionType + { + Default, /**< Default precision to the one that the current stream has */ + Custom, /**< Custom precision specified by the user using the precision parameter */ + Full /**< The maximum precision of the floating point representation */ + }; + + /** Specifies the area to be printed, used by Tensor objects */ + enum class PrintRegion + { + ValidRegion, /**< Prints the valid region of the Tensor object */ + NoPadding, /**< Prints the Tensor object without the padding */ + Full /**< Print the tensor object including padding */ + }; + + IOFormatInfo(PrintRegion print_region = PrintRegion::ValidRegion, + PrecisionType precision_type = PrecisionType::Default, + unsigned int precision = 10, + bool align_columns = true, + std::string element_delim = " ", + std::string row_delim = "\n") + : print_region(print_region), + precision_type(precision_type), + precision(precision), + element_delim(element_delim), + row_delim(row_delim), + align_columns(align_columns) + { + } + + PrintRegion print_region; + PrecisionType precision_type; + unsigned int precision; + std::string element_delim; + std::string row_delim; + bool align_columns; +}; } #endif /* __ARM_COMPUTE_TYPES_H__ */ diff --git a/arm_compute/core/Utils.h b/arm_compute/core/Utils.h index 9378ef6ed..3ebf3ff47 100644 --- a/arm_compute/core/Utils.h +++ b/arm_compute/core/Utils.h @@ -629,5 +629,72 @@ inline bool is_data_type_float(DataType dt) return false; } } + +/** Print consecutive elements to an output stream. + * + * @param[out] s Output stream to print the elements to. + * @param[in] ptr Pointer to print the elements from. + * @param[in] n Number of elements to print. + * @param[in] stream_width (Optional) Width of the stream. If set to 0 the element's width is used. Defaults to 0. + * @param[in] element_delim (Optional) Delimeter among the consecutive elements. Defaults to space delimeter + */ +template +void print_consecutive_elements_impl(std::ostream &s, const T *ptr, unsigned int n, int stream_width = 0, const std::string &element_delim = " ") +{ + for(unsigned int i = 0; i < n; ++i) + { + // Set stream width as it is not a "sticky" stream manipulator + if(stream_width != 0) + { + s.width(stream_width); + } + s << std::right << ptr[i] << element_delim; + } +} + +/** Identify the maximum width of n consecutive elements. + * + * @param[in] s The output stream which will be used to print the elements. Used to extract the stream format. + * + * @param ptr Pointer to the elements. + * @param n Number of elements. + * + * @return The maximum width of the elements. 
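The IOFormatInfo struct above pairs with the new ITensor::print() declared earlier in this patch. A usage sketch, assuming tensor is any allocated arm_compute tensor:

    #include <iostream>

    // Print everything, padding included, with a custom 4-digit precision.
    arm_compute::IOFormatInfo fmt(arm_compute::IOFormatInfo::PrintRegion::Full,
                                  arm_compute::IOFormatInfo::PrecisionType::Custom,
                                  4 /* precision */);
    tensor.print(std::cout, fmt);
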
+ */ +template +int max_consecutive_elements_display_width_impl(std::ostream &s, const T *ptr, unsigned int n) +{ + int max_width = -1; + for(unsigned int i = 0; i < n; ++i) + { + std::stringstream ss; + ss.copyfmt(s); + ss << ptr[i]; + max_width = std::max(max_width, ss.str().size()); + } + return max_width; +} + +/** Print consecutive elements to an output stream. + * + * @param[out] s Output stream to print the elements to. + * @param[in] dt Data type of the elements + * @param[in] ptr Pointer to print the elements from. + * @param[in] n Number of elements to print. + * @param[in] stream_width (Optional) Width of the stream. If set to 0 the element's width is used. Defaults to 0. + * @param[in] element_delim (Optional) Delimeter among the consecutive elements. Defaults to space delimeter + */ +void print_consecutive_elements(std::ostream &s, DataType dt, const uint8_t *ptr, unsigned int n, int stream_width, const std::string &element_delim = " "); + +/** Identify the maximum width of n consecutive elements. + * + * @param[in] s Output stream to print the elements to. + * @param[in] dt Data type of the elements + * @param[in] ptr Pointer to print the elements from. + * @param[in] n Number of elements to print. + * + * @return The maximum width of the elements. + */ +int max_consecutive_elements_display_width(std::ostream &s, DataType dt, const uint8_t *ptr, unsigned int n); } #endif /*__ARM_COMPUTE_UTILS_H__ */ diff --git a/arm_compute/core/Validate.h b/arm_compute/core/Validate.h index a07d9d99a..5f1c54124 100644 --- a/arm_compute/core/Validate.h +++ b/arm_compute/core/Validate.h @@ -243,7 +243,7 @@ void error_on_data_type_not_in(const char *function, const char *file, const int }), function, file, line, "ITensor data type %s not supported by this kernel", string_from_data_type(tensor_dt).c_str()); } -#define ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(d, ...) ::arm_compute::error_on_data_type_not_in(__func__, __FILE__, __LINE__, d, __VA_ARGS__) +#define ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(t, ...) ::arm_compute::error_on_data_type_not_in(__func__, __FILE__, __LINE__, t, __VA_ARGS__) /** Throw an error if the data type or the number of channels of the passed tensor does not match any of the data types and number of channels provided. * @@ -266,7 +266,7 @@ void error_on_data_type_channel_not_in(const char *function, const char *file, c ARM_COMPUTE_ERROR_ON_LOC_MSG(tensor_nc != num_channels, function, file, line, "Number of channels %d. Required number of channels %d", tensor_nc, num_channels); } -#define ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(d, c, ...) ::arm_compute::error_on_data_type_channel_not_in(__func__, __FILE__, __LINE__, d, c, __VA_ARGS__) +#define ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c, ...) ::arm_compute::error_on_data_type_channel_not_in(__func__, __FILE__, __LINE__, t, c, __VA_ARGS__) /** Throw an error if the tensor is not 2D. 
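The macro parameter renames above (d to t) are cosmetic, clarifying that the first argument is the tensor being validated rather than a data type; call sites keep the same shape:

    // Typical guard at the top of a kernel's configure():
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1,
                                                  arm_compute::DataType::U8,
                                                  arm_compute::DataType::F32);
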
* diff --git a/arm_compute/runtime/CL/CLFunctions.h b/arm_compute/runtime/CL/CLFunctions.h index fa501728d..51f93f521 100644 --- a/arm_compute/runtime/CL/CLFunctions.h +++ b/arm_compute/runtime/CL/CLFunctions.h @@ -25,7 +25,6 @@ #define __ARM_COMPUTE_CLFUNCTIONS_H__ /* Header regrouping all the CL functions */ - #include "arm_compute/runtime/CL/functions/CLAbsoluteDifference.h" #include "arm_compute/runtime/CL/functions/CLAccumulate.h" #include "arm_compute/runtime/CL/functions/CLActivationLayer.h" diff --git a/arm_compute/runtime/CL/functions/CLCannyEdge.h b/arm_compute/runtime/CL/functions/CLCannyEdge.h index 09b8b5500..e5a82b226 100644 --- a/arm_compute/runtime/CL/functions/CLCannyEdge.h +++ b/arm_compute/runtime/CL/functions/CLCannyEdge.h @@ -27,6 +27,7 @@ #include "arm_compute/runtime/IFunction.h" #include "arm_compute/core/CL/kernels/CLCannyEdgeKernel.h" +#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" #include "arm_compute/runtime/CL/CLTensor.h" #include @@ -69,6 +70,7 @@ public: private: std::unique_ptr _sobel; /**< Pointer to Sobel kernel. */ CLGradientKernel _gradient; /**< Gradient kernel. */ + CLFillBorderKernel _border_mag_gradient; /**< Fill border on magnitude tensor kernel */ CLEdgeNonMaxSuppressionKernel _non_max_suppr; /**< Non-Maxima suppression kernel. */ CLEdgeTraceKernel _edge_trace; /**< Edge tracing kernel. */ CLImage _gx; /**< Source tensor - Gx component. */ diff --git a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h index 1edbdbc95..09e4fc927 100644 --- a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h +++ b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h @@ -24,57 +24,68 @@ #ifndef __ARM_COMPUTE_CLFULLYCONNECTEDLAYER_H__ #define __ARM_COMPUTE_CLFULLYCONNECTEDLAYER_H__ +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h" #include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h" +#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h" +#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h" +#include "arm_compute/core/CL/kernels/CLIm2ColKernel.h" #include "arm_compute/core/CL/kernels/CLTransposeKernel.h" -#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h" -#include "arm_compute/runtime/CL/functions/CLGEMM.h" +#include "arm_compute/runtime/CL/CLTensor.h" namespace arm_compute { /** Basic function to compute a Fully Connected layer on OpenCL. This function calls the following OpenCL kernels: - * -# @ref CLConvolutionLayer (called when the weights have 4 dimensions. Pass the stride as 1 and padding as 0) - * -# @ref CLGEMM (called when the weights have 2 dimensions) - * -# @ref CLTransposeKernel (called when the weights have 2 dimensions) - * -# @ref CLGEMMMatrixAccumulateBiasesKernel (called when the weights have 2 dimensions) * - * @note The fully connected layer accepts "weights" tensors only with 2 or 4 dimensions. 
In particular, the weights tensor has 4 dimensions, - * if the fully connected layer is computed after a convolution layer otherwise the tensor has 2 dimensions if the fully connected layer - * is computed after another fully connected layer + * -# @ref CLIm2ColKernel (called when the input comes from a convolutional layer) + * -# @ref CLTransposeKernel (if @p transpose_weights is set to true) (called once) + * -# @ref NEGEMMTranspose1xWKernel (called once if we have a multi-batch input) + * -# @ref NEGEMMInterleave4x4Kernel (called if we have a multi-batch input) + * -# @ref NEGEMMMatrixMultiplyKernel + * -# @ref CLGEMMMatrixAccumulateBiasesKernel (if @p biases is not equal to nullptr) + * + * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. */ class CLFullyConnectedLayer : public IFunction { public: - /**Constructor */ + /** Constructor */ CLFullyConnectedLayer(); /** Set the input and output tensors. * - * @param[in, out] input Source tensor. Data type supported: F16, F32. (Written to only if @ref CLGEMM needs to pad with zeros the tensor) - * @param[in, out] weights Weights tensor. The weights can be 2 dimensional or 4 dimensional. Data type supported: Same as @p input. (Written to only if @ref CLGEMM needs to pad with zeros the tensor) - * @param[in] biases Bias tensor. Data type supported:Same as @p input. - * @param[out] output Destination tensor. Data type supported: Same as @p input. + * @param[in] input Source tensor. Data type supported: F16, F32. + * @param[in] weights Weights tensor. The weights must be 2 dimensional. Data type supported: Same as @p input + * @param[in] biases Bias tensor. It can be nullptr. Data type supported:Same as @p input. + * @param[out] output Destination tensor. Data type supported: Same as @p input. + * @param[in] transpose_weights (Optional) Transpose weights if true. Defaults to true. 
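A hedged usage sketch of the reworked CLFullyConnectedLayer interface described above (tensor allocation elided, names illustrative):

    arm_compute::CLFullyConnectedLayer fc;

    // Weights stay 2D; with transpose_weights left at its default (true),
    // the layer transposes them once on the first run().
    fc.configure(&input, &weights, &biases, &output);
    fc.run();
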
*/ - void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *biases, ICLTensor *output); + void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose_weights = true); //Inherited methods override void run() override; private: - /** Run the convolution layer connect to fully connected layer case */ - void run_conv(); - /** Run the fully connected layer connect to fully connected layer case */ - void run_fc(); - /** Common signature for the functions to run */ - using FullyConnectedLayerFunction = void (CLFullyConnectedLayer::*)(void); + void configure_fc_fc_wb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output); + void configure_fc_fc_nb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output); + void configure_conv_fc_wb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output); + void configure_conv_fc_nb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output); -private: - CLConvolutionLayer _conv_function; - CLGEMM _gemm_function; + CLIm2ColKernel _im2col_kernel; CLTransposeKernel _transpose_kernel; - CLGEMMMatrixAccumulateBiasesKernel _acc_biases_kernel; - FullyConnectedLayerFunction _run_func; - CLTensor _weights_transpose; + CLGEMMTranspose1xWKernel _transpose1xW_kernel; + CLGEMMInterleave4x4Kernel _interleave4x4_kernel; + CLGEMMMatrixMultiplyKernel _mm_kernel; + CLGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel; + CLTensor _im2col_output; + CLTensor _interleave4x4_output; + CLTensor _transpose_output; + CLTensor _transpose1xW_output; bool _is_first_run; - bool _run_acc_biases; + bool _transpose_weights; + bool _fc_after_conv; + bool _batched_fc_layer; + bool _accumulate_biases; }; } #endif /* __ARM_COMPUTE_CLFULLYCONNECTEDLAYER_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLGaussianPyramid.h b/arm_compute/runtime/CL/functions/CLGaussianPyramid.h index d7f53c1e0..97935193d 100644 --- a/arm_compute/runtime/CL/functions/CLGaussianPyramid.h +++ b/arm_compute/runtime/CL/functions/CLGaussianPyramid.h @@ -51,6 +51,12 @@ public: CLGaussianPyramid(const CLGaussianPyramid &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ CLGaussianPyramid &operator=(const CLGaussianPyramid &) = delete; + /** Allow instances of this class to be moved */ + CLGaussianPyramid(CLGaussianPyramid &&) = default; + /** Allow instances of this class to be moved */ + CLGaussianPyramid &operator=(CLGaussianPyramid &&) = default; + /** Default destructor */ + virtual ~CLGaussianPyramid() = default; /** Initialise the function's source, destinations and border mode. * * @param[in, out] input Source tensor. Data types supported: U8. 
diff --git a/arm_compute/runtime/NEON/NEFunctions.h b/arm_compute/runtime/NEON/NEFunctions.h
index d1dd15045..ef17599d0 100644
--- a/arm_compute/runtime/NEON/NEFunctions.h
+++ b/arm_compute/runtime/NEON/NEFunctions.h
@@ -50,7 +50,9 @@
 #include "arm_compute/runtime/NEON/functions/NEFillBorder.h"
 #include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMM.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMLowp.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h"
 #include "arm_compute/runtime/NEON/functions/NEGaussian3x3.h"
 #include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h"
 #include "arm_compute/runtime/NEON/functions/NEGaussianPyramid.h"
diff --git a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
index abc0b6cef..a6862cae0 100644
--- a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
@@ -83,7 +83,6 @@ private:
     Tensor _gemm_output;
     bool   _is_first_run;
     bool   _has_bias;
-    bool   _is_fc;
 };
}
#endif /* __ARM_COMPUTE_NECONVOLUTIONLAYER_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEFastCorners.h b/arm_compute/runtime/NEON/functions/NEFastCorners.h
index d2006d868..d7c31750c 100644
--- a/arm_compute/runtime/NEON/functions/NEFastCorners.h
+++ b/arm_compute/runtime/NEON/functions/NEFastCorners.h
@@ -27,7 +27,6 @@
 #include "arm_compute/core/NEON/kernels/NEFastCornersKernel.h"
 #include "arm_compute/core/NEON/kernels/NEFillArrayKernel.h"
 #include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h"
 #include "arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/Array.h"
@@ -44,7 +43,6 @@ using IImage = ITensor;

 /** Basic function to execute fast corners.
  * This function calls the following NEON kernels:
  *
  * -# @ref NEFastCornersKernel
- * -# @ref NEFillInnerBorderKernel (executed if nonmax_suppression == true)
  * -# @ref NENonMaximaSuppression3x3Kernel (executed if nonmax_suppression == true)
  * -# @ref NEFillArrayKernel
  *
@@ -74,7 +72,6 @@ private:
     NEFillBorderKernel              _border_handler;
     NENonMaximaSuppression3x3Kernel _nonmax_kernel;
     NEFillArrayKernel               _fill_kernel;
-    NEFillInnerBorderKernel         _out_border_handler_kernel;
     Image                           _output;
     Image                           _suppressed;
     bool                            _non_max;
diff --git a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
index cdd72e5f9..69e27b83d 100644
--- a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
@@ -24,22 +24,27 @@
 #ifndef __ARM_COMPUTE_NEFULLYCONNECTEDLAYER_H__
 #define __ARM_COMPUTE_NEFULLYCONNECTEDLAYER_H__

+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+#include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h"
 #include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
-#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
+#include "arm_compute/runtime/Tensor.h"

 namespace arm_compute
 {
 /** Basic function to compute a Fully Connected layer on NEON. This function calls the following NEON kernels:
- * -# @ref NEConvolutionLayer (called when the weights have 4 dimensions. Pass the stride as 1 and padding as 0)
- * -# @ref NEGEMM (called when the weights have 2 dimensions)
- * -# @ref NETransposeKernel (called when the weights have 2 dimensions)
- * -# @ref NEGEMMMatrixAccumulateBiasesKernel (called when the weights have 2 dimensions)
+ * -# @ref NEIm2ColKernel (called when the input comes from a convolutional layer)
+ * -# @ref NETransposeKernel (if @p transpose_weights flag is set to true) (called once)
+ * -# @ref NEGEMMTranspose1xWKernel (called once if we have a multi-batch input)
+ * -# @ref NEGEMMInterleave4x4Kernel (called if we have a multi-batch input)
+ * -# @ref NEGEMMMatrixMultiplyKernel
+ * -# @ref NEGEMMMatrixAccumulateBiasesKernel (if @p biases is not equal to nullptr)
  *
- * @note The fully connected layer accepts "weights" tensors only with 2 or 4 dimensions. In particular, the weights tensor has 4 dimensions,
- * if the fully connected layer is computed after a convolution layer otherwise the tensor has 2 dimensions if the fully connected layer
- * is computed after another fully connected layer
+ * @note The fully connected layer accepts "weights" tensors only with 2 dimensions.
  */
 class NEFullyConnectedLayer : public IFunction
 {
@@ -48,33 +53,38 @@ public:
     NEFullyConnectedLayer();
     /** Set the input and output tensors.
      *
-     * @param[in, out] input   Source tensor. Data type supported: F32. (Written to only if @ref NEGEMM needs to pad with zeros the tensor)
-     * @param[in, out] weights Weights tensor. The weights can be 2 dimensional or 4 dimensional. Data type supported: Same as @p input. (Written to only if @ref NEGEMM needs to pad with zeros the tensor)
-     * @param[in]      biases  Bias tensor. Data type supported:Same as @p input.
-     * @param[out]     output  Destination tensor. Data type supported: Same as @p input.
+     * @param[in]  input             Source tensor. Data type supported: F32.
+     * @param[in]  weights           Weights tensor. The weights must be 2 dimensional. Data type supported: Same as @p input.
+     * @param[in]  biases            Bias tensor. Can be nullptr. Data type supported: Same as @p input.
+     * @param[out] output            Destination tensor. Data type supported: Same as @p input.
+     * @param[in]  transpose_weights (Optional) Transpose weights if true. Defaults to true.
      */
-    void configure(ITensor *input, ITensor *weights, const ITensor *biases, ITensor *output);
+    void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose_weights = true);

     // Inherited methods overridden:
     void run() override;

 private:
-    /** Run the convolution layer connect to fully connected layer case */
-    void run_conv();
-    /** Run the fully connected layer connect to fully connected layer case */
-    void run_fc();
-    /** Common signature for the functions to run */
-    using FullyConnectedLayerFunctionPtr = void (NEFullyConnectedLayer::*)(void);
+    void configure_fc_fc_wb(const ITensor *input, const ITensor *weights, ITensor *output);
+    void configure_fc_fc_nb(const ITensor *input, const ITensor *weights, ITensor *output);
+    void configure_conv_fc_wb(const ITensor *input, const ITensor *weights, ITensor *output);
+    void configure_conv_fc_nb(const ITensor *input, const ITensor *weights, ITensor *output);

-private:
-    NEConvolutionLayer                 _conv_function;
-    NEGEMM                             _gemm_function;
+    NEIm2ColKernel                     _im2col_kernel;
     NETransposeKernel                  _transpose_kernel;
-    NEGEMMMatrixAccumulateBiasesKernel _acc_biases_kernel;
-    FullyConnectedLayerFunctionPtr     _run_func;
-    Tensor                             _weights_transposed;
+    NEGEMMTranspose1xWKernel           _transpose1xW_kernel;
+    NEGEMMInterleave4x4Kernel          _interleave4x4_kernel;
+    NEGEMMMatrixMultiplyKernel         _mm_kernel;
+    NEGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel;
+    Tensor                             _im2col_output;
+    Tensor                             _interleave4x4_output;
+    Tensor                             _transpose_output;
+    Tensor                             _transpose1xW_output;
     bool                               _is_first_run;
-    bool                               _run_acc_biases;
+    bool                               _transpose_weights;
+    bool                               _fc_after_conv;
+    bool                               _batched_fc_layer;
+    bool                               _accumulate_biases;
 };
}
#endif /* __ARM_COMPUTE_NEFULLYCONNECTEDLAYER_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h b/arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h
new file mode 100644
index 000000000..71fefbf5e
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMMINTERLEAVE4X4_H__
+#define __ARM_COMPUTE_NEGEMMINTERLEAVE4X4_H__
+
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute NEGEMMInterleave4x4Kernel. This function calls the following NEON kernel:
+ *
+ *  -# @ref NEGEMMInterleave4x4Kernel
+ *
+ */
+class NEGEMMInterleave4x4 : public INESimpleFunction
+{
+public:
+    /** Initialise the kernel's input and output
+     *
+     * @param[in]  input  First input tensor. Data types supported: U8/S8/U16/S16/F16/U32/S32/F32
+     * @param[out] output Output tensor. Data type supported: same as @p input
+     */
+    void configure(const ITensor *input, ITensor *output);
+};
+}
+#endif /*__ARM_COMPUTE_NEGEMMINTERLEAVE4X4_H__ */
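As with the other simple functions, usage is just configure-then-run. A minimal sketch with an assumed 8x8 F32 matrix; the output shape (width x4, height /4, reflecting how 4 rows are packed together) is an assumption for illustration, not taken from the patch:

@code{.cpp}
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void interleave_sketch()
{
    Tensor input, output;
    input.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::F32));
    // Values of 4 consecutive rows are interleaved together (assumed shape).
    output.allocator()->init(TensorInfo(TensorShape(32U, 2U), 1, DataType::F32));

    NEGEMMInterleave4x4 interleave;
    interleave.configure(&input, &output);

    input.allocator()->allocate();
    output.allocator()->allocate();

    // ... fill input, then:
    interleave.run();
}
@endcode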
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h b/arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h
new file mode 100644
index 000000000..69096fb6a
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMMTRANSPOSE1XW_H__
+#define __ARM_COMPUTE_NEGEMMTRANSPOSE1XW_H__
+
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+namespace arm_compute
+{
+/** Basic function to execute NEGEMMTranspose1xWKernel. This function calls the following NEON kernel:
+ *
+ *  -# @ref NEGEMMTranspose1xWKernel
+ *
+ */
+class NEGEMMTranspose1xW : public INESimpleFunction
+{
+public:
+    /** Initialise the kernel's input and output
+     *
+     * @param[in]  input  First input tensor. Data type supported: F32, F16, U8.
+     * @param[out] output Output tensor. Data type supported: same as @p input
+     */
+    void configure(const ITensor *input, ITensor *output);
+};
+}
+#endif /*__ARM_COMPUTE_NEGEMMTRANSPOSE1XW_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEGaussianPyramid.h b/arm_compute/runtime/NEON/functions/NEGaussianPyramid.h
index 9b5b1c9fa..5f0a67ea0 100644
--- a/arm_compute/runtime/NEON/functions/NEGaussianPyramid.h
+++ b/arm_compute/runtime/NEON/functions/NEGaussianPyramid.h
@@ -44,12 +44,19 @@ class ITensor;
 class NEGaussianPyramid : public IFunction
 {
 public:
-    /**Constructor */
+    /** Default constructor */
     NEGaussianPyramid();
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEGaussianPyramid(const NEGaussianPyramid &) = delete;
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEGaussianPyramid &operator=(const NEGaussianPyramid &) = delete;
+    /** Allow instances of this class to be moved */
+    NEGaussianPyramid(NEGaussianPyramid &&) = default;
+    /** Allow instances of this class to be moved */
+    NEGaussianPyramid &operator=(NEGaussianPyramid &&) = default;
+    /** Default destructor */
+    virtual ~NEGaussianPyramid() = default;
+
     /** Initialise the function's source, destinations and border mode.
      *
      * @param[in] input Source tensor. Data type supported: U8.
diff --git a/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h b/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h
index 82e015d86..a457f961b 100644
--- a/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h
+++ b/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h
@@ -63,6 +63,7 @@ private:
     NELogits1DShiftExpSumKernel _shift_exp_sum_kernel;
     NELogits1DNormKernel        _norm_kernel;
     NEFillBorderKernel          _fill_border_kernel;
+    NEFillBorderKernel          _fill_border_kernel_sum;
     Tensor                      _max;
     Tensor                      _sum;
     Tensor                      _tmp;
diff --git a/docs/Doxyfile b/docs/Doxyfile
index c3400cf4a..b78ea3013 100644
--- a/docs/Doxyfile
+++ b/docs/Doxyfile
@@ -38,7 +38,7 @@ PROJECT_NAME           = "ARM Compute Library"
 # could be handy for archiving the generated documentation or if some version
 # control system is used.

-PROJECT_NUMBER         = 17.03.1
+PROJECT_NUMBER         = 17.04

 # Using the PROJECT_BRIEF tag one can provide an optional one line description
 # for a project that appears at the top of each page and should give viewer a
diff --git a/docs/arm_compute.dox b/docs/arm_compute.dox
index 81f82a36c..45799b268 100644
--- a/docs/arm_compute.dox
+++ b/docs/arm_compute.dox
@@ -92,28 +92,72 @@ You should have the following file organisation:
     └── test_helpers --> Boiler plate code used by examples
         └── Utils.h

-@section S2_versions_changelog Versions changelog
-
-@note There will be one major public release with new features per quarter. All releases in between will only contain bug fixes.
-
-v16.12 (Binary release)
-    - Original release
+@section S2_versions_changelog Release versions and changelog
+
+@subsection S2_1_versions Release versions
+
+All releases are numbered vYY.MM, where YY is the last two digits of the year and MM the month number.
+If there is more than one release in a month then an extra sequential number is appended at the end:
+
+    v17.03 (First release of March 2017)
+    v17.03.1 (Second release of March 2017)
+    v17.04 (First release of April 2017)
+
+@note We aim to release one major public version with new features per quarter. All releases in between will only contain bug fixes.
+
+@subsection S2_2_changelog Changelog
+
+v17.04 Public bug fixes release
+ The following kernels have been ported to use the new accurate padding:
+  - @ref CLColorConvertKernel
+  - @ref CLEdgeNonMaxSuppressionKernel
+  - @ref CLEdgeTraceKernel
+  - @ref CLGaussianPyramidHorKernel
+  - @ref CLGaussianPyramidVertKernel
+  - @ref CLGradientKernel
+  - @ref NEChannelCombineKernel
+  - @ref NEFillArrayKernel
+  - @ref NEGaussianPyramidHorKernel
+  - @ref NEGaussianPyramidVertKernel
+  - @ref NEHarrisScoreFP16Kernel
+  - @ref NEHarrisScoreKernel
+  - @ref NEHOGDetectorKernel
+  - @ref NELogits1DMaxKernel
+  - @ref NELogits1DShiftExpSumKernel
+  - @ref NELogits1DNormKernel
+  - @ref NENonMaximaSuppression3x3FP16Kernel
+  - @ref NENonMaximaSuppression3x3Kernel
+
+v17.03.1 First major public release of the sources
+ - Renamed the library to arm_compute
+ - New CPP target introduced for C++ kernels shared between NEON and CL functions.
+ - New padding calculation interface introduced and ported most kernels / functions to use it.
+ - New OpenCL kernels / functions:
+   - @ref CLGEMMLowpMatrixMultiplyKernel / @ref CLGEMMLowp
+ - New NEON kernels / functions:
+   - @ref NENormalizationLayerKernel / @ref NENormalizationLayer
+   - @ref NETransposeKernel / @ref NETranspose
+   - @ref NELogits1DMaxKernel, @ref NELogits1DShiftExpSumKernel, @ref NELogits1DNormKernel / @ref NESoftmaxLayer
+   - @ref NEIm2ColKernel, @ref NECol2ImKernel, @ref NEConvolutionLayerWeightsReshapeKernel / @ref NEConvolutionLayer
+   - @ref NEGEMMMatrixAccumulateBiasesKernel / @ref NEFullyConnectedLayer
+   - @ref NEGEMMLowpMatrixMultiplyKernel / @ref NEGEMMLowp

-v17.02 (Sources)
+v17.03 Sources preview
 - New OpenCL kernels / functions:
-   - @ref CLActivationLayerKernel / @ref CLActivationLayer
-   - @ref CLChannelCombineKernel / @ref CLChannelCombine
-   - @ref CLDerivativeKernel / @ref CLChannelExtract
-   - @ref CLFastCornersKernel / @ref CLFastCorners
-   - @ref CLMeanStdDevKernel / @ref CLMeanStdDev
+   - @ref CLGradientKernel, @ref CLEdgeNonMaxSuppressionKernel, @ref CLEdgeTraceKernel / @ref CLCannyEdge
+   - GEMM refactoring + FP16 support: @ref CLGEMMInterleave4x4Kernel, @ref CLGEMMTranspose1xWKernel, @ref CLGEMMMatrixMultiplyKernel, @ref CLGEMMMatrixAdditionKernel / @ref CLGEMM
+   - @ref CLGEMMMatrixAccumulateBiasesKernel / @ref CLFullyConnectedLayer
+   - @ref CLTransposeKernel / @ref CLTranspose
+   - @ref CLLKTrackerInitKernel, @ref CLLKTrackerStage0Kernel, @ref CLLKTrackerStage1Kernel, @ref CLLKTrackerFinalizeKernel / @ref CLOpticalFlow
+   - @ref CLNormalizationLayerKernel / @ref CLNormalizationLayer
+   - @ref CLLaplacianPyramid, @ref CLLaplacianReconstruct
 - New NEON kernels / functions:
-   - HOG / SVM: @ref NEHOGOrientationBinningKernel, @ref NEHOGBlockNormalizationKernel, @ref NEHOGDetectorKernel, @ref NEHOGNonMaximaSuppressionKernel / @ref NEHOGDescriptor, @ref NEHOGDetector, @ref NEHOGGradient, @ref NEHOGMultiDetection
-   - @ref NENonLinearFilterKernel / @ref NENonLinearFilter
-   - Introduced a CLScheduler to manage the default context and command queue used by the runtime library and create synchronisation events.
-   - Switched all the kernels / functions to use tensors instead of images.
-   - Updated documentation to include instructions to build the library from sources.
+   - @ref NEActivationLayerKernel / @ref NEActivationLayer
+   - GEMM refactoring + FP16 support (Requires armv8.2 CPU): @ref NEGEMMInterleave4x4Kernel, @ref NEGEMMTranspose1xWKernel, @ref NEGEMMMatrixMultiplyKernel, @ref NEGEMMMatrixAdditionKernel / @ref NEGEMM
+   - @ref NEPoolingLayerKernel / @ref NEPoolingLayer

-v17.02.1 (Sources)
+v17.02.1 Sources preview
 - New OpenCL kernels / functions:
   - @ref CLLogits1DMaxKernel, @ref CLLogits1DShiftExpSumKernel, @ref CLLogits1DNormKernel / @ref CLSoftmaxLayer
   - @ref CLPoolingLayerKernel / @ref CLPoolingLayer
@@ -127,34 +171,22 @@ v17.02.1 (Sources)
   - @ref NEBox3x3FP16Kernel
   - @ref NENonMaximaSuppression3x3FP16Kernel

-v17.03 (Sources)
+v17.02 Sources preview
 - New OpenCL kernels / functions:
-   - @ref CLGradientKernel, @ref CLEdgeNonMaxSuppressionKernel, @ref CLEdgeTraceKernel / @ref CLCannyEdge
-   - GEMM refactoring + FP16 support: @ref CLGEMMInterleave4x4Kernel, @ref CLGEMMTranspose1xWKernel, @ref CLGEMMMatrixMultiplyKernel, @ref CLGEMMMatrixAdditionKernel / @ref CLGEMM
-   - @ref CLGEMMMatrixAccumulateBiasesKernel / @ref CLFullyConnectedLayer
-   - @ref CLTransposeKernel / @ref CLTranspose
-   - @ref CLLKTrackerInitKernel, @ref CLLKTrackerStage0Kernel, @ref CLLKTrackerStage1Kernel, @ref CLLKTrackerFinalizeKernel / @ref CLOpticalFlow
-   - @ref CLNormalizationLayerKernel / @ref CLNormalizationLayer
-   - @ref CLLaplacianPyramid, @ref CLLaplacianReconstruct
-   - New NEON kernels / functions:
-   - @ref NEActivationLayerKernel / @ref NEActivationLayer
-   - GEMM refactoring + FP16 support (Requires armv8.2 CPU): @ref NEGEMMInterleave4x4Kernel, @ref NEGEMMTranspose1xWKernel, @ref NEGEMMMatrixMultiplyKernel, @ref NEGEMMMatrixAdditionKernel / @ref NEGEMM
-   - @ref NEPoolingLayerKernel / @ref NEPoolingLayer
-
-v17.03.1 (Sources)
-   - Renamed the library to arm_compute
-   - New CPP target introduced for C++ kernels shared between NEON and CL functions.
-   - New padding calculation interface introduced and ported most kernels / functions to use it.
-   - New OpenCL kernels / functions:
-   - @ref CLGEMMLowpMatrixMultiplyKernel / @ref CLGEMMLowp
+   - @ref CLActivationLayerKernel / @ref CLActivationLayer
+   - @ref CLChannelCombineKernel / @ref CLChannelCombine
+   - @ref CLDerivativeKernel / @ref CLChannelExtract
+   - @ref CLFastCornersKernel / @ref CLFastCorners
+   - @ref CLMeanStdDevKernel / @ref CLMeanStdDev
 - New NEON kernels / functions:
-   - @ref NENormalizationLayerKernel / @ref NENormalizationLayer
-   - @ref NETransposeKernel / @ref NETranspose
-   - @ref NELogits1DMaxKernel, @ref NELogits1DShiftExpSumKernel, @ref NELogits1DNormKernel / @ref NESoftmaxLayer
-   - @ref NEIm2ColKernel @ref NECol2ImKernel @ref NEConvolutionLayerWeightsReshapeKernel / @ref NEConvolutionLayer
-   - @ref NEGEMMMatrixAccumulateBiasesKernel / @ref NEFullyConnectedLayer
-   - @ref NEGEMMLowpMatrixMultiplyKernel / @ref NEGEMMLowp
+   - HOG / SVM: @ref NEHOGOrientationBinningKernel, @ref NEHOGBlockNormalizationKernel, @ref NEHOGDetectorKernel, @ref NEHOGNonMaximaSuppressionKernel / @ref NEHOGDescriptor, @ref NEHOGDetector, @ref NEHOGGradient, @ref NEHOGMultiDetection
+   - @ref NENonLinearFilterKernel / @ref NENonLinearFilter
+   - Introduced a CLScheduler to manage the default context and command queue used by the runtime library and create synchronisation events.
+   - Switched all the kernels / functions to use tensors instead of images.
+   - Updated documentation to include instructions to build the library from sources.
+v16.12 Binary preview release
+ - Original release

@section S3_how_to_build How to build the library and the examples

To see the build options available simply run ```scons -h```:

               default: 0
               actual: 0

-    arch: Target Architecture (default=armv7a) (armv7a|arm64-v8a|arm64-v8.2-a|x86)
+    arch: Target Architecture (default=armv7a) (armv7a|arm64-v8a|arm64-v8.2-a|x86_32|x86_64)
               default: armv7a
               actual: armv7a

@@ -199,12 +231,24 @@ To see the build options available simply run ```scons -h```:
               default: 0
               actual: 0

+    scheduler: Scheduler backend (Default=cpp) (cpp|pthread|openmp)
+              default: cpp
+              actual: cpp
+
+    set_soname: Set the library's soname and shlibversion (Requires SCons 2.4 or above) (yes|no)
+              default: 0
+              actual: False
+
+    extra_cxx_flags: Extra CXX flags to be appended to the build command
+              default:
+              actual:
+
Debug / asserts:
 - With debug=1 asserts are enabled, and the library is built with symbols and no optimisations enabled.
 - With debug=0 and asserts=1: Optimisations are enabled and symbols are removed, however all the asserts are still present (This is about 20% slower than the release build)
 - With debug=0 and asserts=0: All optimisations are enabled and no validation is performed, if the application misuses the library it is likely to result in a crash. (Only use this mode once you are sure your application is working as expected).

-Architecture: The x86 target can only be used with neon=0 and opencl=1.
+Architecture: The x86_32 and x86_64 targets can only be used with neon=0 and opencl=1.

OS: Choose the operating system you are targeting: Linux, Android or bare metal.
@note bare metal can only be used for NEON (not OpenCL), only static libraries get built and NEON's multi-threading support is disabled.

Build type: you can either build directly on your device (native) or cross compile from your desktop machine (cross-compile).

Werror: If you are compiling using the same toolchains as the ones used in this guide then there shouldn't be any warning, and therefore you should be able to keep Werror=1. If the library fails to build with a different compiler version because of warnings interpreted as errors then, provided you are sure the warnings are not important, you might want to try building with Werror=0 (but please do report the issue, either on Github or by email to developer@arm.com, so that it can be addressed).

-OpenCL / NEON: Choose which SIMD technology you are interested targeting. (NEON for ARM Cortex-A CPUs or OpenCL for ARM Mali GPUs)
+OpenCL / NEON: Choose which SIMD technology you want to target. (NEON for ARM Cortex-A CPUs or OpenCL for ARM Mali GPUs)

embed_kernels: For OpenCL only: set embed_kernels=1 if you want the OpenCL kernels to be built into the library's binaries instead of being read from separate ".cl" files. If embed_kernels is set to 0 then the application can set the path to the folder containing the OpenCL kernel files by calling CLKernelLibrary::init(). By default the path is set to "./cl_kernels".

+set_soname: Do you want to build the versioned version of the library?
+If enabled the library will contain a SONAME and SHLIBVERSION and some symlinks will automatically be created between the objects.
+Example:
+    libarm_compute_core.so -> libarm_compute_core.so.1.0.0
+    libarm_compute_core.so.1 -> libarm_compute_core.so.1.0.0
+    libarm_compute_core.so.1.0.0
+
+@note This option is disabled by default as it requires SCons version 2.4 or above.
+
+extra_cxx_flags: Custom CXX flags which will be appended to the end of the build command.
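As an illustration of how the new options compose with the existing ones, a hypothetical invocation enabling the OpenMP scheduler and the versioned library (requires SCons 2.4 or above) could look like:

    scons Werror=1 -j8 debug=0 neon=1 opencl=0 os=linux arch=armv7a scheduler=openmp set_soname=1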
+
@subsection S3_2_linux Linux

@subsubsection S3_2_1_library How to build the library ?

To cross-compile the library in asserts mode, with OpenCL only support, for Linux 64bit:

    scons Werror=1 -j8 debug=0 asserts=1 neon=0 opencl=1 embed_kernels=1 os=linux arch=arm64-v8a

+You can also compile the library natively on an ARM device by using build=native:
+
+    scons Werror=1 -j8 debug=0 neon=1 opencl=0 os=linux arch=arm64-v8a build=native
+    scons Werror=1 -j8 debug=0 neon=1 opencl=0 os=linux arch=armv7a build=native
+
+@note G++ for ARM is mono-arch, therefore if you want to compile for Linux 32bit on a Linux 64bit platform you will have to use a cross compiler.
+
+For example, on a 64bit Debian based system you would have to install g++-arm-linux-gnueabihf
+
+    apt-get install g++-arm-linux-gnueabihf
+
+Then run
+
+    scons Werror=1 -j8 debug=0 neon=1 opencl=0 os=linux arch=armv7a build=cross_compile
+
+or simply remove the build parameter, as build=cross_compile is the default value:
+
+    scons Werror=1 -j8 debug=0 neon=1 opencl=0 os=linux arch=armv7a
+
+@attention To cross compile with opencl=1 you need to make sure you have a version of libOpenCL matching your target architecture.
+
@subsubsection S3_2_2_examples How to manually build the examples ?

The examples get automatically built by scons as part of the build process of the library described above. This section just describes how you can build and link your own application against our library.
@@ -447,7 +523,9 @@ The previous section shows how to run a NEON / CPP kernel in the current thread,

 @snippet src/runtime/CPP/CPPScheduler.cpp Scheduler example

-This is the very basic implementation used in the NEON runtime library by all the NEON functions, @sa CPPScheduler.
+This is the very basic implementation used in the NEON runtime library by all the NEON functions.
+
+@sa CPPScheduler.

@note Some kernels like for example @ref NEHistogramKernel need some local temporary buffer to perform their calculations. In order to avoid memory corruption between threads, the local buffer must be of size: ```memory_needed_per_thread * num_threads``` and each subwindow must be initialised by calling @ref Window::set_thread_id() with a unique thread_id between 0 and num_threads.

@@ -529,7 +607,7 @@ There are different ways padding can be calculated:

 @note It's important to call allocate @b after the function is configured: if the image / tensor is already allocated then the function will shrink its execution window instead of increasing the padding. (See below for more details).

-- Manual padding / no padding / auto padding: You can allocate your images / tensors up front (before configuring your functions), in that case the function will use whatever padding is available and will shrink its execution window if there isn't enough padding available (Which will translates into a smaller valid region for the output @sa valid_region).
+- Manual padding / no padding / auto padding: You can allocate your images / tensors up front (before configuring your functions); in that case the function will use whatever padding is available and will shrink its execution window if there isn't enough padding available (which translates into a smaller valid region for the output. See also @ref valid_region).
If you don't want to manually set the padding but still want to allocate your objects upfront then you can use auto_padding.
@code{.cpp}
@@ -564,36 +642,18 @@ Some kernels (like edge detectors for example) need to read values of neighbouring pixels

Another case: if a kernel processes 8 pixels per iteration and the image's dimensions are not a multiple of 8, then, if not enough padding is available, the kernel will not be able to process the pixels near the right edge. As a result those pixels will be left undefined.

-In order to know which pixels have been calculated, each kernel sets a valid region for each output image or tensor @sa TensorInfo::valid_region(), @ref ValidRegion
+In order to know which pixels have been calculated, each kernel sets a valid region for each output image or tensor (a sketch of how to query it follows the list below). See also @ref TensorInfo::valid_region(), @ref ValidRegion

@attention Valid regions and accurate padding have only been introduced in the library recently, therefore not all the kernels and functions have been ported to use them yet. All the non-ported kernels will set the @ref ValidRegion equal to the @ref TensorShape.

List of kernels which haven't been ported yet:

-- @ref CLColorConvertKernel
-- @ref CLEdgeNonMaxSuppressionKernel
-- @ref CLEdgeTraceKernel
-- @ref CLGaussianPyramidHorKernel
-- @ref CLGaussianPyramidVertKernel
-- @ref CLGradientKernel
-- @ref NEChannelCombineKernel
 - @ref NEColorConvertKernel
-- @ref NEFillArrayKernel
-- @ref NEGaussianPyramidHorKernel
-- @ref NEGaussianPyramidVertKernel
-- @ref NEHarrisScoreFP16Kernel
-- @ref NEHarrisScoreKernel
 - @ref NEHistogramKernel
 - @ref NEHistogramBorderKernel
 - @ref NEHOGBlockNormalizationKernel
-- @ref NEHOGDetectorKernel
 - @ref NEHOGOrientationBinningKernel
-- @ref NELogits1DMaxKernel
-- @ref NELogits1DShiftExpSumKernel
-- @ref NELogits1DNormKernel
 - @ref NELKTrackerKernel
-- @ref NENonMaximaSuppression3x3FP16Kernel
-- @ref NENonMaximaSuppression3x3Kernel
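To make the valid-region bookkeeping concrete, here is a minimal sketch of querying it on an output tensor; the tensor name is illustrative, not part of the patch:

@code{.cpp}
// After running a function, ask the output which pixels hold defined values.
const ValidRegion region = output.info()->valid_region();
const int    first_valid_x = region.anchor[0]; // first valid column
const size_t valid_width   = region.shape[0];  // number of valid columns
@endcode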
@subsubsection S4_6_2_tensors Tensors
@@ -617,36 +677,27 @@ Each kernel specifies the expected layout of each of its tensors in its documentation.

@note Unless specified otherwise in the kernel's or function's documentation the number of channels for tensors is expected to be 1 (For images, the number of channels is inferred from the @ref Format).

-@subsubsection S4_6_4_working_with_objects Working with Images and Tensors
+@attention Regardless of the @ref DataType used by a tensor the @ref ITensor::buffer() method will always return a uint8_t pointer, and all the metadata in @ref TensorInfo will be expressed in bytes. It is the user's responsibility to cast the pointer to the correct type.

-In the case that no padding exists in the Image/Tensor object you can linearize the object memory and directly copy to/from it.
-@code{.cpp}
-// Create a tensor object
-Tensor tensor;
-// Operate on tensor
-...
-// Copy results
-unsigned char *dst = ... // Your unpadded destination buffer
-// Copy tensor as a linear bulk of memory if no padding exists
-if(!tensor.info()->has_padding())
-{
-    std::copy_n(tensor.buffer(), tensor.info()->total_size(), dst);
-}
-@endcode
+For example, to read the element located at the coordinates (x,y) of a float tensor:

-On the other hand, in case of padding, each row should be carefully copied separately.
 @code{.cpp}
-// Create an image object
-Image img;
-// Initialize image
-const unsigned char *src = ... // Your unpadded input buffer
-// Initialize the Image object using an RGB source image
-for(unsigned int y = 0; y < height; ++y)
-{
-    // Copy one RGB row at a time
-    std::copy_n(img.buffer() + img.info()->offset_element_in_bytes(Coordinates(0, y)), width * 3, src + (y * width) * 3);
-}
+float value = *reinterpret_cast<float *>(input.buffer() + input.info()->offset_element_in_bytes(Coordinates(x, y)));
 @endcode

+@subsubsection S4_6_4_working_with_objects Working with Images and Tensors using iterators
+
+The library provides some iterators to access objects' data.
+Iterators are created by associating a data object (an image or a tensor, for example) with an iteration window.
+
+Iteration windows are defined by an array of dimensions, each of which is made of a start, an end and a step.
+
+The @ref execute_window_loop function takes an execution window, a lambda function and one or more iterators.
+It will iterate through every element of the execution window, and for each element it will update the iterators accordingly and call the lambda function.
+
+Here are a couple of examples of how to use the iterators to fill / read tensors:
+
+@snippet examples/neon_copy_objects.cpp Copy objects example
+
 */
}
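To complement the referenced snippet, here is a minimal sketch of the iterator pattern; it assumes an already allocated 2D F32 tensor and uses only the Window / Iterator / execute_window_loop API described above (the function name is illustrative):

@code{.cpp}
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void fill_with_ones(Tensor &tensor) // assumed: allocated, DataType::F32
{
    Window window;
    window.use_tensor_dimensions(tensor.info()); // one window dimension per tensor dimension

    Iterator it(&tensor, window);
    execute_window_loop(window, [&](const Coordinates &)
    {
        // it.ptr() points at the current element as uint8_t*; cast to the element type.
        *reinterpret_cast<float *>(it.ptr()) = 1.0f;
    },
    it);
}
@endcode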
diff --git a/documentation (regenerated Doxygen pages)
The commit also regenerates the rendered Doxygen HTML under documentation/ (the _access_window_auto_padding_8h, _access_window_static_8h and _access_window_transpose_8h pages and their source listings). These pages only mirror the header changes shown above — the set_valid_region -> compute_valid_region rename — and the 17.03.1 -> 17.04 version bump, so the rendered HTML is omitted here.