From: Jiseob Jang (장지섭)/On-Device Lab(SR)/Engineer/Samsung Electronics
Date: Mon, 2 Dec 2019 01:56:03 +0000 (+0900)
Subject: Make acl_cl backend support InstanceNorm op (#9301)
X-Git-Tag: submit/tizen/20191205.083104~63
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=da94579487e17a9004d9778dd2550418888e82fd;p=platform%2Fcore%2Fml%2Fnnfw.git

Make acl_cl backend support InstanceNorm op (#9301)

This commit makes the acl_cl backend support the InstanceNorm op.

Signed-off-by: jiseob.jang <jiseob.jang@samsung.com>
---
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h
new file mode 100644
index 0000000..f5e147e
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNELEX_H__
+#define __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNELEX_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for performing an instance normalization */
+class CLInstanceNormalizationLayerKernelEx : public ICLKernel
+{
+public:
+  /** Constructor */
+  CLInstanceNormalizationLayerKernelEx();
+  /** Prevent instances of this class from being copied (as this class contains pointers) */
+  CLInstanceNormalizationLayerKernelEx(const CLInstanceNormalizationLayerKernelEx &) = delete;
+  /** Prevent instances of this class from being copied (as this class contains pointers) */
+  CLInstanceNormalizationLayerKernelEx &
+  operator=(const CLInstanceNormalizationLayerKernelEx &) = delete;
+  /** Default move constructor */
+  CLInstanceNormalizationLayerKernelEx(CLInstanceNormalizationLayerKernelEx &&) = default;
+  /** Default move assignment operator */
+  CLInstanceNormalizationLayerKernelEx &
+  operator=(CLInstanceNormalizationLayerKernelEx &&) = default;
+  /** Default destructor */
+  ~CLInstanceNormalizationLayerKernelEx() = default;
+
+  /** Set the input and output tensors.
+   *
+   * @param[in, out] input   Source tensor. In case of @p output tensor = nullptr this tensor will
+   *                         store the result of the normalization.
+   *                         Data types supported: F16/F32. Data layout supported: NHWC, NCHW
+   * @param[out]     output  Destination tensor. Data types and data layouts supported: same as @p
+   *                         input.
+   * @param[in]      gamma   (Optional) The scale tensor applied to the normalized tensor.
+   *                         Defaults to nullptr
+   * @param[in]      beta    (Optional) The offset tensor applied to the normalized tensor.
+   *                         Defaults to nullptr
+   * @param[in]      epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
+   */
+  void configure(ICLTensor *input, ICLTensor *output, ICLTensor *gamma = nullptr,
+                 ICLTensor *beta = nullptr, float epsilon = 1e-12f);
+
+  /** Static function to check if given info will lead to a valid configuration of @ref
+   * CLInstanceNormalizationLayerKernelEx.
+   *
+   * @param[in] input   Source tensor info. In case of @p output tensor = nullptr this tensor will
+   *                    store the result of the normalization.
+   *                    Data types supported: F16/F32. Data layout supported: NHWC, NCHW
+   * @param[in] output  Destination tensor info. Data types and data layouts supported: same as @p
+   *                    input.
+   * @param[in] gamma   (Optional) The scale tensor applied to the normalized tensor. Defaults to
+   *                    nullptr
+   * @param[in] beta    (Optional) The offset tensor applied to the normalized tensor. Defaults to
+   *                    nullptr
+   * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
+   *
+   * @return a status
+   */
+  static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+                         const ITensorInfo *gamma = nullptr, const ITensorInfo *beta = nullptr,
+                         float epsilon = 1e-12f);
+
+  // Inherited methods overridden:
+  void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+  ICLTensor *_input;
+  ICLTensor *_output;
+  ICLTensor *_gamma;
+  ICLTensor *_beta;
+  float _epsilon;
+  bool _run_in_place;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNELEX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h
index 5fbbb25..831bb54 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h
@@ -25,6 +25,7 @@
 #include 
 #include 
 #include 
+#include <arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h>
 #include 
 #include 
 #include 
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h
new file mode 100644
index 0000000..ed29db9
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYEREX_H__
+#define __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYEREX_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to perform an instance normalization.
+ *
+ * This function runs the following kernels:
+ * -# @ref CLInstanceNormalizationLayerKernelEx
+ */
+class CLInstanceNormalizationLayerEx : public ICLSimpleFunction
+{
+public:
+  /** Default constructor */
+  CLInstanceNormalizationLayerEx();
+  /** Set the input and output tensors.
+   *
+   * @param[in, out] input   Source tensor. In case of @p output tensor = nullptr this tensor will
+   *                         store the result of the normalization.
+   *                         Data types supported: F16/F32. Data layout supported: NHWC, NCHW
+   * @param[out]     output  Destination tensor. Data types and data layouts supported: same as @p
+   *                         input.
+   * @param[in]      gamma   (Optional) The scale tensor applied to the normalized tensor.
+   *                         Defaults to nullptr
+   * @param[in]      beta    (Optional) The offset tensor applied to the normalized tensor.
+   *                         Defaults to nullptr
+   * @param[in]      epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
+   */
+  void configure(ICLTensor *input, ICLTensor *output, ICLTensor *gamma = nullptr,
+                 ICLTensor *beta = nullptr, float epsilon = 1e-12f);
+
+  /** Static function to check if given info will lead to a valid configuration of @ref
+   * CLInstanceNormalizationLayerEx.
+   *
+   * @param[in] input   Source tensor info. Data types supported: F16/F32. Data layout supported:
+   *                    NHWC, NCHW
+   * @param[in] output  Destination tensor info. Data types and data layouts supported: same as @p
+   *                    input.
+   * @param[in] gamma   (Optional) The scale tensor applied to the normalized tensor. Defaults to
+   *                    nullptr
+   * @param[in] beta    (Optional) The offset tensor applied to the normalized tensor. Defaults to
+   *                    nullptr
+   * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
+   *
+   * @return a status
+   */
+  static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+                         const ITensorInfo *gamma = nullptr, const ITensorInfo *beta = nullptr,
+                         float epsilon = 1e-12f);
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYEREX_H__ */
diff --git a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
index 8081256..7d47606 100644
--- a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
+++ b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
@@ -54,6 +54,7 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map
     {"gather_ex_1d", "gather_ex.cl"},
     {"gather_ex_1d_out", "gather_ex.cl"},
     {"hashtable_lookup", "hashtable_lookup.cl"},
+    {"instance_normalization_ex", "instance_normalization_ex.cl"},
     {"neg_tensor", "neg_tensor.cl"},
     {"permute_generic", "permute_ex.cl"},
     {"pixelwise_mul_qasymm8", "pixelwise_mul_quantized.cl"},
@@ -111,6 +112,10 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map
 #include "./cl_kernels/helpers_asymm.hembed"
     },
     {
+        "instance_normalization_ex.cl",
+#include "./cl_kernels/instance_normalization_ex.clembed"
+    },
+    {
         "binary_logical_op.cl",
 #include "./cl_kernels/binary_logical_op.clembed"
     },
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl
new file mode 100644
index 0000000..1d96150
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl
@@ -0,0 +1,251 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(EPSILON) && defined(DIM_X) && \
+    defined(DIM_Y) && defined(DIM_Z)
+/** This function normalizes the input 2D tensor across the first dimension with respect to mean
+ * and standard deviation of the same dimension.
+ *
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ *            -DVEC_SIZE=16
+ * @attention Data type should be passed using the -DDATA_TYPE=data_type compile flag, e.g.
+ *            -DDATA_TYPE=float
+ * @attention Normalization epsilon parameter should be given as a preprocessor argument with
+ *            -DEPSILON=value. e.g. -DEPSILON=0.001f
+ * @attention Dimensions X, Y, and Z should be given as a preprocessor argument with -DDIM_X=value,
+ *            -DDIM_Y=value, -DDIM_Z=value. e.g. -DDIM_X=6, -DDIM_Y=2, -DDIM_Z=7
+ *
+ * @param[in]  input_ptr                            Pointer to the first source tensor. Supported
+ *                                                  data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the first source tensor in X
+ *                                                  dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the first source tensor in Y
+ *                                                  dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the first source tensor in Z
+ *                                                  dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z
+ *                                                  processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first
+ *                                                  source tensor
+ * @param[out] output_ptr                           (Optional) Pointer to the destination tensor.
+ *                                                  Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      (Optional) Stride of the destination tensor in
+ *                                                  X dimension (in bytes)
+ * @param[in]  output_step_x                        (Optional) output_stride_x * number of elements
+ *                                                  along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      (Optional) Stride of the destination tensor in
+ *                                                  Y dimension (in bytes)
+ * @param[in]  output_step_y                        (Optional) output_stride_y * number of elements
+ *                                                  along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      (Optional) Stride of the destination tensor in
+ *                                                  Z dimension (in bytes)
+ * @param[in]  output_step_z                        (Optional) output_stride_z * number of elements
+ *                                                  along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes (Optional) The offset of the first element in
+ *                                                  the destination tensor
+ * @param[in]  gamma_ptr                            (Optional) Pointer to the gamma tensor.
+ *                                                  Supported data types: same as @p input_ptr
+ * @param[in]  gamma_stride_x                       (Optional) Stride of the gamma tensor in X
+ *                                                  dimension (in bytes)
+ * @param[in]  gamma_step_x                         (Optional) gamma_stride_x * number of elements
+ *                                                  along X processed per workitem(in bytes)
+ * @param[in]  gamma_offset_first_element_in_bytes  (Optional) The offset of the first element in
+ *                                                  the gamma tensor
+ * @param[in]  beta_ptr                             (Optional) Pointer to the beta tensor.
+ *                                                  Supported data types: same as @p input_ptr
+ * @param[in]  beta_stride_x                        (Optional) Stride of the beta tensor in X
+ *                                                  dimension (in bytes)
+ * @param[in]  beta_step_x                          (Optional) beta_stride_x * number of elements
+ *                                                  along X processed per workitem(in bytes)
+ * @param[in]  beta_offset_first_element_in_bytes   (Optional) The offset of the first element in
+ *                                                  the beta tensor
+ */
+__kernel void instance_normalization_ex(TENSOR4D_DECLARATION(input),
+#ifndef IN_PLACE
+                                        TENSOR4D_DECLARATION(output)
+#endif /* IN_PLACE */
+#ifdef GAMMA
+                                        ,
+                                        VECTOR_DECLARATION(gamma)
+#endif // GAMMA
+#ifdef BETA
+                                        ,
+                                        VECTOR_DECLARATION(beta)
+#endif // BETA
+                                        )
+{
+  Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+#ifndef IN_PLACE
+  Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+#endif /* IN_PLACE */
+
+  float sum = 0.f;
+  float sum_sq = 0.f;
+
+#if defined(NHWC)
+
+  const int ch = get_global_id(0);    // Current channel
+  const int batch = get_global_id(2); // Current batch
+  const int elements_plane = DIM_Y * DIM_Z;
+
+  for (int i_w = 0; i_w < DIM_Y; ++i_w)
+  {
+    for (int i_h = 0; i_h < DIM_Z; ++i_h)
+    {
+      float data = (float)*((__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch));
+      sum += data;
+      sum_sq += data * data;
+    }
+  }
+
+#else // !defined(NHWC)
+  const int ch = get_global_id(2) % DIM_Z;    // Current channel
+  const int batch = get_global_id(2) / DIM_Z; // Current batch
+  const int elements_plane = DIM_X * DIM_Y;
+
+  VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+  part_sum = 0.f;
+  VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+  part_sum_sq = 0.f;
+  // Calculate partial sum
+  for (int y = 0; y < DIM_Y; ++y)
+  {
+    int x = 0;
+    for (; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE)
+    {
+      // Load data
+      VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+      data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch));
+      part_sum += data;
+      part_sum_sq += data * data;
+    }
+    // Left-overs loop
+    for (; x < DIM_X; ++x)
+    {
+      DATA_TYPE data = *((__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch));
+      part_sum.s0 += data;
+      part_sum_sq.s0 += data * data;
+    }
+  }
+// Perform reduction
+#if VEC_SIZE > 8
+  part_sum.s01234567 += part_sum.s89abcdef;
+  part_sum_sq.s01234567 += part_sum_sq.s89abcdef;
+#endif // VEC_SIZE > 8
+#if VEC_SIZE > 4
+  part_sum.s0123 += part_sum.s4567;
+  part_sum_sq.s0123 += part_sum_sq.s4567;
+#endif // VEC_SIZE > 4
+#if VEC_SIZE > 2
+  part_sum.s01 += part_sum.s23;
+  part_sum_sq.s01 += part_sum_sq.s23;
+#endif // VEC_SIZE > 2
+  part_sum.s0 += part_sum.s1;
+  part_sum_sq.s0 += part_sum_sq.s1;
+
+  sum = (float)part_sum.s0;
+  sum_sq = (float)part_sum_sq.s0;
+
+#endif // defined(NHWC)
+
+  const float mean_float = (sum / elements_plane);
+  const DATA_TYPE mean = (DATA_TYPE)mean_float;
+  const float var_float = (sum_sq / elements_plane) - (mean_float * mean_float);
+#if defined(GAMMA)
+  const float multip_float = *((__global DATA_TYPE *)gamma_ptr + ch) / sqrt(var_float + EPSILON);
+  const DATA_TYPE multip = (DATA_TYPE)multip_float;
+#else  // !defined(GAMMA)
+  const DATA_TYPE multip = (DATA_TYPE)0;
+#endif // defined(GAMMA)
+#if defined(BETA)
+  const DATA_TYPE beta = *((__global DATA_TYPE *)beta_ptr + ch);
+#else  // !defined(BETA)
+  const DATA_TYPE beta = 0;
+#endif // defined(BETA)
+
+#if defined(NHWC)
+
+  for (int i_w = 0; i_w < DIM_Y; ++i_w)
+  {
+    for (int i_h = 0; i_h < DIM_Z; ++i_h)
+    {
+      __global DATA_TYPE *input_address =
+          (__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch);
+#ifdef IN_PLACE
+      __global DATA_TYPE *output_address = input_address;
+#else  /* !IN_PLACE */
+      __global DATA_TYPE *output_address =
+          (__global DATA_TYPE *)tensor4D_offset(&out, ch, i_w, i_h, batch);
+#endif /* IN_PLACE */
+      *(output_address) = (*(input_address)-mean) * multip + beta;
+    }
+  }
+
+#else // !defined(NHWC)
+  for (int y = 0; y < DIM_Y; ++y)
+  {
+    int x = 0;
+    for (; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE)
+    {
+      __global DATA_TYPE *input_address =
+          (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch);
+#ifdef IN_PLACE
+      __global DATA_TYPE *output_address = input_address;
+#else  /* !IN_PLACE */
+      __global DATA_TYPE *output_address =
+          (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch);
+#endif /* IN_PLACE */
+
+      VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+      data = VLOAD(VEC_SIZE)(0, input_address);
+
+      VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+      res = (data - mean) * multip + beta;
+      VSTORE(VEC_SIZE)
+      (res, 0, output_address);
+    }
+    // Left-overs loop
+    for (; x < DIM_X; ++x)
+    {
+      __global DATA_TYPE *input_address =
+          (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch);
+#ifdef IN_PLACE
+      __global DATA_TYPE *output_address = input_address;
+#else  /* !IN_PLACE */
+      __global DATA_TYPE *output_address =
+          (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch);
+#endif /* IN_PLACE */
+      *(output_address) = (*(input_address)-mean) * multip + beta;
+    }
+  }
+#endif // defined(NHWC)
+}
+#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(EPSILON) && defined(DIM_X) && \
+          defined(DIM_Y) && defined(DIM_Z) */
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp
new file mode 100644
index 0000000..5db414f
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Window.h"
+
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+                          const ITensorInfo *gamma, const ITensorInfo *beta, float epsilon)
+{
+  ARM_COMPUTE_UNUSED(gamma);
+  ARM_COMPUTE_UNUSED(beta);
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(epsilon == 0.f, "Epsilon must be different than 0");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
+
+  if (output != nullptr && output->total_size() != 0)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(),
+                                    "Input and output have different number of channels");
+  }
+
+  return Status{};
+}
+
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+  // We handle the planes manually
+  Window win = calculate_max_window(*input, Steps(1));
+
+  // Output auto initialization if not yet initialized
+  auto_init_if_empty(*output, input->tensor_shape(), 1, input->data_type());
+
+  // CLInstanceNormalizationLayerKernelEx doesn't need padding so update_window_and_padding() can
+  // be skipped
+  Coordinates coord;
+  coord.set_num_dimensions(output->num_dimensions());
+  output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
+  return std::make_pair(Status{}, win);
+}
+} // namespace
+
+CLInstanceNormalizationLayerKernelEx::CLInstanceNormalizationLayerKernelEx()
+    : _input(nullptr), _output(nullptr), _gamma(nullptr), _beta(nullptr), _epsilon(1e-12),
+      _run_in_place(false)
+{
+}
+
+void CLInstanceNormalizationLayerKernelEx::configure(ICLTensor *input, ICLTensor *output,
+                                                     ICLTensor *gamma, ICLTensor *beta,
+                                                     float epsilon)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
+  _input = input;
+  _output = output == nullptr ? input : output;
+  _gamma = gamma;
+  _beta = beta;
+  _epsilon = epsilon;
+
+  _run_in_place = (output == nullptr) || (output == input);
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(_input->info(), _output->info(),
+                                                gamma ? gamma->info() : nullptr,
+                                                beta ? beta->info() : nullptr, epsilon));
+  const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+
+  CLBuildOptions build_opts;
+  build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+  build_opts.add_option("-DVEC_SIZE=" +
+                        support::cpp11::to_string(num_elems_processed_per_iteration));
+  build_opts.add_option("-DDIM_X=" + support::cpp11::to_string(input->info()->dimension(0)));
+  build_opts.add_option("-DDIM_Y=" + support::cpp11::to_string(input->info()->dimension(1)));
+  build_opts.add_option("-DDIM_Z=" + support::cpp11::to_string(input->info()->dimension(2)));
+  build_opts.add_option("-DEPSILON=" + float_to_string_with_full_precision(epsilon));
+  build_opts.add_option_if(gamma, "-DGAMMA");
+  build_opts.add_option_if(beta, "-DBETA");
+  build_opts.add_option_if(_run_in_place, "-DIN_PLACE");
+  build_opts.add_option_if(_input->info()->data_layout() == DataLayout::NHWC, "-DNHWC");
+
+  // Create kernel
+  _kernel = static_cast<cl::Kernel>(
+      CLKernelLibraryEx::get().create_kernel("instance_normalization_ex", build_opts.options()));
+
+  // Configure kernel window
+  auto win_config = validate_and_configure_window(_input->info(), _output->info());
+  ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+  ICLKernel::configure_internal(std::get<1>(win_config));
+}
+
+Status CLInstanceNormalizationLayerKernelEx::validate(const ITensorInfo *input,
+                                                      const ITensorInfo *output,
+                                                      const ITensorInfo *gamma,
+                                                      const ITensorInfo *beta, float epsilon)
+{
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, gamma, beta, epsilon));
+  ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(
+      input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get()))));
+  return Status{};
+}
+
+void CLInstanceNormalizationLayerKernelEx::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+  Window collapsed_window = window.collapse(window, Window::DimZ);
+
+  // We will process the planes together
+  if (_input->info()->data_layout() == DataLayout::NCHW)
+  {
+    collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1));
+    collapsed_window.set(Window::DimY, Window::Dimension(0, 1, 1));
+  }
+  else
+  {
+    collapsed_window.set(Window::DimY, Window::Dimension(0, 1, 1));
+    collapsed_window.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(3), 1));
+  }
+
+  Window vec_window;
+  vec_window.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+  unsigned int idx = 0;
+  add_4D_tensor_argument(idx, _input, collapsed_window);
+  if (!_run_in_place)
+  {
+    add_4D_tensor_argument(idx, _output, collapsed_window);
+  }
+  if (_gamma)
+  {
+    add_1D_tensor_argument(idx, _gamma, vec_window);
+  }
+  if (_beta)
+  {
+    add_1D_tensor_argument(idx, _beta, vec_window);
+  }
+
+  enqueue(queue, *this, collapsed_window, lws_hint());
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp
new file mode 100644
index 0000000..86ea5a6
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h"
+
+#include "arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+CLInstanceNormalizationLayerEx::CLInstanceNormalizationLayerEx() {}
+
+void CLInstanceNormalizationLayerEx::configure(ICLTensor *input, ICLTensor *output,
+                                               ICLTensor *gamma, ICLTensor *beta, float epsilon)
+{
+  auto k = arm_compute::support::cpp14::make_unique<CLInstanceNormalizationLayerKernelEx>();
+  k->configure(input, output, gamma, beta, epsilon);
+  _kernel = std::move(k);
+}
+
+Status CLInstanceNormalizationLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                                const ITensorInfo *gamma, const ITensorInfo *beta,
+                                                float epsilon)
+{
+  return CLInstanceNormalizationLayerKernelEx::validate(input, output, gamma, beta, epsilon);
+}
+} // namespace arm_compute
diff --git a/runtime/neurun/backend/acl_cl/KernelGenerator.cc b/runtime/neurun/backend/acl_cl/KernelGenerator.cc
index 6f91551..dae98fe 100644
--- a/runtime/neurun/backend/acl_cl/KernelGenerator.cc
+++ b/runtime/neurun/backend/acl_cl/KernelGenerator.cc
@@ -843,6 +843,32 @@ void KernelGenerator::visit(const model::operation::Exp &node)
   _execution_builder->append(std::move(acl_fn));
 }
 
+void KernelGenerator::visit(const model::operation::InstanceNorm &node)
+{
+  const auto ofm_index{node.getOutputs().at(0)};
+  const auto ifm_index{node.getInputs().at(model::operation::InstanceNorm::Input::INPUT)};
+  const auto gamma_index{node.getInputs().at(model::operation::InstanceNorm::Input::GAMMA)};
+  const auto beta_index{node.getInputs().at(model::operation::InstanceNorm::Input::BETA)};
+
+  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+  auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+  auto gamma_alloc = _tensor_builder->at(gamma_index).get();
+  auto beta_alloc = _tensor_builder->at(beta_index).get();
+  auto epsilon = node.param().epsilon;
+  auto activation = node.param().activation;
+
+  auto fn = nnfw::cpp14::make_unique<::arm_compute::CLInstanceNormalizationLayerEx>();
+
+  fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), gamma_alloc->handle(),
+                beta_alloc->handle(), epsilon);
+
+  auto acl_fn = asAclFunction(std::move(fn));
+
+  _execution_builder->append(std::move(acl_fn));
+
+  ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle());
+}
+
 void KernelGenerator::visit(const model::operation::Logistic &node)
 {
   const auto ofm_index{node.getOutputs().at(0)};
diff --git a/runtime/neurun/backend/acl_cl/KernelGenerator.h b/runtime/neurun/backend/acl_cl/KernelGenerator.h
index e0741f8..2f8d90b 100644
--- a/runtime/neurun/backend/acl_cl/KernelGenerator.h
+++ b/runtime/neurun/backend/acl_cl/KernelGenerator.h
@@ -56,6 +56,7 @@ public:
   void visit(const model::operation::Cast &) override;
   void visit(const model::operation::Div &) override;
   void visit(const model::operation::Exp &) override;
+  void visit(const model::operation::InstanceNorm &) override;
   void visit(const model::operation::Logistic &) override;
   void visit(const model::operation::ReduceMax &) override;
   void visit(const model::operation::Comparison &) override;
diff --git a/runtime/neurun/backend/acl_cl/ShapeFixer.cc b/runtime/neurun/backend/acl_cl/ShapeFixer.cc
index 4c4878f..d8a8035 100644
--- a/runtime/neurun/backend/acl_cl/ShapeFixer.cc
+++ b/runtime/neurun/backend/acl_cl/ShapeFixer.cc
@@ -181,6 +181,8 @@ void ShapeFixer::visit(const model::operation::Div &node)
 
 void ShapeFixer::visit(const model::operation::Exp &) { /* DO NOTHING */}
 
+void ShapeFixer::visit(const model::operation::InstanceNorm &) { /* DO NOTHING */}
+
 void ShapeFixer::visit(const model::operation::Logistic &) { /* DO NOTHING */}
 
 void ShapeFixer::visit(const model::operation::LogicalAnd &node)
diff --git a/runtime/neurun/backend/acl_cl/ShapeFixer.h b/runtime/neurun/backend/acl_cl/ShapeFixer.h
index 4f91084..b7256f7 100644
--- a/runtime/neurun/backend/acl_cl/ShapeFixer.h
+++ b/runtime/neurun/backend/acl_cl/ShapeFixer.h
@@ -57,6 +57,7 @@ public:
   void visit(const model::operation::Cast &) override;
   void visit(const model::operation::Div &) override;
   void visit(const model::operation::Exp &) override;
+  void visit(const model::operation::InstanceNorm &) override;
  void visit(const model::operation::Logistic &) override;
   void visit(const model::operation::ReduceMax &) override;
   void visit(const model::operation::Comparison &) override;
diff --git a/runtime/neurun/core/src/compiler/ManualScheduler.cc b/runtime/neurun/core/src/compiler/ManualScheduler.cc
index 42c5989..f7d859c 100644
--- a/runtime/neurun/core/src/compiler/ManualScheduler.cc
+++ b/runtime/neurun/core/src/compiler/ManualScheduler.cc
@@ -74,6 +74,7 @@ std::unique_ptr ManualScheduler::schedule(const graph::Graph &g
   std::unordered_map<model::OpCode, backend::Backend *> op_type_map;
   // By default, Custom uses cpu backend
   op_type_map[model::OpCode::Custom] = backend::BackendManager::get().get("cpu");
+
 #define OP(InternalName)                                                                      \
   {                                                                                           \
     const auto &backend_str = util::getConfigString(util::config::OP_BACKEND_##InternalName); \
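
Usage note: the snippet below is a minimal sketch, not part of the patch, showing how client code might drive the new CLInstanceNormalizationLayerEx directly. The tensor shapes, the 1e-5f epsilon, and the main() scaffolding are illustrative assumptions; only the configure()/validate() signatures come from the headers added above. Also worth knowing: when the GAMMA compile-time flag is absent, the kernel sets the scale factor to zero, so callers, like the acl_cl KernelGenerator in this patch, are expected to pass both gamma and beta tensors.

    #include "arm_compute/core/Error.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h"

    using namespace arm_compute;

    int main()
    {
      // Set up the CL scheduler (context, queue, kernel library).
      CLScheduler::get().default_init();

      // Illustrative NCHW shapes: W=64, H=64, C=16, N=1, with per-channel
      // gamma/beta vectors of length C. ACL orders dimensions as (x, y, z, w).
      CLTensor input, output, gamma, beta;
      input.allocator()->init(TensorInfo(TensorShape(64U, 64U, 16U, 1U), 1, DataType::F32));
      output.allocator()->init(TensorInfo(TensorShape(64U, 64U, 16U, 1U), 1, DataType::F32));
      gamma.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));
      beta.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));

      // Validate before configuring; 1e-5f is an arbitrary example epsilon.
      ARM_COMPUTE_ERROR_THROW_ON(CLInstanceNormalizationLayerEx::validate(
          input.info(), output.info(), gamma.info(), beta.info(), 1e-5f));

      CLInstanceNormalizationLayerEx norm;
      norm.configure(&input, &output, &gamma, &beta, 1e-5f);

      input.allocator()->allocate();
      output.allocator()->allocate();
      gamma.allocator()->allocate();
      beta.allocator()->allocate();

      // ... fill input/gamma/beta (e.g. via map()/unmap()) ...

      norm.run();                // enqueues instance_normalization_ex
      CLScheduler::get().sync(); // wait for the result
      return 0;
    }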