From 5cb9a4a4642de09414925f110485a247d10c0e4f Mon Sep 17 00:00:00 2001 From: =?utf8?q?=EC=98=A4=ED=98=95=EC=84=9D/On-Device=20Lab=28SR=29/Staff?= =?utf8?q?=20Engineer/=EC=82=BC=EC=84=B1=EC=A0=84=EC=9E=90?= Date: Wed, 27 Mar 2019 18:53:14 +0900 Subject: [PATCH] Fix cl codes' format (#4820) Fix cl codes' format using c++ format Signed-off-by: Hyeongseok Oh --- .../src/core/CL/cl_kernels/activation_layer_ex.cl | 99 ++++--- .../src/core/CL/cl_kernels/arg_operation.cl | 135 +++++---- .../core/CL/cl_kernels/arithmetic_op_quantized.cl | 193 ++++++++----- .../src/core/CL/cl_kernels/binary_logical_op.cl | 122 ++++---- libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl | 238 +++++++++------ .../src/core/CL/cl_kernels/depth_to_space.cl | 202 +++++++------ .../src/core/CL/cl_kernels/embedding_lookup.cl | 123 +++++--- libs/ARMComputeEx/src/core/CL/cl_kernels/exp.cl | 63 ++-- libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl | 156 ++++++---- .../src/core/CL/cl_kernels/hashtable_lookup.cl | 131 +++++---- .../src/core/CL/cl_kernels/neg_tensor.cl | 37 ++- libs/ARMComputeEx/src/core/CL/cl_kernels/pad.cl | 127 ++++---- .../src/core/CL/cl_kernels/permute_ex.cl | 97 ++++--- .../core/CL/cl_kernels/pixelwise_mul_quantized.cl | 146 ++++++---- libs/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl | 100 ++++--- .../src/core/CL/cl_kernels/prelu_quantized.cl | 124 ++++---- .../src/core/CL/cl_kernels/reduce_operation.cl | 240 ++++++++------- .../src/core/CL/cl_kernels/space_to_batch.cl | 321 +++++++++++++-------- .../src/core/CL/cl_kernels/space_to_depth.cl | 202 +++++++------ .../src/core/CL/cl_kernels/squared_difference.cl | 102 ++++--- libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl | 61 ++-- .../src/core/CL/cl_kernels/topkv2_quicksort.cl | 125 ++++---- .../src/core/CL/cl_kernels/topkv2_radixsort.cl | 182 ++++++------ scripts/command/format-check | 2 +- 24 files changed, 1947 insertions(+), 1381 deletions(-) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/activation_layer_ex.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/activation_layer_ex.cl index f54c7bd..5f73e41 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/activation_layer_ex.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/activation_layer_ex.cl @@ -23,10 +23,7 @@ #define RSQRT_OP(a) DIV_OP(CONST_ONE, sqrt((a))) // Inverse Square-root Activation -inline TYPE rsqrt_op(TYPE x) -{ - return RSQRT_OP(x); -} +inline TYPE rsqrt_op(TYPE x) { return RSQRT_OP(x); } #define ACTIVATION_OP2(op, x) op##_op(x) #define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x) @@ -35,55 +32,75 @@ inline TYPE rsqrt_op(TYPE x) /** This performs an activation function floating point inputs. * - * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time + * @note In order to perform the activation function "in-place", + * the pre-processor -DIN_PLACE must be passed at compile time * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short - * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @note Activation function should be given as a preprocessor argument using -DACT=name. e.g. -DACT=TANH - * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively. + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. + * e.g. -DDATA_TYPE=short + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. + * e.g. -DVEC_SIZE=16 + * @note Activation function should be given as a preprocessor argument using -DACT=name. + * e.g. -DACT=TANH + * @note A, B variables required by some activation functions are set + * using -DA_VAL= and -DB_VAL= respectively. * - * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] input_ptr Pointer to the source image. + * Supported data types: F16/F32 + * @param[in] input_stride_x Stride of the source image in X dimension + * (in bytes) + * @param[in] input_step_x input_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension + * (in bytes) + * @param[in] input_step_y input_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension + * (in bytes) + * @param[in] input_step_z input_stride_z * number of elements + * along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element + * in the source image + * @param[out] output_ptr Pointer to the destination image. + * Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image + * in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image + * in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension + * (in bytes) + * @param[in] output_step_z output_stride_z * number of elements + * along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element + * in the destination image */ -__kernel void activation_layer_ex( - TENSOR3D_DECLARATION(input) +__kernel void activation_layer_ex(TENSOR3D_DECLARATION(input) #ifndef IN_PLACE - , - TENSOR3D_DECLARATION(output) + , + TENSOR3D_DECLARATION(output) #endif /* not IN_PLACE */ -) + ) { - // Get pixels pointer - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + // Get pixels pointer + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); #ifdef IN_PLACE - Tensor3D output = input; + Tensor3D output = input; #else /* IN_PLACE */ - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); #endif /* IN_PLACE */ - // Load data - TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr); + // Load data + TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr); - // Perform activation - data = ACTIVATION_OP(ACT, data); + // Perform activation + data = ACTIVATION_OP(ACT, data); - // Store result - VSTORE(VEC_SIZE) - (data, 0, (__global DATA_TYPE *)output.ptr); + // Store result + VSTORE(VEC_SIZE) + (data, 0, (__global DATA_TYPE *)output.ptr); } #endif /* defined(ACT) */ diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl index a7b3550..2a6dfc9 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl @@ -19,76 +19,95 @@ #if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE) /** Perform arg_max/arg_min * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short - * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16 - * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using - * -DOP_CODE = number. e.g. -DOP_CODE=1 + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. + * e.g. -DDATA_TYPE=short + * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. + * e.g. -DDEPTH_OUT=16 + * @attention Operation type(code) specifying which operation to perform should be passed as + * preprocessor argument using -DOP_CODE = number. e.g. -DOP_CODE=1 * - * @param[in] input_ptr Pointer to the source image. Supported data types: U8/QASYMM8/S8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] input_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[out] output_ptr Pointer to the destination image. Supported data types: U32 - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] input_ptr Pointer to the source image. Supported data + * types: + * U8/QASYMM8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source image in X dimension + * (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension + * (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension + * (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element + * in the source image + * @param[in] input_stride_w Stride of the source tensor in W dimension + * (in bytes) + * @param[in] input_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[out] output_ptr Pointer to the destination image. + * Supported data types: U32 + * @param[in] output_stride_x Stride of the destination image in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension + * (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension + * (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination image * @param[in] axis Axis through which reduction occurs * @param[in] dim Dimension across the axis to be reduced. */ -__kernel void arg_op(TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output), - const int axis, - const int dim) +__kernel void arg_op(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output), const int axis, + const int dim) { - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); - int indices[4] = - { - get_global_id(0), - get_global_id(1), - get_global_id(2) % DEPTH_OUT, - get_global_id(2) / DEPTH_OUT, - }; + int indices[4] = { + get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT, + get_global_id(2) / DEPTH_OUT, + }; - DATA_TYPE value = *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); - DATA_TYPE tval = value; - int idx = 0; - for(int i = 1; i < dim; ++i) - { - indices[axis] = i; + DATA_TYPE value = + *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); + DATA_TYPE tval = value; + int idx = 0; + for (int i = 1; i < dim; ++i) + { + indices[axis] = i; - #if OP_CODE == 1 // ArgMax - value = max(value, *((__global DATA_TYPE *) - tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]))); - #elif OP_CODE == 2 //ArgMin - value = min(value, *((__global DATA_TYPE *) - tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]))); - #else - return; +#if OP_CODE == 1 // ArgMax + value = max(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], + indices[2], indices[3]))); +#elif OP_CODE == 2 // ArgMin + value = min(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], + indices[2], indices[3]))); +#else + return; - #endif +#endif - if(tval!=value) - { - idx = indices[axis]; - tval = value; - } + if (tval != value) + { + idx = indices[axis]; + tval = value; } + } - *((__global uint *)out.ptr) = idx; + *((__global uint *)out.ptr) = idx; } #endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl index 5cd0a43..77e239f 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl @@ -24,7 +24,8 @@ #define SUB(x, y) (x) - (y) #endif /* SATURATE */ -/** Performs a pixelwise addition used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8 +/** Performs a pixelwise addition used to quantize down the int32 accumulator values of GEMMLowp to + * QASYMM8 * * The following computations will be performed: * @@ -36,91 +37,131 @@ * -# Shift the int32 accumulator by result_shift * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8. * - * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: - * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar - * @attention The number of bits to shift left of input tensors must be passed at compile time using -DLEFT_SHIFT - * @attention The offset, scalar scale factor and number of bits to shift right of input tensors must be passed at compile time using -DIN1_OFFSET, -RIN1_MULT_INT, -DIN1_SHIFT, -DIN2_OFFSET, -RIN2_MULT_INT and -DIN2_SHIFT - * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT + * @attention The inputs and output data types need to be passed at compile time using + * -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: + * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar + * @attention The number of bits to shift left of input tensors must be passed at compile time using + * -DLEFT_SHIFT + * @attention The offset, scalar scale factor and number of bits to shift right of input tensors + * must be passed at compile time using -DIN1_OFFSET, -RIN1_MULT_INT, -DIN1_SHIFT, + -DIN2_OFFSET, + * -RIN2_MULT_INT and -DIN2_SHIFT + * @attention The offset, scalar scale factor and number of bits to shift right of output tensor + * must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and + -DRESULT_SHIFT * - * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: - * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar - * @attention The inputs and output scale information of qasymm8 need to be passed at compile time using -DSCALE_IN1, -DSCALE_IN2 and -DSCALE_OUT: - * e.g. -DSCALE_IN1=1.f -DSCALE_IN2=1.f -DSCALE_OUT=2.f - * @attention The inputs and output scale offset need to be passed at compile time using -DOFFSET_IN1, -DOFFSET_IN2 and -DOFFSET_OUT: - * e.g. -DOFFSET_IN1=0 -DOFFSET_IN2=0 -DOFFSET_OUT=0 - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used. + * @attention The input and output data_types need to be passed at compile time using + * -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: + * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar + * @attention The inputs and output scale information of qasymm8 need to be passed at compile time + * using -DSCALE_IN1, -DSCALE_IN2 and -DSCALE_OUT: + * e.g. -DSCALE_IN1=1.f -DSCALE_IN2=1.f -DSCALE_OUT=2.f + * @attention The inputs and output scale offset need to be passed at compile time using + * -DOFFSET_IN1, -DOFFSET_IN2 and -DOFFSET_OUT: + * e.g. -DOFFSET_IN1=0 -DOFFSET_IN2=0 -DOFFSET_OUT=0 + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. + * -DVEC_SIZE=16 + * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise + * wrapping policy will be used. * - * @param[in] in1_ptr Pointer to the source tensor. Supported data types: QASYMM8 - * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] in2_ptr Pointer to the source tensor. Supported data types: QASYMM8 - * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] out_ptr Pointer to the destination tensor. Supported data types: QASYMM8 - * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] in1_ptr Pointer to the source tensor. + * Supported data types: QASYMM8 + * @param[in] in1_stride_x Stride of the source tensor in X dimension + * (in bytes) + * @param[in] in1_step_x in1_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] in1_stride_y Stride of the source tensor in Y dimension + * (in bytes) + * @param[in] in1_step_y in1_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] in1_stride_z Stride of the source tensor in Z dimension + * (in bytes) + * @param[in] in1_step_z in1_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[in] in2_ptr Pointer to the source tensor. Supported data types: + * QASYMM8 + * @param[in] in2_stride_x Stride of the source tensor in X dimension + * (in bytes) + * @param[in] in2_step_x in2_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] in2_stride_y Stride of the source tensor in Y dimension + * (in bytes) + * @param[in] in2_step_y in2_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] in2_stride_z Stride of the source tensor in Z dimension + * (in bytes) + * @param[in] in2_step_z in2_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[out] out_ptr Pointer to the destination tensor. + * Supported data types: QASYMM8 + * @param[in] out_stride_x Stride of the destination tensor in X dimension + * (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination tensor in Y dimension + * (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] out_stride_z Stride of the source tensor in Z dimension + * (in bytes) + * @param[in] out_step_z out_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination + * tensor */ -__kernel void arithmetic_add_qasymm8( - TENSOR3D_DECLARATION(in1), - TENSOR3D_DECLARATION(in2), - TENSOR3D_DECLARATION(out)) +__kernel void arithmetic_add_qasymm8(TENSOR3D_DECLARATION(in1), TENSOR3D_DECLARATION(in2), + TENSOR3D_DECLARATION(out)) { - // Get pixels pointer - Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); - Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); + // Get pixels pointer + Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); + Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); - // Load data - VEC_DATA_TYPE(int, 16) - in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16)); - VEC_DATA_TYPE(int, 16) - in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16)); + // Load data + VEC_DATA_TYPE(int, 16) + in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16)); + VEC_DATA_TYPE(int, 16) + in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16)); - // Get scaled value of two inputs - VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET); - VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET); + // Get scaled value of two inputs + VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET); + VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET); - VEC_DATA_TYPE(int, 16) left_shift = (VEC_DATA_TYPE(int, 16))1 << (VEC_DATA_TYPE(int, 16))(LEFT_SHIFT); - VEC_DATA_TYPE(int, 16) shifted_in1_val = in1_val * left_shift; - VEC_DATA_TYPE(int, 16) shifted_in2_val = in2_val * left_shift; + VEC_DATA_TYPE(int, 16) + left_shift = (VEC_DATA_TYPE(int, 16))1 << (VEC_DATA_TYPE(int, 16))(LEFT_SHIFT); + VEC_DATA_TYPE(int, 16) shifted_in1_val = in1_val * left_shift; + VEC_DATA_TYPE(int, 16) shifted_in2_val = in2_val * left_shift; - VEC_DATA_TYPE(int, 16) scaled_in1_val = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in1_val, IN1_MULT_INT, IN1_SHIFT, 16); - VEC_DATA_TYPE(int, 16) scaled_in2_val = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in2_val, IN2_MULT_INT, IN2_SHIFT, 16); + VEC_DATA_TYPE(int, 16) + scaled_in1_val = + ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in1_val, IN1_MULT_INT, IN1_SHIFT, 16); + VEC_DATA_TYPE(int, 16) + scaled_in2_val = + ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in2_val, IN2_MULT_INT, IN2_SHIFT, 16); - // Add inputs and multiply with a multiplier smaller than 1 - VEC_DATA_TYPE(int, 16) sum_val = scaled_in1_val + scaled_in2_val; - VEC_DATA_TYPE(int, 16) out_val = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(sum_val, RESULT_MULT_INT, RESULT_SHIFT, 16); - out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET); + // Add inputs and multiply with a multiplier smaller than 1 + VEC_DATA_TYPE(int, 16) sum_val = scaled_in1_val + scaled_in2_val; + VEC_DATA_TYPE(int, 16) + out_val = + ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(sum_val, RESULT_MULT_INT, RESULT_SHIFT, 16); + out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET); - VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16)); + VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16)); -// TODO: Apply min-max BOUND to support fuse with relu. -/* -#if defined(MIN_BOUND) - res = max(res, (uchar16)MIN_BOUND); -#endif // defined(MIN_BOUND) -#if defined(MAX_BOUND) - res = min(res, (uchar16)MAX_BOUND); -#endif // defined(MAX_BOUND) -*/ + // TODO: Apply min-max BOUND to support fuse with relu. + /* + #if defined(MIN_BOUND) + res = max(res, (uchar16)MIN_BOUND); + #endif // defined(MIN_BOUND) + #if defined(MAX_BOUND) + res = min(res, (uchar16)MAX_BOUND); + #endif // defined(MAX_BOUND) + */ - // Store result - VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), - 0, (__global DATA_TYPE_OUT *)out.ptr); + // Store result + VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr); } diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl index bea61f5..8c87551 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl @@ -22,63 +22,85 @@ #if defined(OP_CODE) && defined(DATA_TYPE) /** returns truth value of the two input tensors for BINARY LOGICAL OP. - * where BINARY LOGICAL OP can be AND, OR. + * where BINARY LOGICAL OP can be AND, OR. * * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=uchar - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using - * -DOP_CODE = number. e.g. -DOP_CODE=1 + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. + * e.g. -DVEC_SIZE=16 + * @attention Operation type(code) specifying which operation to perform should be passed as + * preprocessor argument using -DOP_CODE = number. e.g. -DOP_CODE=1 * - * @param[in] input1_ptr Pointer to the source tensor. Supported data types: QASYMM8 - * @param[in] input1_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input1_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source tensor - * - * @param[in] input2_ptr Pointer to the source tensor.Supported data types: QASYMM8 - * @param[in] input2_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input2_step_x input2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input2_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input2_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input2_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input2_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the source tensor - * - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: QASYMM8 - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input1_ptr Pointer to the source tensor. + * Supported data types: QASYMM8 + * @param[in] input1_stride_x Stride of the source tensor in X dimension + * (in bytes) + * @param[in] input1_step_x input1_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input1_stride_y Stride of the source tensor in Y dimension + * (in bytes) + * @param[in] input1_step_y input1_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input1_stride_z Stride of the source tensor in Z dimension + * (in bytes) + * @param[in] input1_step_z input1_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[in] input2_ptr Pointer to the source tensor. + * Supported data types: QASYMM8 + * @param[in] input2_stride_x Stride of the source tensor in X dimension + * (in bytes) + * @param[in] input2_step_x input2_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input2_stride_y Stride of the source tensor in Y dimension + * (in bytes) + * @param[in] input2_step_y input2_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input2_stride_z Stride of the source tensor in Z dimension + * (in bytes) + * @param[in] input2_step_z input2_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[out] output_ptr Pointer to the destination tensor. + * Supported data types: QASYMM8 + * @param[in] output_stride_x Stride of the destination tensor in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) */ -__kernel void binary_logical_op( - TENSOR3D_DECLARATION(input1), - TENSOR3D_DECLARATION(input2), - TENSOR3D_DECLARATION(output)) +__kernel void binary_logical_op(TENSOR3D_DECLARATION(input1), TENSOR3D_DECLARATION(input2), + TENSOR3D_DECLARATION(output)) { - Tensor3D input1 = CONVERT_TO_TENSOR3D_STRUCT(input1); - Tensor3D input2 = CONVERT_TO_TENSOR3D_STRUCT(input2); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + Tensor3D input1 = CONVERT_TO_TENSOR3D_STRUCT(input1); + Tensor3D input2 = CONVERT_TO_TENSOR3D_STRUCT(input2); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - #if OP_CODE == 1 // LOGICAL AND - VSTORE(VEC_SIZE) - (CONVERT(VLOAD(VEC_SIZE) - (0, (__global DATA_TYPE *)input1.ptr) && VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr), - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)output.ptr); +#if OP_CODE == 1 // LOGICAL AND + VSTORE(VEC_SIZE) + (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input1.ptr) && + VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr), + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), + 0, (__global DATA_TYPE *)output.ptr); - #elif OP_CODE == 2 // LOGICAL OR - VSTORE(VEC_SIZE) - (CONVERT(VLOAD(VEC_SIZE) - (0, (__global DATA_TYPE *)input1.ptr) || VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr), - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)output.ptr); +#elif OP_CODE == 2 // LOGICAL OR + VSTORE(VEC_SIZE) + (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input1.ptr) || + VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr), + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), + 0, (__global DATA_TYPE *)output.ptr); - #else // OP NOT SUPPORTED - return +#else // OP NOT SUPPORTED + return - #endif +#endif } -#endif //if defined(OP_CODE) && defined(DATA_TYPE) +#endif // if defined(OP_CODE) && defined(DATA_TYPE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl index 3d4675e..d5a0747 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl @@ -17,10 +17,10 @@ #include "helpers.h" #ifndef SCALE -#define SCALE 1.0f +#define SCALE 1.0f #endif #ifndef OFFSET -#define OFFSET 0 +#define OFFSET 0 #endif #ifndef VEC_SIZE #define VEC_SIZE 1 @@ -29,118 +29,170 @@ #if defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) /** Perform a cast operation on an input tensor. * - * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and + * -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. + * -DVEC_SIZE=16 * - * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] input_ptr Pointer to the source image. Supported data + * types: F16/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[out] output_ptr Pointer to the destination image. Supported data + * types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination image */ -__kernel void cast( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output)) +__kernel void cast(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output)) { - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr), - VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), - 0, (__global DATA_TYPE_OUT *)output.ptr); + VSTORE(VEC_SIZE) + (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr), + VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), + 0, (__global DATA_TYPE_OUT *)output.ptr); } /** Perform a cast operation on an QASYMM8 input tensor. - * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int - * @attention Offset and Scale of input should be given as a preprocessor argument using -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5 - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and + * -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int + * @attention Offset and Scale of input should be given as a preprocessor argument using + * -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5 + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. + * -DVEC_SIZE=16 * - * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] input_ptr Pointer to the source image. Supported data + * types: F16/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[out] output_ptr Pointer to the destination image. Supported data + * types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination image */ -__kernel void cast_qasymm_in( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output)) +__kernel void cast_qasymm_in(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output)) { - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) in_data = - VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr); - VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET); - VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE); + VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) + in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr); + VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET); + VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE); - VEC_DATA_TYPE(int, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(int, VEC_SIZE)) - offset; - VEC_DATA_TYPE(float, VEC_SIZE) out_data = CONVERT(tmp, VEC_DATA_TYPE(float, VEC_SIZE)) * scale; + VEC_DATA_TYPE(int, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(int, VEC_SIZE)) - offset; + VEC_DATA_TYPE(float, VEC_SIZE) out_data = CONVERT(tmp, VEC_DATA_TYPE(float, VEC_SIZE)) * scale; - VSTORE(VEC_SIZE)(CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), - 0, (__global DATA_TYPE_OUT *)output.ptr); + VSTORE(VEC_SIZE) + (CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, + (__global DATA_TYPE_OUT *)output.ptr); } - /** Perform a cast operation on an QASYMM8 output tensor. - * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int - * @attention Offset and Scale of output should be given as a preprocessor argument using -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5 - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and + * -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int + * @attention Offset and Scale of output should be given as a preprocessor argument using + * -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5 + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. + * -DVEC_SIZE=16 * - * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported data types: U8 - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] input_ptr Pointer to the source image. Supported data + * types: F16/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[out] output_ptr Pointer to the destination image. Supported data + * types: U8 + * @param[in] output_stride_x Stride of the destination image in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination image */ -__kernel void cast_qasymm_out( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output)) +__kernel void cast_qasymm_out(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output)) { - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) in_data = - VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr); - VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET); - VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE); + VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) + in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr); + VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET); + VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE); - VEC_DATA_TYPE(float, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(float, VEC_SIZE)) / scale; - VEC_DATA_TYPE(float, VEC_SIZE) out_data = tmp + CONVERT(offset, VEC_DATA_TYPE(float, VEC_SIZE)); + VEC_DATA_TYPE(float, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(float, VEC_SIZE)) / scale; + VEC_DATA_TYPE(float, VEC_SIZE) out_data = tmp + CONVERT(offset, VEC_DATA_TYPE(float, VEC_SIZE)); - VSTORE(VEC_SIZE)(CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), - 0, (__global DATA_TYPE_OUT *)output.ptr); + VSTORE(VEC_SIZE) + (CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, + (__global DATA_TYPE_OUT *)output.ptr); } #endif // defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl index 08f7266..e005322 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl @@ -20,104 +20,142 @@ /** Perform space to depth rearrangement of tensor * * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16 - * @attention The value of the z-axis of output tensor should be given as a preprocessor argument using -DZ_OUT=size. e.g. -DZ_OUT=16 - * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. -DBLOCK_SIZE=1 + * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. + * e.g. -DDEPTH_OUT=16 + * @attention The value of the z-axis of output tensor should be given as a preprocessor argument + * using -DZ_OUT=size. e.g. -DZ_OUT=16 + * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. + * -DBLOCK_SIZE=1 * - * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p inpu -t_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in -bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] input_ptr Pointer to the source image. Supported data + * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[out] output_ptr Pointer to the destination image. Supported data + * types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] output_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination image */ -__kernel void depth_to_space_nchw( - TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output)) - { - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, Z_OUT); +__kernel void depth_to_space_nchw(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output)) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, Z_OUT); - int out_index[4]={0}; - int in_index[4]={0}; + int out_index[4] = {0}; + int in_index[4] = {0}; - out_index[0] = get_global_id(0);//W - out_index[1] = get_global_id(1);//H - out_index[2] = get_global_id(2) % Z_OUT;//C - out_index[3] = get_global_id(2) / Z_OUT;//B + out_index[0] = get_global_id(0); // W + out_index[1] = get_global_id(1); // H + out_index[2] = get_global_id(2) % Z_OUT; // C + out_index[3] = get_global_id(2) / Z_OUT; // B - in_index[0] = out_index[0]/BLOCK_SIZE; - in_index[1] = out_index[1]/BLOCK_SIZE; - in_index[2] = out_index[2] + ((out_index[1] % BLOCK_SIZE) * BLOCK_SIZE + out_index[0] % BLOCK_SIZE) * DEPTH_OUT; - in_index[3] = out_index[3]; + in_index[0] = out_index[0] / BLOCK_SIZE; + in_index[1] = out_index[1] / BLOCK_SIZE; + in_index[2] = out_index[2] + + ((out_index[1] % BLOCK_SIZE) * BLOCK_SIZE + out_index[0] % BLOCK_SIZE) * DEPTH_OUT; + in_index[3] = out_index[3]; - *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2],in_index[3])); - } + *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset( + &in, in_index[0], in_index[1], in_index[2], in_index[3])); +} #endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT) #if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT) /** Perform space to depth rearrangement of tensor (NHWC) * * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16 - * @attention The value of the z-axis of output tensor should be given as a preprocessor argument using -DZ_OUT=size. e.g. -DZ_OUT=16 - * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. -DBLOCK_SIZE=1 + * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. + * e.g. -DDEPTH_OUT=16 + * @attention The value of the z-axis of output tensor should be given as a preprocessor argument + * using -DZ_OUT=size. e.g. -DZ_OUT=16 + * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. + * -DBLOCK_SIZE=1 * - * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p inpu -t_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in -bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] input_ptr Pointer to the source image. Supported data + * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[out] output_ptr Pointer to the destination image. Supported data + * types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] output_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination image */ -__kernel void depth_to_space_nhwc( - TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output)) - { - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, Z_OUT); +__kernel void depth_to_space_nhwc(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output)) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, Z_OUT); - int out_index[4]={0}; - int in_index[4]={0}; + int out_index[4] = {0}; + int in_index[4] = {0}; - out_index[0] = get_global_id(0);//C - out_index[1] = get_global_id(1);//W - out_index[2] = get_global_id(2) % Z_OUT;//H - out_index[3] = get_global_id(2) / Z_OUT;//B + out_index[0] = get_global_id(0); // C + out_index[1] = get_global_id(1); // W + out_index[2] = get_global_id(2) % Z_OUT; // H + out_index[3] = get_global_id(2) / Z_OUT; // B - in_index[0] = out_index[0] + ((out_index[2] % BLOCK_SIZE) * BLOCK_SIZE + out_index[1] % BLOCK_SIZE) * DEPTH_OUT; - in_index[1] = out_index[1]/BLOCK_SIZE; - in_index[2] = out_index[2]/BLOCK_SIZE; - in_index[3] = out_index[3]; + in_index[0] = out_index[0] + + ((out_index[2] % BLOCK_SIZE) * BLOCK_SIZE + out_index[1] % BLOCK_SIZE) * DEPTH_OUT; + in_index[1] = out_index[1] / BLOCK_SIZE; + in_index[2] = out_index[2] / BLOCK_SIZE; + in_index[3] = out_index[3]; - *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2],in_index[3])); - } + *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset( + &in, in_index[0], in_index[1], in_index[2], in_index[3])); +} #endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl index 348458f..dd8cb6d 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl @@ -23,62 +23,91 @@ #if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS) /** Perform embedding_lookup of input tensor * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16 - * @attention Number of input dimensions are passed as a preprocessor argument using -DNUM_DIMS=size, e.g. -DNUM_DIMS=4 + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. + * -DDATA_TYPE=short + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. + * -DVEC_SIZE=16 + * @attention Output tensor depth should be given as a preprocessor argument using + * -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16 + * @attention Number of input dimensions are passed as a preprocessor argument using + * -DNUM_DIMS=size, e.g. -DNUM_DIMS=4 * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] input_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] lookups_ptr Pointer to the lookups vector. Supported data types: S32 - * @param[in] lookups_stride_x Stride of the lookups vector in X dimension (in bytes) - * @param[in] lookups_step_x lookups_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] lookups_offset_first_element_in_bytes The offset of the first element in the lookups vector + * @param[in] input_ptr Pointer to the source tensor. Supported data + * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[in] input_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] input_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[out] output_ptr Pointer to the destination tensor. Supported + * data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] output_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + * @param[in] lookups_ptr Pointer to the lookups vector. Supported data + * types: S32 + * @param[in] lookups_stride_x Stride of the lookups vector in X dimension (in + * bytes) + * @param[in] lookups_step_x lookups_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] lookups_offset_first_element_in_bytes The offset of the first element in the lookups + * vector */ -__kernel void embedding_lookup(TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output), +__kernel void embedding_lookup(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output), VECTOR_DECLARATION(lookups)) { - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT); - Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups); + Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups); - //lookup ids for based on the tensor dimensions - int lup_id[4] = {0}; + // lookup ids for based on the tensor dimensions + int lup_id[4] = {0}; - lup_id[0] = (NUM_DIMS == 1)?*((__global int *)vector_offset(&lups,get_global_id(0))) - :get_global_id(0); - lup_id[1] = (NUM_DIMS == 2)?*((__global int *)vector_offset(&lups,get_global_id(1))) - :get_global_id(1); - lup_id[2] = (NUM_DIMS == 3)?*((__global int *)vector_offset(&lups,get_global_id(2))) - :get_global_id(2)%DEPTH_OUT; - lup_id[3] = (NUM_DIMS == 4)?*((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT)) - :get_global_id(2) / DEPTH_OUT; + lup_id[0] = (NUM_DIMS == 1) ? *((__global int *)vector_offset(&lups, get_global_id(0))) + : get_global_id(0); + lup_id[1] = (NUM_DIMS == 2) ? *((__global int *)vector_offset(&lups, get_global_id(1))) + : get_global_id(1); + lup_id[2] = (NUM_DIMS == 3) ? *((__global int *)vector_offset(&lups, get_global_id(2))) + : get_global_id(2) % DEPTH_OUT; + lup_id[3] = (NUM_DIMS == 4) + ? *((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT)) + : get_global_id(2) / DEPTH_OUT; - in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x + lup_id[1] * input_step_y - + lup_id[2] * input_step_z + lup_id[3] * input_step_w; + in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x + + lup_id[1] * input_step_y + lup_id[2] * input_step_z + lup_id[3] * input_step_w; - VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), - 0, (__global DATA_TYPE *)out.ptr); + VSTORE(VEC_SIZE) + (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, + (__global DATA_TYPE *)out.ptr); } #endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/exp.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/exp.cl index 69d94f3..cb80359 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/exp.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/exp.cl @@ -24,34 +24,49 @@ /** Perform an exponential operation on an input tensor. * * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. + * -DVEC_SIZE=16 * @note Can only take floating point data types. * - * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] input_ptr Pointer to the source image. Supported data + * types: F16/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[out] output_ptr Pointer to the destination image. Supported data + * types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination image */ -__kernel void exp_layer( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output)) +__kernel void exp_layer(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output)) { - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - VSTORE(VEC_SIZE) - (exp(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr)), 0, (__global DATA_TYPE *)output.ptr); + VSTORE(VEC_SIZE) + (exp(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr)), 0, (__global DATA_TYPE *)output.ptr); } #endif // defined(DATA_TYPE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl index 05560e8..26d2830 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl @@ -19,93 +19,121 @@ #if defined(DATA_TYPE) && defined(AXIS) && defined(INDICES_DIM) /** Performs the Gather operation along the chosen axis - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. + * -DDATA_TYPE=short * @note Axis should be given as a preprocessor argument using -DAXIS=axis. e.g. -DAXIS=1 - * @attention Output tensor depth should be given as a preprocessor argument using -DOUTPUT_DIM_Z=size. e.g. -DOUTPUT_DIM_Z=16 - * @attention Input tensor depth should be given as a preprocessor argument using -DINPUT_DIM_Z=size. e.g. -DINPUT_DIM_Z=16 + * @attention Output tensor depth should be given as a preprocessor argument using + * -DOUTPUT_DIM_Z=size. e.g. -DOUTPUT_DIM_Z=16 + * @attention Input tensor depth should be given as a preprocessor argument using + * -DINPUT_DIM_Z=size. e.g. -DINPUT_DIM_Z=16 * - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/U16/S16/U32/S32/F16/F32 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per work item (in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per work item (in bytes) - * @param[in] input_stride_z Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per work item (in bytes) - * @param[in] input_stride_w Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_w input_stride_w * number of elements along W processed per work item (in bytes) - * @param[in] input_offset_first_element_in_bytes Offset of the first element in the source tensor - * @param[in] indices_ptr Pointer to the source tensor. Supported data types: S32 - * @param[in] indices_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] indices_step_x indices_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] indices_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] indices_step_y indices_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] indices_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] indices_step_z indices_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] indices_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per work item (in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per work item (in bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per work item (in bytes) - * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per work item (in bytes) - * @param[in] output_offset_first_element_in_bytes Offset of the first element in the destination tensor + * @param[in] input_ptr Pointer to the source tensor. Supported data + * types: U8/S8/U16/S16/U32/S32/F16/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per work item (in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per work item (in bytes) + * @param[in] input_stride_z Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per work item (in bytes) + * @param[in] input_stride_w Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_w input_stride_w * number of elements along W + * processed per work item (in bytes) + * @param[in] input_offset_first_element_in_bytes Offset of the first element in the source + * tensor + * @param[in] indices_ptr Pointer to the source tensor. Supported data + * types: S32 + * @param[in] indices_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] indices_step_x indices_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] indices_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] indices_step_y indices_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] indices_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] indices_step_z indices_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] indices_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported + * data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per work item (in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per work item (in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per work item (in bytes) + * @param[in] output_stride_w Stride of the destination tensor in W dimension + * (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W + * processed per work item (in bytes) + * @param[in] output_offset_first_element_in_bytes Offset of the first element in the destination + * tensor */ -__kernel void gather( - TENSOR4D_DECLARATION(input), - TENSOR3D_DECLARATION(indices), - TENSOR4D_DECLARATION(output)) +__kernel void gather(TENSOR4D_DECLARATION(input), TENSOR3D_DECLARATION(indices), + TENSOR4D_DECLARATION(output)) { - const int px = get_global_id(0); - const int py = get_global_id(1); - const int pz = get_global_id(2) % OUTPUT_DIM_Z; - const int pw = get_global_id(2) / OUTPUT_DIM_Z; + const int px = get_global_id(0); + const int py = get_global_id(1); + const int pz = get_global_id(2) % OUTPUT_DIM_Z; + const int pw = get_global_id(2) / OUTPUT_DIM_Z; - const Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, INPUT_DIM_Z); - const Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(indices); - Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, OUTPUT_DIM_Z); + const Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, INPUT_DIM_Z); + const Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(indices); + Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, OUTPUT_DIM_Z); #if AXIS == 0 #if INDICES_DIM == 1 - const uint index = *(__global const uint *)tensor3D_offset(&indices, px, 0, 0); - __global const uchar *input_addr = tensor4D_offset(&input, index, py, pz, pw); + const uint index = *(__global const uint *)tensor3D_offset(&indices, px, 0, 0); + __global const uchar *input_addr = tensor4D_offset(&input, index, py, pz, pw); #elif INDICES_DIM == 2 - const uint index = *(__global const uint *)tensor3D_offset(&indices, px, py, 0); - __global const uchar *input_addr = tensor4D_offset(&input, index, pz, pw, 0); + const uint index = *(__global const uint *)tensor3D_offset(&indices, px, py, 0); + __global const uchar *input_addr = tensor4D_offset(&input, index, pz, pw, 0); #elif INDICES_DIM == 3 - const uint index = *(__global const uint *)tensor3D_offset(&indices, px, py, pz); - __global const uchar *input_addr = tensor4D_offset(&input, index, pw, 0, 0); + const uint index = *(__global const uint *)tensor3D_offset(&indices, px, py, pz); + __global const uchar *input_addr = tensor4D_offset(&input, index, pw, 0, 0); #endif #elif AXIS == 1 #if INDICES_DIM == 1 - const uint index = *(__global const uint *)tensor3D_offset(&indices, py, 0, 0); - __global const uchar *input_addr = tensor4D_offset(&input, px, index, pz, pw); + const uint index = *(__global const uint *)tensor3D_offset(&indices, py, 0, 0); + __global const uchar *input_addr = tensor4D_offset(&input, px, index, pz, pw); #elif INDICES_DIM == 2 - const uint index = *(__global const uint *)tensor3D_offset(&indices, py, pz, 0); - __global const uchar *input_addr = tensor4D_offset(&input, px, index, pw, 0); + const uint index = *(__global const uint *)tensor3D_offset(&indices, py, pz, 0); + __global const uchar *input_addr = tensor4D_offset(&input, px, index, pw, 0); #elif INDICES_DIM == 3 - const uint index = *(__global const uint *)tensor3D_offset(&indices, py, pz, pw); - __global const uchar *input_addr = tensor4D_offset(&input, px, index, 0, 0); + const uint index = *(__global const uint *)tensor3D_offset(&indices, py, pz, pw); + __global const uchar *input_addr = tensor4D_offset(&input, px, index, 0, 0); #endif #elif AXIS == 2 #if INDICES_DIM == 1 - const uint index = *(__global const uint *)tensor3D_offset(&indices, pz, 0, 0); - __global const uchar *input_addr = tensor4D_offset(&input, px, py, index, pw); + const uint index = *(__global const uint *)tensor3D_offset(&indices, pz, 0, 0); + __global const uchar *input_addr = tensor4D_offset(&input, px, py, index, pw); #elif INDICES_DIM == 2 - const uint index = *(__global const uint *)tensor3D_offset(&indices, pz, pw, 0); - __global const uchar *input_addr = tensor4D_offset(&input, px, py, index, 0); + const uint index = *(__global const uint *)tensor3D_offset(&indices, pz, pw, 0); + __global const uchar *input_addr = tensor4D_offset(&input, px, py, index, 0); #endif #elif AXIS == 3 #if INDICES_DIM == 1 - const uint index = *(__global const uint *)tensor3D_offset(&indices, pw, 0, 0); - __global const uchar *input_addr = tensor4D_offset(&input, px, py, pz, index); + const uint index = *(__global const uint *)tensor3D_offset(&indices, pw, 0, 0); + __global const uchar *input_addr = tensor4D_offset(&input, px, py, pz, index); #endif -#endif //AXIS +#endif // AXIS - *(__global DATA_TYPE *)output.ptr = *((__global const DATA_TYPE *)input_addr); + *(__global DATA_TYPE *)output.ptr = *((__global const DATA_TYPE *)input_addr); } -#endif //defined(DATA_TYPE) && defined(AXIS) && defined(INDICES_DIM) +#endif // defined(DATA_TYPE) && defined(AXIS) && defined(INDICES_DIM) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl index ed74098..73f29e3 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl @@ -23,66 +23,95 @@ #if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS) /** Perform hashtable_lookup of input tensor * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16 - * @attention Number of input dimensions are passed as a preprocessor argument using -DNUM_DIMS=size, e.g. -DNUM_DIMS=4 + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. + * -DDATA_TYPE=short + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. + * -DVEC_SIZE=16 + * @attention Output tensor depth should be given as a preprocessor argument using + * -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16 + * @attention Number of input dimensions are passed as a preprocessor argument using + * -DNUM_DIMS=size, e.g. -DNUM_DIMS=4 * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] input_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] lookups_ptr Pointer to the lookups vector. Supported data types: S32 - * @param[in] lookups_stride_x Stride of the lookups vector in X dimension (in bytes) - * @param[in] lookups_step_x lookups_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] lookups_offset_first_element_in_bytes The offset of the first element in the lookups vector + * @param[in] input_ptr Pointer to the source tensor. Supported data + * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[in] input_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] input_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[out] output_ptr Pointer to the destination tensor. Supported + * data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] output_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + * @param[in] lookups_ptr Pointer to the lookups vector. Supported data + * types: S32 + * @param[in] lookups_stride_x Stride of the lookups vector in X dimension (in + * bytes) + * @param[in] lookups_step_x lookups_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] lookups_offset_first_element_in_bytes The offset of the first element in the lookups + * vector */ -__kernel void hashtable_lookup(TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output), +__kernel void hashtable_lookup(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output), VECTOR_DECLARATION(lookups)) { - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT); - Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups); + Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups); - int lup_id[4] = {0}; + int lup_id[4] = {0}; - lup_id[0] = (NUM_DIMS == 1)?*((__global int *)vector_offset(&lups,get_global_id(0))) - :get_global_id(0); - lup_id[1] = (NUM_DIMS == 2)?*((__global int *)vector_offset(&lups,get_global_id(1))) - :get_global_id(1); - lup_id[2] = (NUM_DIMS == 3)?*((__global int *)vector_offset(&lups,get_global_id(2))) - :get_global_id(2)%DEPTH_OUT; - lup_id[3] = (NUM_DIMS == 4)?*((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT)) - :get_global_id(2) / DEPTH_OUT; + lup_id[0] = (NUM_DIMS == 1) ? *((__global int *)vector_offset(&lups, get_global_id(0))) + : get_global_id(0); + lup_id[1] = (NUM_DIMS == 2) ? *((__global int *)vector_offset(&lups, get_global_id(1))) + : get_global_id(1); + lup_id[2] = (NUM_DIMS == 3) ? *((__global int *)vector_offset(&lups, get_global_id(2))) + : get_global_id(2) % DEPTH_OUT; + lup_id[3] = (NUM_DIMS == 4) + ? *((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT)) + : get_global_id(2) / DEPTH_OUT; - if (lup_id[NUM_DIMS-1] < 0) - { - VSTORE(VEC_SIZE)((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0, 0, (__global DATA_TYPE *)out.ptr); - return; - } + if (lup_id[NUM_DIMS - 1] < 0) + { + VSTORE(VEC_SIZE)((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0, 0, (__global DATA_TYPE *)out.ptr); + return; + } - in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x + lup_id[1] * input_step_y - + lup_id[2] * input_step_z + lup_id[3] * input_step_w; + in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x + + lup_id[1] * input_step_y + lup_id[2] * input_step_z + lup_id[3] * input_step_w; - VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), - 0, (__global DATA_TYPE *)out.ptr); + VSTORE(VEC_SIZE) + (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, + (__global DATA_TYPE *)out.ptr); } #endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl index e3aa463..4aa7883 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl @@ -23,26 +23,33 @@ #if defined(DATA_TYPE) /** Performs a negation of input tensor. * - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. + * -DVEC_SIZE=16 * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float * - * @param[in] in_ptr Pointer to the source image. Supported data types: S16/S32/F16/F32. - * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] in_ptr Pointer to the source image. Supported data types: + * S16/S32/F16/F32. + * @param[in] in_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] in_step_x in_stride_x * number of elements along X processed + * per work item (in bytes) * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image - * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes) - * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image + * @param[out] out_ptr Pointer to the destination image. Supported data + * types: same as @p input_ptr + * @param[in] out_stride_x Stride of the destination image in X dimension (in + * bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed + * per work item (in bytes) + * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination + * image + * */ -__kernel void neg_tensor( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output)) +__kernel void neg_tensor(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output)) { - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - VSTORE(VEC_SIZE) - (-VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr), 0, (__global DATA_TYPE *)output.ptr); + VSTORE(VEC_SIZE) + (-VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr), 0, (__global DATA_TYPE *)output.ptr); } #endif // defined(DATA_TYPE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pad.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pad.cl index 91106a8..e74c6b5 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/pad.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/pad.cl @@ -16,71 +16,84 @@ */ #include "helpers.h" -#if defined(IW) && defined(IH) && defined(ID) && defined(IB) && defined(DEPTH_OUT) && defined(ZERO_VALUE) +#if defined(IW) && defined(IH) && defined(ID) && defined(IB) && defined(DEPTH_OUT) && \ + defined(ZERO_VALUE) /** Basic function to pad a tensor * * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16 - * @attention Input dimensions should be passed as a preprocessor argument using -DIW(width), -DIH(height), -DID(depth) and -DIB(batch). e.g. -DIW = 4 + * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. + * e.g. -DDEPTH_OUT=16 + * @attention Input dimensions should be passed as a preprocessor argument using -DIW(width), + * -DIH(height), -DID(depth) and -DIB(batch). e.g. -DIW = 4 * @attention The value to be set by pad value using -DZERO_VALUE=value. e.g. -DZERO_VALUE=0 * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p inpu -t_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in -bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - * - * @param[in] pad_values Padding values for each of the dimensions. Only pad values for Up(for - * batch), Top(for height), Left(for width) and Front(for depth) are - * required. Supported data type: S32 + * @param[in] input_ptr Pointer to the source tensor. Supported data + * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported + * data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the destination tensor in W dimension + * (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + * @param[in] pad_values Padding values for each of the dimensions. Only + * pad values for Up(for batch), Top(for height), + * Left(for width) and Front(for depth) are + * required. Supported data type: S32 */ -__kernel void pad( - TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output), - const int4 pad_values) - { - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); +__kernel void pad(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output), const int4 pad_values) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); - int index[4]={0}; + int index[4] = {0}; - index[0] = get_global_id(0);//W - index[1] = get_global_id(1);//H - index[2] = get_global_id(2) % DEPTH_OUT;//C - index[3] = get_global_id(2) / DEPTH_OUT;//N + index[0] = get_global_id(0); // W + index[1] = get_global_id(1); // H + index[2] = get_global_id(2) % DEPTH_OUT; // C + index[3] = get_global_id(2) / DEPTH_OUT; // N - if (index[0] < pad_values.x || index[0] >= (IW + pad_values.x) || - index[1] < pad_values.y || index[1] >= (IH + pad_values.y) || - index[2] < pad_values.z || index[2] >= (ID + pad_values.z) || - index[3] < pad_values.w || index[3] >= (IB + pad_values.w)) - { - *((__global DATA_TYPE *)out.ptr) = (DATA_TYPE)ZERO_VALUE; - } - else - { - *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *) - tensor4D_offset(&in, index[0] - pad_values.x, - index[1] - pad_values.y, - index[2] - pad_values.z, - index[3] - pad_values.w)); - } - } + if (index[0] < pad_values.x || index[0] >= (IW + pad_values.x) || index[1] < pad_values.y || + index[1] >= (IH + pad_values.y) || index[2] < pad_values.z || + index[2] >= (ID + pad_values.z) || index[3] < pad_values.w || index[3] >= (IB + pad_values.w)) + { + *((__global DATA_TYPE *)out.ptr) = (DATA_TYPE)ZERO_VALUE; + } + else + { + *((__global DATA_TYPE *)out.ptr) = *( + (__global DATA_TYPE *)tensor4D_offset(&in, index[0] - pad_values.x, index[1] - pad_values.y, + index[2] - pad_values.z, index[3] - pad_values.w)); + } +} -#endif //if defined(IW) && defined(IH) && defined(ID) && defined(IB) && defined(DEPTH_OUT) && defined(ZERO_VALUE) +#endif // if defined(IW) && defined(IH) && defined(ID) && defined(IB) && defined(DEPTH_OUT) && + // defined(ZERO_VALUE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/permute_ex.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/permute_ex.cl index c628c88..637788a 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/permute_ex.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/permute_ex.cl @@ -17,55 +17,66 @@ #include "helpers.h" -#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && defined(P4) +#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && \ + defined(P4) /** Perform a Generic permute operation on an input tensor of Shape DCHW. * * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16 - * @attention Permutation vector is passed as a preprocessor arguement using -DP1, -DP2, -DP3 and -DP4=int, e.g. -DP1=2 + * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. + e.g. -DDEPTH_IN=16 + * @attention Permutation vector is passed as a preprocessor arguement using -DP1, -DP2, -DP3 and + -DP4=int, e.g. -DP1=2 * - * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U1 -6/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in b -ytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in b -ytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in b -ytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p inpu -t_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in -bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] input_ptr Pointer to the source image. Supported data + * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[out] output_ptr Pointer to the destination image. Supported data + * types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] output_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination image */ -__kernel void permute_generic( - TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output)) +__kernel void permute_generic(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output)) { - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); - // WHCN format - int in_index[]={ - get_global_id(0), - get_global_id(1), - get_global_id(2) % DEPTH_IN, - get_global_id(2) / DEPTH_IN, - }; + // WHCN format + int in_index[] = { + get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_IN, get_global_id(2) / DEPTH_IN, + }; - // New locations based on Permuted index calc as out_index[index] = in_index[new_index] - *((__global DATA_TYPE *) - tensor4D_offset(&out, in_index[P1], in_index[P2], in_index[P3], in_index[P4])) = *((__global DATA_TYPE *)in.ptr); + // New locations based on Permuted index calc as out_index[index] = in_index[new_index] + *((__global DATA_TYPE *)tensor4D_offset(&out, in_index[P1], in_index[P2], in_index[P3], + in_index[P4])) = *((__global DATA_TYPE *)in.ptr); } -#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && defined(P4) +#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && + // defined(P4) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl index ab1307e..2074d3c 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl @@ -24,7 +24,8 @@ #define CONVERT_OP_FLOAT(x, type, round) CONVERT_OP_FLOAT_STR(x, type, round) #if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT) -/** Performs a pixelwise multiplication used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8 +/** Performs a pixelwise multiplication used to quantize down the int32 accumulator values of + * GEMMLowp to QASYMM8 * * The following computations will be performed by the kernel: * @@ -35,77 +36,100 @@ * -# Shift the int32 accumulator by result_shift * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8. * - * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: - * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar - * @attention The offset factor of inputs must be passed at compile time using -DIN1_OFFSET and -DIN2_OFFSET - * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT + * @attention The inputs and output data types need to be passed at compile time using + * -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: + * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar + * @attention The offset factor of inputs must be passed at compile time using -DIN1_OFFSET and + * -DIN2_OFFSET + * @attention The offset, scalar scale factor and number of bits to shift right of output tensor + * must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and + * -DRESULT_SHIFT * - * @param[in] in1_ptr Pointer to the source image. Supported data types: U8 - * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in1_stride_z Stride of the source image in Y dimension (in bytes) - * @param[in] in1_step_z in1_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] in1_ptr Pointer to the source image. Supported data types: + * U8 + * @param[in] in1_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] in1_step_x in1_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] in1_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] in1_step_y in1_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] in1_stride_z Stride of the source image in Y dimension (in + * bytes) + * @param[in] in1_step_z in1_stride_z * number of elements along Y processed + * per workitem(in bytes) * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] in2_ptr Pointer to the source image. Supported data types: U8 - * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in2_stride_z Stride of the source image in Y dimension (in bytes) - * @param[in] in2_step_z in2_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] in2_ptr Pointer to the source image. Supported data types: + * U8 + * @param[in] in2_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] in2_step_x in2_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] in2_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] in2_step_y in2_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] in2_stride_z Stride of the source image in Y dimension (in + * bytes) + * @param[in] in2_step_z in2_stride_z * number of elements along Y processed + * per workitem(in bytes) * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] out_ptr Pointer to the destination image. Supported data types: U8 - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_stride_z Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_z out_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[out] out_ptr Pointer to the destination image. Supported data + * types: U8 + * @param[in] out_stride_x Stride of the destination image in X dimension (in + * bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in + * bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] out_stride_z Stride of the destination image in Y dimension (in + * bytes) + * @param[in] out_step_z out_stride_z * number of elements along Y processed + * per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination + * image * @param[in] scale Float scaling factor. Supported data types: F32 */ -__kernel void pixelwise_mul_qasymm8( - TENSOR3D_DECLARATION(in1), - TENSOR3D_DECLARATION(in2), - TENSOR3D_DECLARATION(out), - const float scale) +__kernel void pixelwise_mul_qasymm8(TENSOR3D_DECLARATION(in1), TENSOR3D_DECLARATION(in2), + TENSOR3D_DECLARATION(out), const float scale) { - // Get pixels pointer - Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); - Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); + // Get pixels pointer + Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); + Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); - // Load data - VEC_DATA_TYPE(int, 16) - in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16)); - VEC_DATA_TYPE(int, 16) - in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16)); + // Load data + VEC_DATA_TYPE(int, 16) + in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16)); + VEC_DATA_TYPE(int, 16) + in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16)); - // Perform multiplication of two inputs - VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET); - VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET); - VEC_DATA_TYPE(int, 16) out_val = in1_val * in2_val; + // Perform multiplication of two inputs + VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET); + VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET); + VEC_DATA_TYPE(int, 16) out_val = in1_val * in2_val; - // Multiply with a multiplier smaller than 1 - out_val = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(out_val, RESULT_MULT_INT, RESULT_SHIFT, 16); - out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET); + // Multiply with a multiplier smaller than 1 + out_val = + ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(out_val, RESULT_MULT_INT, RESULT_SHIFT, 16); + out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET); - VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16)); + VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16)); -// TODO: Apply min-max BOUND to support fuse with relu. -/* -#if defined(MIN_BOUND) - res = max(res, (uchar16)MIN_BOUND); -#endif // defined(MIN_BOUND) -#if defined(MAX_BOUND) - res = min(res, (uchar16)MAX_BOUND); -#endif // defined(MAX_BOUND) -*/ + // TODO: Apply min-max BOUND to support fuse with relu. + /* + #if defined(MIN_BOUND) + res = max(res, (uchar16)MIN_BOUND); + #endif // defined(MIN_BOUND) + #if defined(MAX_BOUND) + res = min(res, (uchar16)MAX_BOUND); + #endif // defined(MAX_BOUND) + */ - // Store result - VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), - 0, (__global DATA_TYPE_OUT *)out.ptr); + // Store result + VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr); } #endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl index 68da2ba..62a8901 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl @@ -25,50 +25,72 @@ * f(input) = alpha * input for input < 0, f(input) = input for input >= 0. * * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. + * -DVEC_SIZE=16 * @note Can only take floating point data types. * - * @param[in] input1_ptr Pointer to the source image. Supported Data types : F16/F32 - * @param[in] input1_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input1_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] input1_ptr Pointer to the source image. Supported Data + * types : F16/F32 + * @param[in] input1_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] input1_step_x input1_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input1_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] input1_step_y input1_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input1_step_z input1_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[in] alpha_ptr Pointer to the source image. Supported Data + * types : F16/F32 + * @param[in] alpha_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] alpha_step_x input2_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] alpha_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] alpha_step_y input2_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] alpha_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] alpha_step_z input2_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] alpha_offset_first_element_in_bytes The offset of the first element in the source + * image * - * @param[in] alpha_ptr Pointer to the source image. Supported Data types : F16/F32 - * @param[in] alpha_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] alpha_step_x input2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] alpha_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] alpha_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] alpha_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] alpha_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] alpha_offset_first_element_in_bytes The offset of the first element in the source image - * - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[out] output_ptr Pointer to the destination image. Supported + * data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination image */ -__kernel void prelu( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(alpha), - TENSOR3D_DECLARATION(output)) +__kernel void prelu(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(alpha), + TENSOR3D_DECLARATION(output)) { - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - VSTORE(VEC_SIZE) - (VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) < 0 ? - VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) * VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)alpha.ptr) : - VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr), - 0, (__global DATA_TYPE *)output.ptr); + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + VSTORE(VEC_SIZE) + (VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) < 0 + ? VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) * + VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)alpha.ptr) + : VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr), + 0, (__global DATA_TYPE *)output.ptr); } #endif // defined(DATA_TYPE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl index 7e97b7e..e69728d 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl @@ -17,7 +17,8 @@ #include "helpers.h" #define SUB(x, y) (x) - (y) -#if defined(OFF_IN1) && defined(OFF_IN2) && defined(OFF_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(VEC_SIZE) +#if defined(OFF_IN1) && defined(OFF_IN2) && defined(OFF_OUT) && defined(SCALE_IN1) && \ + defined(SCALE_IN2) && defined(SCALE_OUT) && defined(VEC_SIZE) #define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE) #define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE) @@ -26,63 +27,86 @@ #define CONVERT_DOWN(x, type) CONVERT_RTE(x, type) /** Returns result of prelu function implemented as below: - * f(input) = alpha * input for input < 0, f(input) = input for input >= 0. + * f(input) = alpha * input for input < 0, f(input) = input for input >= 0. * - * @attention Data type can be passed using the -DDATA_TYPE_IN compile flag, e.g. -DDATA_TYPE_IN=uchar - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @attention Data type can be passed using the -DDATA_TYPE_IN compile flag, e.g. + * -DDATA_TYPE_IN=uchar + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. + * -DVEC_SIZE=16 * @note Can only take uchar data types. * - * @param[in] input1_ptr Pointer to the source image. Supported Data types : QASYMM8 - * @param[in] input1_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input1_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source image - * - * @param[in] alpha_ptr Pointer to the source image. Supported Data types : QASYMM8 - * @param[in] alpha_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] alpha_step_x input2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] alpha_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] alpha_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] alpha_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] alpha_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] alpha_offset_first_element_in_bytes The offset of the first element in the source image - * - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] input1_ptr Pointer to the source image. Supported Data + * types : QASYMM8 + * @param[in] input1_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] input1_step_x input1_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input1_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] input1_step_y input1_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input1_step_z input1_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[in] alpha_ptr Pointer to the source image. Supported Data + * types : QASYMM8 + * @param[in] alpha_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] alpha_step_x input2_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] alpha_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] alpha_step_y input2_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] alpha_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] alpha_step_z input2_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] alpha_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[out] output_ptr Pointer to the destination image. Supported + * data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination image */ -__kernel void prelu_qasymm8( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(alpha), - TENSOR3D_DECLARATION(output)) +__kernel void prelu_qasymm8(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(alpha), + TENSOR3D_DECLARATION(output)) { - // Get pixels pointer - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + // Get pixels pointer + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - VEC_INT in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)input.ptr), VEC_INT); - VEC_INT in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)alpha.ptr), VEC_INT); + VEC_INT in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)input.ptr), VEC_INT); + VEC_INT in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)alpha.ptr), VEC_INT); - in_a = SUB(in_a, (VEC_INT)((int)OFF_IN1)); - in_b = SUB(in_b, (VEC_INT)((int)OFF_IN2)); + in_a = SUB(in_a, (VEC_INT)((int)OFF_IN1)); + in_b = SUB(in_b, (VEC_INT)((int)OFF_IN2)); - const VEC_FLOAT in1f32 = CONVERT(in_a, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN1); - const VEC_FLOAT in2f32 = CONVERT(in_b, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN2); - const VEC_FLOAT outf32 = in1f32 < 0 ? in1f32 * in2f32 : in1f32; - const VEC_FLOAT qresf32 = outf32 / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFF_OUT)); - const VEC_UCHAR res = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_UCHAR); + const VEC_FLOAT in1f32 = CONVERT(in_a, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN1); + const VEC_FLOAT in2f32 = CONVERT(in_b, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN2); + const VEC_FLOAT outf32 = in1f32 < 0 ? in1f32 * in2f32 : in1f32; + const VEC_FLOAT qresf32 = outf32 / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFF_OUT)); + const VEC_UCHAR res = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_UCHAR); - VSTORE(VEC_SIZE) - (res, 0, (__global uchar *)output.ptr); + VSTORE(VEC_SIZE) + (res, 0, (__global uchar *)output.ptr); } -#endif // defined(OFF_IN1) && defined(OFF_IN2) && defined(OFF_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(VEC_SIZE) +#endif // defined(OFF_IN1) && defined(OFF_IN2) && defined(OFF_OUT) && defined(SCALE_IN1) && + // defined(SCALE_IN2) && defined(SCALE_OUT) && defined(VEC_SIZE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl index 8bef493..d7ea2e2 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl @@ -19,134 +19,170 @@ #if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE) /** Perform reduce max/min * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short - * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16 - * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using - * -DOP_CODE = number. e.g. -DOP_CODE=1 + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. + * -DDATA_TYPE=short + * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. + * e.g. -DDEPTH_OUT=16 + * @attention Operation type(code) specifying which operation to perform should be passed as + * preprocessor argument using -DOP_CODE = number. e.g. -DOP_CODE=1 * - * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] input_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] input_ptr Pointer to the source image. Supported data + * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[in] input_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] input_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[out] output_ptr Pointer to the destination image. Supported data + * types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] output_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination image * @param[in] axis Axis through which reduction occurs * @param[in] dim Dimension across the axis to be reduced. */ -__kernel void reduce_min_max(TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output), - const int axis, - const int dim) +__kernel void reduce_min_max(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output), + const int axis, const int dim) { - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); - int indices[4] = - { - get_global_id(0), - get_global_id(1), - get_global_id(2) % DEPTH_OUT, - get_global_id(2) / DEPTH_OUT, - }; + int indices[4] = { + get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT, + get_global_id(2) / DEPTH_OUT, + }; - DATA_TYPE value = *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); - for(int i = 1; i < dim; ++i) - { - indices[axis] = i; + DATA_TYPE value = + *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); + for (int i = 1; i < dim; ++i) + { + indices[axis] = i; - #if OP_CODE == 1 // REDUCE_MAX - value = max(value, *((__global DATA_TYPE *) - tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]))); +#if OP_CODE == 1 // REDUCE_MAX + value = max(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], + indices[2], indices[3]))); - #elif OP_CODE == 2 // REDUCE_MIN - value = min(value, *((__global DATA_TYPE *) - tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]))); +#elif OP_CODE == 2 // REDUCE_MIN + value = min(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], + indices[2], indices[3]))); - #else // OP NOT SUPPORTED - return; +#else // OP NOT SUPPORTED + return; - #endif - } +#endif + } - *((__global DATA_TYPE *)out.ptr) = value; + *((__global DATA_TYPE *)out.ptr) = value; } /** Perform reduce sum/mean * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short - * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16 - * @attention Operation type(code) specifying which operation to perform should be passed as preprocessor argument using - * -DOP_CODE = number. e.g. -DOP_CODE=1 + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. + * -DDATA_TYPE=short + * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. + * e.g. -DDEPTH_OUT=16 + * @attention Operation type(code) specifying which operation to perform should be passed as + * preprocessor argument using -DOP_CODE = number. e.g. -DOP_CODE=1 * - * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] input_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] input_ptr Pointer to the source image. Supported data + * types: U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[in] input_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] input_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[out] output_ptr Pointer to the destination image. Supported data + * types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] output_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination image * @param[in] axis Axis through which reduction occurs * @param[in] dim Dimension across the axis to be reduced. */ -__kernel void reduce_sum_mean(TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output), - const int axis, - const int dim) +__kernel void reduce_sum_mean(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output), + const int axis, const int dim) { - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); - int indices[4] = - { - get_global_id(0), - get_global_id(1), - get_global_id(2) % DEPTH_OUT, - get_global_id(2) / DEPTH_OUT, - }; + int indices[4] = { + get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT, + get_global_id(2) / DEPTH_OUT, + }; - DATA_TYPE sum_value = (DATA_TYPE)0; - for(int i = 0; i < dim; ++i) - { - indices[axis] = i; - sum_value += *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); - } + DATA_TYPE sum_value = (DATA_TYPE)0; + for (int i = 0; i < dim; ++i) + { + indices[axis] = i; + sum_value += *( + (__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); + } - #if OP_CODE == 3 // REDUCE_SUM - *((__global DATA_TYPE *)out.ptr) = sum_value; +#if OP_CODE == 3 // REDUCE_SUM + *((__global DATA_TYPE *)out.ptr) = sum_value; - #elif OP_CODE == 4 // REDUCE_MEAN - *((__global DATA_TYPE *)out.ptr) = sum_value / CONVERT(dim, DATA_TYPE); +#elif OP_CODE == 4 // REDUCE_MEAN + *((__global DATA_TYPE *)out.ptr) = sum_value / CONVERT(dim, DATA_TYPE); - #else // OP NOT SUPPORTED - return; +#else // OP NOT SUPPORTED + return; - #endif +#endif } #endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl index a0fc2d5..7367da7 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl @@ -16,148 +16,235 @@ */ #include "helpers.h" -#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE) +#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && \ + defined(WIDTH_IN) && defined(ZERO_VALUE) /** Perform space to batch with input of 4D and NCHW format * * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16 - * @attention Input tensor batch should be given as a preprocessor argument using -DBATCH_IN=size. e.g. -DBATCH_IN=16 - * @attention Input tensor height should be given as a preprocessor argument using -DHEIGHT_IN=size. e.g. -DHEIGHT_IN=16 - * @attention Input tensor width should be given as a preprocessor argument using -DHEIGHT_IN=size. e.g. -DWIDTH_IN=16 + * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. + * e.g. -DDEPTH_OUT=16 + * @attention Input tensor batch should be given as a preprocessor argument using -DBATCH_IN=size. + * e.g. -DBATCH_IN=16 + * @attention Input tensor height should be given as a preprocessor argument using -DHEIGHT_IN=size. + * e.g. -DHEIGHT_IN=16 + * @attention Input tensor width should be given as a preprocessor argument using -DHEIGHT_IN=size. + * e.g. -DWIDTH_IN=16 * @attention The value to be set by pad value using -DZERO_VALUE=value. e.g. -DZERO_VALUE=0 * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] block_size_ptr Pointer to the source tensor. Supported data types: S32 - * @param[in] block_size_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] block_size_step_x block_size_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] block_size_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] padding_size_ptr Pointer to the source tensor. Supported data types: S32 - * @param[in] padding_size_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] padding_size_step_x padding_size_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] padding_size_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] padding_size_step_y padding_size_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] padding_size_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] input_ptr Pointer to the source tensor. Supported + * data types: U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source tensor in X + * dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along + * X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y + * dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along + * Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z + * dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along + * Z processed per workitem(in bytes) + * @param[in] input_stride_w Stride of the destination tensor in W + * dimension (in bytes) + * @param[in] input_step_w input_stride_w * number of elements along + * W processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the + * source tensor + * @param[out] output_ptr Pointer to the destination tensor. + * Supported data types: same as @p + * input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X + * dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y + * dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z + * dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements + * along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the destination tensor in W + * dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements + * along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + * @param[in] block_size_ptr Pointer to the source tensor. Supported + * data types: S32 + * @param[in] block_size_stride_x Stride of the source tensor in X + * dimension (in bytes) + * @param[in] block_size_step_x block_size_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] block_size_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + * @param[in] padding_size_ptr Pointer to the source tensor. Supported + * data types: S32 + * @param[in] padding_size_stride_x Stride of the source tensor in X + * dimension (in bytes) + * @param[in] padding_size_step_x padding_size_stride_x * number of + * elements along X processed per workitem + * (in bytes) + * @param[in] padding_size_stride_y Stride of the source tensor in Y + * dimension (in bytes) + * @param[in] padding_size_step_y padding_size_stride_y * number of + * elements along Y processed per workitem + * (in bytes) + * @param[in] padding_size_offset_first_element_in_bytes The offset of the first element in the + * destination tensor */ -__kernel void space_to_batch_4d_nchw(TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output), +__kernel void space_to_batch_4d_nchw(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output), VECTOR_DECLARATION(block_size), IMAGE_DECLARATION(padding_size)) { - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); - int block_size_x = *((__global int *)(block_size_ptr)); - int block_size_y = *((__global int *)(block_size_ptr + block_size_stride_x)); - int shift_x = (get_global_id(2) / DEPTH_OUT / BATCH_IN) % block_size_x; - int shift_y = (get_global_id(2) / DEPTH_OUT / BATCH_IN) / block_size_x; + int block_size_x = *((__global int *)(block_size_ptr)); + int block_size_y = *((__global int *)(block_size_ptr + block_size_stride_x)); + int shift_x = (get_global_id(2) / DEPTH_OUT / BATCH_IN) % block_size_x; + int shift_y = (get_global_id(2) / DEPTH_OUT / BATCH_IN) / block_size_x; - int in_index[4] = {0, }; - in_index[0] = get_global_id(0) * block_size_x + shift_x - *((__global int *)(padding_size_ptr)); - in_index[1] = get_global_id(1) * block_size_y + shift_y - *((__global int *)(padding_size_ptr + padding_size_stride_y)); - in_index[2] = get_global_id(2) % DEPTH_OUT; - in_index[3] = (get_global_id(2) / DEPTH_OUT) % BATCH_IN; + int in_index[4] = { + 0, + }; + in_index[0] = get_global_id(0) * block_size_x + shift_x - *((__global int *)(padding_size_ptr)); + in_index[1] = get_global_id(1) * block_size_y + shift_y - + *((__global int *)(padding_size_ptr + padding_size_stride_y)); + in_index[2] = get_global_id(2) % DEPTH_OUT; + in_index[3] = (get_global_id(2) / DEPTH_OUT) % BATCH_IN; - if (in_index[0] < 0 || in_index[0] >= WIDTH_IN || in_index[1] < 0 || in_index[1] >= HEIGHT_IN) - { - *((__global DATA_TYPE *)out.ptr) = (DATA_TYPE)ZERO_VALUE; - } - else - { - *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2], in_index[3])); - } + if (in_index[0] < 0 || in_index[0] >= WIDTH_IN || in_index[1] < 0 || in_index[1] >= HEIGHT_IN) + { + *((__global DATA_TYPE *)out.ptr) = (DATA_TYPE)ZERO_VALUE; + } + else + { + *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset( + &in, in_index[0], in_index[1], in_index[2], in_index[3])); + } } -#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE) +#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && + // defined(WIDTH_IN) && defined(ZERO_VALUE) -#if defined(DATA_TYPE) && defined(HEIGHT_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE) && defined(VEC_SIZE) +#if defined(DATA_TYPE) && defined(HEIGHT_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && \ + defined(WIDTH_IN) && defined(ZERO_VALUE) && defined(VEC_SIZE) /** Perform space to batch with input of 4D and NHWC format * * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Output tensor depth should be given as a preprocessor argument using -DHEIGHT_OUT=size. e.g. -DHEIGHT_OUT=16 - * @attention Input tensor batch should be given as a preprocessor argument using -DBATCH_IN=size. e.g. -DBATCH_IN=16 - * @attention Input tensor height should be given as a preprocessor argument using -DHEIGHT_IN=size. e.g. -DHEIGHT_IN=16 - * @attention Input tensor width should be given as a preprocessor argument using -DHEIGHT_IN=size. e.g. -DWIDTH_IN=16 + * @attention Output tensor depth should be given as a preprocessor argument using + * -DHEIGHT_OUT=size. e.g. -DHEIGHT_OUT=16 + * @attention Input tensor batch should be given as a preprocessor argument using -DBATCH_IN=size. + * e.g. -DBATCH_IN=16 + * @attention Input tensor height should be given as a preprocessor argument using -DHEIGHT_IN=size. + * e.g. -DHEIGHT_IN=16 + * @attention Input tensor width should be given as a preprocessor argument using -DHEIGHT_IN=size. + * e.g. -DWIDTH_IN=16 * @attention The value to be set by pad value using -DZERO_VALUE=value. e.g. -DZERO_VALUE=0 - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. + * -DVEC_SIZE=16 * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] block_size_ptr Pointer to the source tensor. Supported data types: S32 - * @param[in] block_size_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] block_size_step_x block_size_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] block_size_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] padding_size_ptr Pointer to the source tensor. Supported data types: S32 - * @param[in] padding_size_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] padding_size_step_x padding_size_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] padding_size_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] padding_size_step_y padding_size_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] padding_size_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] input_ptr Pointer to the source tensor. Supported + * data types: U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source tensor in X + * dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along + * X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y + * dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along + * Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z + * dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along + * Z processed per workitem(in bytes) + * @param[in] input_stride_w Stride of the destination tensor in W + * dimension (in bytes) + * @param[in] input_step_w input_stride_w * number of elements along + * W processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the + * source tensor + * @param[out] output_ptr Pointer to the destination tensor. + * Supported data types: same as @p + * input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X + * dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y + * dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z + * dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements + * along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the destination tensor in W + * dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements + * along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + * @param[in] block_size_ptr Pointer to the source tensor. Supported + * data types: S32 + * @param[in] block_size_stride_x Stride of the source tensor in X + * dimension (in bytes) + * @param[in] block_size_step_x block_size_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] block_size_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + * @param[in] padding_size_ptr Pointer to the source tensor. Supported + * data types: S32 + * @param[in] padding_size_stride_x Stride of the source tensor in X + * dimension (in bytes) + * @param[in] padding_size_step_x padding_size_stride_x * number of + * elements along X processed per workitem + * (in bytes) + * @param[in] padding_size_stride_y Stride of the source tensor in Y + * dimension (in bytes) + * @param[in] padding_size_step_y padding_size_stride_y * number of + * elements along Y processed per workitem + * (in bytes) + * @param[in] padding_size_offset_first_element_in_bytes The offset of the first element in the + * destination tensor */ -__kernel void space_to_batch_4d_nhwc(TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output), +__kernel void space_to_batch_4d_nhwc(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output), VECTOR_DECLARATION(block_size), IMAGE_DECLARATION(padding_size)) { - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, HEIGHT_OUT); + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, HEIGHT_OUT); - int block_size_x = *((__global int *)(block_size_ptr)); - int block_size_y = *((__global int *)(block_size_ptr + block_size_stride_x)); - int shift_x = (get_global_id(2) / HEIGHT_OUT / BATCH_IN) % block_size_x; - int shift_y = (get_global_id(2) / HEIGHT_OUT / BATCH_IN) / block_size_x; + int block_size_x = *((__global int *)(block_size_ptr)); + int block_size_y = *((__global int *)(block_size_ptr + block_size_stride_x)); + int shift_x = (get_global_id(2) / HEIGHT_OUT / BATCH_IN) % block_size_x; + int shift_y = (get_global_id(2) / HEIGHT_OUT / BATCH_IN) / block_size_x; - int in_index[4] = {0, }; - in_index[0] = get_global_id(0) * VEC_SIZE; - in_index[1] = get_global_id(1) * block_size_x + shift_x - *((__global int *)(padding_size_ptr)); - in_index[2] = get_global_id(2) % HEIGHT_OUT * block_size_y + shift_y - *((__global int *)(padding_size_ptr + padding_size_stride_y)); - in_index[3] = (get_global_id(2) / HEIGHT_OUT) % BATCH_IN; + int in_index[4] = { + 0, + }; + in_index[0] = get_global_id(0) * VEC_SIZE; + in_index[1] = get_global_id(1) * block_size_x + shift_x - *((__global int *)(padding_size_ptr)); + in_index[2] = get_global_id(2) % HEIGHT_OUT * block_size_y + shift_y - + *((__global int *)(padding_size_ptr + padding_size_stride_y)); + in_index[3] = (get_global_id(2) / HEIGHT_OUT) % BATCH_IN; - if (in_index[1] < 0 || in_index[1] >= WIDTH_IN || in_index[2] < 0 || in_index[2] >= HEIGHT_IN) - { - VSTORE(VEC_SIZE)((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))ZERO_VALUE, 0, (__global DATA_TYPE *)out.ptr); - } - else - { - VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2], in_index[3])), - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), - 0, (__global DATA_TYPE *)out.ptr); - } + if (in_index[1] < 0 || in_index[1] >= WIDTH_IN || in_index[2] < 0 || in_index[2] >= HEIGHT_IN) + { + VSTORE(VEC_SIZE) + ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))ZERO_VALUE, 0, (__global DATA_TYPE *)out.ptr); + } + else + { + VSTORE(VEC_SIZE) + (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], + in_index[2], in_index[3])), + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), + 0, (__global DATA_TYPE *)out.ptr); + } } -#endif // defined(DATA_TYPE) && defined(HEIGHT_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE) && defined(VEC_SIZE) +#endif // defined(DATA_TYPE) && defined(HEIGHT_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && + // defined(WIDTH_IN) && defined(ZERO_VALUE) && defined(VEC_SIZE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl index 20eeb38..a26e762 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl @@ -20,104 +20,142 @@ /** Perform space to depth rearrangement of tensor * * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16 - * @attention The value of the z-axis of input tensor depth should be given as a preprocessor argument using -DZ_IN=size. e.g. -DZ_IN=16 - * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. -DBLOCK_SIZE=1 + * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. + * e.g. -DDEPTH_IN=16 + * @attention The value of the z-axis of input tensor depth should be given as a preprocessor + * argument using -DZ_IN=size. e.g. -DZ_IN=16 + * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. + * -DBLOCK_SIZE=1 * - * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p inpu -t_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in -bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] input_ptr Pointer to the source image. Supported data + * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[out] output_ptr Pointer to the destination image. Supported data + * types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] output_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination image */ -__kernel void space_to_depth_nchw( - TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output)) - { - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, Z_IN); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); +__kernel void space_to_depth_nchw(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output)) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, Z_IN); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); - int out_index[4]={0}; - int in_index[4]={0}; + int out_index[4] = {0}; + int in_index[4] = {0}; - in_index[0] = get_global_id(0);//W - in_index[1] = get_global_id(1);//H - in_index[2] = get_global_id(2) % Z_IN;//C - in_index[3] = get_global_id(2) / Z_IN;//B + in_index[0] = get_global_id(0); // W + in_index[1] = get_global_id(1); // H + in_index[2] = get_global_id(2) % Z_IN; // C + in_index[3] = get_global_id(2) / Z_IN; // B - out_index[0] = in_index[0]/BLOCK_SIZE; - out_index[1] = in_index[1]/BLOCK_SIZE; - out_index[2] = in_index[2] + ((in_index[1] % BLOCK_SIZE) * BLOCK_SIZE + in_index[0] % BLOCK_SIZE) * DEPTH_IN; - out_index[3] = in_index[3]; + out_index[0] = in_index[0] / BLOCK_SIZE; + out_index[1] = in_index[1] / BLOCK_SIZE; + out_index[2] = + in_index[2] + ((in_index[1] % BLOCK_SIZE) * BLOCK_SIZE + in_index[0] % BLOCK_SIZE) * DEPTH_IN; + out_index[3] = in_index[3]; - *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0],out_index[1],out_index[2],out_index[3])) = *((__global DATA_TYPE *)in.ptr); - } + *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0], out_index[1], out_index[2], + out_index[3])) = *((__global DATA_TYPE *)in.ptr); +} #endif // defined(DATA_TYPE) && defined(Z_IN) && defined(BLOCK_SIZE) && defined(Z_IN) #if defined(DATA_TYPE) && defined(Z_IN) && defined(BLOCK_SIZE) && defined(Z_IN) /** Perform space to depth rearrangement of tensor * * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16 - * @attention The value of the z-axis of input tensor depth should be given as a preprocessor argument using -DZ_IN=size. e.g. -DZ_IN=16 - * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. -DBLOCK_SIZE=1 + * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. + * e.g. -DDEPTH_IN=16 + * @attention The value of the z-axis of input tensor depth should be given as a preprocessor + * argument using -DZ_IN=size. e.g. -DZ_IN=16 + * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. + * -DBLOCK_SIZE=1 * - * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p inpu -t_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in -bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] input_ptr Pointer to the source image. Supported data + * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[out] output_ptr Pointer to the destination image. Supported data + * types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] output_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination image */ -__kernel void space_to_depth_nhwc( - TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output)) - { - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, Z_IN); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); +__kernel void space_to_depth_nhwc(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output)) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, Z_IN); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); - int out_index[4]={0}; - int in_index[4]={0}; + int out_index[4] = {0}; + int in_index[4] = {0}; - in_index[0] = get_global_id(0);//C - in_index[1] = get_global_id(1);//W - in_index[2] = get_global_id(2) % Z_IN;//H - in_index[3] = get_global_id(2) / Z_IN;//B + in_index[0] = get_global_id(0); // C + in_index[1] = get_global_id(1); // W + in_index[2] = get_global_id(2) % Z_IN; // H + in_index[3] = get_global_id(2) / Z_IN; // B - out_index[0] = in_index[0] + ((in_index[2] % BLOCK_SIZE) * BLOCK_SIZE + in_index[1] % BLOCK_SIZE) * DEPTH_IN; - out_index[1] = in_index[1]/BLOCK_SIZE; - out_index[2] = in_index[2]/BLOCK_SIZE; - out_index[3] = in_index[3]; + out_index[0] = + in_index[0] + ((in_index[2] % BLOCK_SIZE) * BLOCK_SIZE + in_index[1] % BLOCK_SIZE) * DEPTH_IN; + out_index[1] = in_index[1] / BLOCK_SIZE; + out_index[2] = in_index[2] / BLOCK_SIZE; + out_index[3] = in_index[3]; - *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0],out_index[1],out_index[2],out_index[3])) = *((__global DATA_TYPE *)in.ptr); - } + *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0], out_index[1], out_index[2], + out_index[3])) = *((__global DATA_TYPE *)in.ptr); +} #endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/squared_difference.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/squared_difference.cl index 3e1a5c9..0e1e246 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/squared_difference.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/squared_difference.cl @@ -24,52 +24,74 @@ /** Returns true value of squared_difference of two tensors. * * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. + * -DVEC_SIZE=16 * @note Can only take floating point data types. * - * @param[in] input1_ptr Pointer to the source image. Supported data types: F16/F32 - * @param[in] input1_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input1_step_x input1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input1_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input1_step_y input1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input1_step_z input1_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source image - * - * @param[in] input2_ptr Pointer to the source image. Supported data types: F16/F32 - * @param[in] input2_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input2_step_x input2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input2_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input2_step_y input2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input2_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input2_step_z input2_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the source image - * - * @param[out] output_ptr Pointer to the destination image. Supported data types: F16/F32 - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] input1_ptr Pointer to the source image. Supported data + * types: F16/F32 + * @param[in] input1_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] input1_step_x input1_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input1_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] input1_step_y input1_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input1_step_z input1_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[in] input2_ptr Pointer to the source image. Supported data + * types: F16/F32 + * @param[in] input2_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] input2_step_x input2_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input2_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] input2_step_y input2_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input2_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input2_step_z input2_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[out] output_ptr Pointer to the destination image. Supported + * data types: F16/F32 + * @param[in] output_stride_x Stride of the destination image in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination image */ -__kernel void squared_difference( - TENSOR3D_DECLARATION(input1), - TENSOR3D_DECLARATION(input2), - TENSOR3D_DECLARATION(output)) +__kernel void squared_difference(TENSOR3D_DECLARATION(input1), TENSOR3D_DECLARATION(input2), + TENSOR3D_DECLARATION(output)) { - Tensor3D input1 = CONVERT_TO_TENSOR3D_STRUCT(input1); - Tensor3D input2 = CONVERT_TO_TENSOR3D_STRUCT(input2); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + Tensor3D input1 = CONVERT_TO_TENSOR3D_STRUCT(input1); + Tensor3D input2 = CONVERT_TO_TENSOR3D_STRUCT(input2); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - diff = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input1.ptr)- VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + diff = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input1.ptr) - + VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr); - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - sq_diff = diff * diff; + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + sq_diff = diff * diff; - VSTORE(VEC_SIZE) - (sq_diff, 0, (__global DATA_TYPE *)output.ptr); + VSTORE(VEC_SIZE) + (sq_diff, 0, (__global DATA_TYPE *)output.ptr); } #endif // defined(DATA_TYPE) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl index d97f23a..50472e4 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl @@ -17,10 +17,8 @@ #include "helpers.h" -__kernel void topkv2_init(VECTOR_DECLARATION(input), - __global float* in_key_buf, - __global int* in_ind_buf, - const int n) +__kernel void topkv2_init(VECTOR_DECLARATION(input), __global float *in_key_buf, + __global int *in_ind_buf, const int n) { int gid = get_global_id(0); int lws = get_local_size(0); @@ -30,53 +28,54 @@ __kernel void topkv2_init(VECTOR_DECLARATION(input), Vector input = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input); - for(int i = 0; i < iter; ++i) + for (int i = 0; i < iter; ++i) { int idx = i * gws + gid; - in_key_buf[idx] = *(__global float*)(input.ptr + idx * input.stride_x); + in_key_buf[idx] = *(__global float *)(input.ptr + idx * input.stride_x); in_ind_buf[idx] = idx; } } -__kernel void topkv2_find_first_negative( - __global float *out_key_buf, - __global int *first_negative_idx, - int n) +__kernel void topkv2_find_first_negative(__global float *out_key_buf, + __global int *first_negative_idx, int n) { int gid = get_global_id(0); - if( gid == n - 1 ) + if (gid == n - 1) { // if the last item is positive, the first negative index is n. - if( out_key_buf[gid] > 0.f ) + if (out_key_buf[gid] > 0.f) *first_negative_idx = n; - } else if ( gid == 0 ) { + } + else if (gid == 0) + { // if the first item is negative, set it 0. - if( out_key_buf[gid] < 0.f ) + if (out_key_buf[gid] < 0.f) *first_negative_idx = 0; - } else { + } + else + { // if its left is positive and it is negative, then it is the first negative item. - if( out_key_buf[gid-1] > 0.f && out_key_buf[gid] < 0.f ) + if (out_key_buf[gid - 1] > 0.f && out_key_buf[gid] < 0.f) *first_negative_idx = gid; } } -__kernel void topkv2_reorder_negatives( - __global float* in_key_buf, - __global float* out_key_buf, - __global float* in_ind_buf, - __global float* out_ind_buf, - __global int* first_negative_idx, - int n) +__kernel void topkv2_reorder_negatives(__global float *in_key_buf, __global float *out_key_buf, + __global float *in_ind_buf, __global float *out_ind_buf, + __global int *first_negative_idx, int n) { int gid = get_global_id(0); int num_negs = n - *first_negative_idx; int in_idx; - if( gid < num_negs ) { + if (gid < num_negs) + { in_idx = n - 1 - gid; - } else { + } + else + { in_idx = gid - num_negs; } @@ -84,12 +83,8 @@ __kernel void topkv2_reorder_negatives( out_ind_buf[gid] = in_ind_buf[in_idx]; } -__kernel void topkv2_store( - VECTOR_DECLARATION(values), - VECTOR_DECLARATION(indices), - __global float *out_key_buf, - __global int *out_ind_buf, - int n) +__kernel void topkv2_store(VECTOR_DECLARATION(values), VECTOR_DECLARATION(indices), + __global float *out_key_buf, __global int *out_ind_buf, int n) { int gid = get_global_id(0); @@ -98,6 +93,6 @@ __kernel void topkv2_store( int idx = n - 1 - gid; - *(__global float*)(values.ptr + gid * values.stride_x) = out_key_buf[idx]; - *(__global int*)(indices.ptr + gid * indices.stride_x) = out_ind_buf[idx]; + *(__global float *)(values.ptr + gid * values.stride_x) = out_key_buf[idx]; + *(__global int *)(indices.ptr + gid * indices.stride_x) = out_ind_buf[idx]; } diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl index 0292fab..9594daf 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl @@ -17,114 +17,113 @@ #include "helpers.h" -__global inline float* get_vec_elem(Vector* vec, int idx) +__global inline float *get_vec_elem(Vector *vec, int idx) { - return (__global float*)(vec->ptr + idx * vec->stride_x); + return (__global float *)(vec->ptr + idx * vec->stride_x); } -__global inline int* get_vec_elem_int(Vector* vec, int idx) +__global inline int *get_vec_elem_int(Vector *vec, int idx) { - return (__global int*)(vec->ptr + idx * vec->stride_x); + return (__global int *)(vec->ptr + idx * vec->stride_x); } // A utility function to swap two elements void swap(__global float *a, __global float *b) { - float t = *a; - *a = *b; - *b = t; + float t = *a; + *a = *b; + *b = t; } void swap_idx(__global int *a, __global int *b) { - int t = *a; - *a = *b; - *b = t; + int t = *a; + *a = *b; + *b = t; } /* This function is same in both iterative and recursive*/ -int partition (Vector* arr, __global int* indices, int l, int h) +int partition(Vector *arr, __global int *indices, int l, int h) { - float x = *get_vec_elem(arr, h); - int i = (l - 1); + float x = *get_vec_elem(arr, h); + int i = (l - 1); - for (int j = l; j <= h- 1; j++) + for (int j = l; j <= h - 1; j++) + { + if (*get_vec_elem(arr, j) >= x) { - if (*get_vec_elem(arr, j) >= x) - { - i++; - swap (get_vec_elem(arr,i), get_vec_elem(arr,j)); - swap_idx(&indices[i], &indices[j]); - } + i++; + swap(get_vec_elem(arr, i), get_vec_elem(arr, j)); + swap_idx(&indices[i], &indices[j]); } - swap (get_vec_elem(arr, i + 1), get_vec_elem(arr, h)); - swap_idx(&indices[i + 1], &indices[h]); - return (i + 1); + } + swap(get_vec_elem(arr, i + 1), get_vec_elem(arr, h)); + swap_idx(&indices[i + 1], &indices[h]); + return (i + 1); } /* A[] --> Array to be sorted, l --> Starting index, h --> Ending index */ -void quickSortIterative (Vector* arr, __global int* indices, - __global int *stack, int l, int h) +void quickSortIterative(Vector *arr, __global int *indices, __global int *stack, int l, int h) { - // Create an auxiliary stack + // Create an auxiliary stack + + // initialize top of stack + int top = -1; - // initialize top of stack - int top = -1; + // push initial values of l and h to stack + stack[++top] = l; + stack[++top] = h; + + // Keep popping from stack while is not empty + while (top >= 0) + { + // Pop h and l + h = stack[top--]; + l = stack[top--]; - // push initial values of l and h to stack - stack[ ++top ] = l; - stack[ ++top ] = h; + // Set pivot element at its correct position + // in sorted array + int p = partition(arr, indices, l, h); - // Keep popping from stack while is not empty - while ( top >= 0 ) + // If there are elements on left side of pivot, + // then push left side to stack + if (p - 1 > l) { - // Pop h and l - h = stack[ top-- ]; - l = stack[ top-- ]; - - // Set pivot element at its correct position - // in sorted array - int p = partition( arr, indices, l, h ); - - // If there are elements on left side of pivot, - // then push left side to stack - if ( p-1 > l ) - { - stack[ ++top ] = l; - stack[ ++top ] = p - 1; - } - - // If there are elements on right side of pivot, - // then push right side to stack - if ( p+1 < h ) - { - stack[ ++top ] = p + 1; - stack[ ++top ] = h; - } + stack[++top] = l; + stack[++top] = p - 1; } + + // If there are elements on right side of pivot, + // then push right side to stack + if (p + 1 < h) + { + stack[++top] = p + 1; + stack[++top] = h; + } + } } -__kernel void topkv2_quicksort(VECTOR_DECLARATION(input), - VECTOR_DECLARATION(topk_values), VECTOR_DECLARATION(topk_indices), - __global int* indices, __global int* temp_stack, int k, int n) +__kernel void topkv2_quicksort(VECTOR_DECLARATION(input), VECTOR_DECLARATION(topk_values), + VECTOR_DECLARATION(topk_indices), __global int *indices, + __global int *temp_stack, int k, int n) { Vector input = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input); Vector topk_values = CONVERT_TO_VECTOR_STRUCT_NO_STEP(topk_values); Vector topk_indices = CONVERT_TO_VECTOR_STRUCT_NO_STEP(topk_indices); - for( int i = 0; i < n; ++i ) + for (int i = 0; i < n; ++i) { indices[i] = i; } - quickSortIterative(&input, indices, temp_stack, 0, n-1); + quickSortIterative(&input, indices, temp_stack, 0, n - 1); // extract k items. - for(int i = 0; i < k; ++i) + for (int i = 0; i < k; ++i) { - *get_vec_elem(&topk_values, i) = *get_vec_elem(&input, i); + *get_vec_elem(&topk_values, i) = *get_vec_elem(&input, i); *get_vec_elem_int(&topk_indices, i) = indices[i]; } } diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl index c2c2d89..f6830d2 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl @@ -29,11 +29,8 @@ // http://www.codercorner.com/RadixSortRevisited.htm // compute the histogram for each radix and each virtual processor for the pass -__kernel void radixsort_histogram(__global float* in_key_buf, - __global int* d_Histograms, - const int pass, - __local int* loc_histo, - const int n) +__kernel void radixsort_histogram(__global float *in_key_buf, __global int *d_Histograms, + const int pass, __local int *loc_histo, const int n) { int it = get_local_id(0); // i local number of the processor int ig = get_global_id(0); // global number = i + g I @@ -41,45 +38,48 @@ __kernel void radixsort_histogram(__global float* in_key_buf, int gr = get_group_id(0); // g group number int groups = get_num_groups(0); - int items = get_local_size(0); + int items = get_local_size(0); // set the local histograms to zero - for(int ir=0;ir<_RADIX;ir++){ + for (int ir = 0; ir < _RADIX; ir++) + { loc_histo[ir * items + it] = 0; } barrier(CLK_LOCAL_MEM_FENCE); // range of keys that are analyzed by the work item - int size= n/groups/items; // size of the sub-list - int start= ig * size; // beginning of the sub-list + int size = n / groups / items; // size of the sub-list + int start = ig * size; // beginning of the sub-list unsigned int key; - int shortkey,k; + int shortkey, k; // compute the index // the computation depends on the transposition - for(int j = 0; j < size ; j++) { + for (int j = 0; j < size; j++) + { #ifdef TRANSPOSE - k= groups * items * j + ig; + k = groups * items * j + ig; #else - k=j+start; + k = j + start; #endif - key = *((__global unsigned int*)(in_key_buf + k)); + key = *((__global unsigned int *)(in_key_buf + k)); // extract the group of _BITS bits of the pass // the result is in the range 0.._RADIX-1 - shortkey=(( key >> (pass * _BITS)) & (_RADIX-1)); + shortkey = ((key >> (pass * _BITS)) & (_RADIX - 1)); // increment the local histogram - loc_histo[shortkey * items + it ]++; + loc_histo[shortkey * items + it]++; } barrier(CLK_LOCAL_MEM_FENCE); // copy the local histogram to the global one - for(int ir=0;ir<_RADIX;ir++) { + for (int ir = 0; ir < _RADIX; ir++) + { d_Histograms[items * (ir * groups + gr) + it] = loc_histo[ir * items + it]; } @@ -88,98 +88,92 @@ __kernel void radixsort_histogram(__global float* in_key_buf, // initial transpose of the list for improving // coalescent memory access -__kernel void transpose(const __global int* invect, - __global int* outvect, - const int nbcol, - const int nbrow, - const __global int* inperm, - __global int* outperm, - __local int* blockmat, - __local int* blockperm, - const int tilesize){ +__kernel void transpose(const __global int *invect, __global int *outvect, const int nbcol, + const int nbrow, const __global int *inperm, __global int *outperm, + __local int *blockmat, __local int *blockperm, const int tilesize) +{ - int i0 = get_global_id(0)*tilesize; // first row index - int j = get_global_id(1); // column index + int i0 = get_global_id(0) * tilesize; // first row index + int j = get_global_id(1); // column index - int jloc = get_local_id(1); // local column index + int jloc = get_local_id(1); // local column index // fill the cache - for(int iloc=0;iloc> (pass * _BITS)) & (_RADIX-1)); + key = *(__global unsigned int *)(in_key + k); + shortkey = ((key >> (pass * _BITS)) & (_RADIX - 1)); - newpos=loc_histo[shortkey * items + it]; + newpos = loc_histo[shortkey * items + it]; #ifdef TRANSPOSE - int ignew,jnew; - ignew= newpos/(n/groups/items); - jnew = newpos%(n/groups/items); - newpost = jnew * (groups*items) + ignew; + int ignew, jnew; + ignew = newpos / (n / groups / items); + jnew = newpos % (n / groups / items); + newpost = jnew * (groups * items) + ignew; #else - newpost=newpos; + newpost = newpos; #endif - //d_outKeys[newpost]= key; // killing line !!! + // d_outKeys[newpost]= key; // killing line !!! out_key[newpost] = org_value; #ifdef PERMUT @@ -187,32 +181,35 @@ __kernel void radixsort_reorder(__global float* in_key, #endif newpos++; - loc_histo[shortkey * items + it]=newpos; + loc_histo[shortkey * items + it] = newpos; } } // perform a parallel prefix sum (a scan) on the local histograms // (see Blelloch 1990) each workitem worries about two memories // see also http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html -__kernel void radixsort_scanhistograms(__global int* histo, __local int* temp, __global int* globsum) +__kernel void radixsort_scanhistograms(__global int *histo, __local int *temp, + __global int *globsum) { int it = get_local_id(0); int ig = get_global_id(0); int decale = 1; - int n=get_local_size(0) * 2 ; - int gr=get_group_id(0); + int n = get_local_size(0) * 2; + int gr = get_group_id(0); // load input into local memory // up sweep phase - temp[2*it] = histo[2*ig]; - temp[2*it+1] = histo[2*ig+1]; + temp[2 * it] = histo[2 * ig]; + temp[2 * it + 1] = histo[2 * ig + 1]; // parallel prefix sum (algorithm of Blelloch 1990) - for (int d = n>>1; d > 0; d >>= 1){ + for (int d = n >> 1; d > 0; d >>= 1) + { barrier(CLK_LOCAL_MEM_FENCE); - if (it < d){ - int ai = decale*(2*it+1)-1; - int bi = decale*(2*it+2)-1; + if (it < d) + { + int ai = decale * (2 * it + 1) - 1; + int bi = decale * (2 * it + 2) - 1; temp[bi] += temp[ai]; } decale *= 2; @@ -221,51 +218,52 @@ __kernel void radixsort_scanhistograms(__global int* histo, __local int* temp, _ // store the last element in the global sum vector // (maybe used in the next step for constructing the global scan) // clear the last element - if (it == 0) { - globsum[gr]=temp[n-1]; + if (it == 0) + { + globsum[gr] = temp[n - 1]; temp[n - 1] = 0; } // down sweep phase - for (int d = 1; d < n; d *= 2){ + for (int d = 1; d < n; d *= 2) + { decale >>= 1; barrier(CLK_LOCAL_MEM_FENCE); - if (it < d){ - int ai = decale*(2*it+1)-1; - int bi = decale*(2*it+2)-1; + if (it < d) + { + int ai = decale * (2 * it + 1) - 1; + int bi = decale * (2 * it + 2) - 1; int t = temp[ai]; temp[ai] = temp[bi]; temp[bi] += t; } - } barrier(CLK_LOCAL_MEM_FENCE); // write results to device memory - histo[2*ig] = temp[2*it]; - histo[2*ig+1] = temp[2*it+1]; + histo[2 * ig] = temp[2 * it]; + histo[2 * ig + 1] = temp[2 * it + 1]; barrier(CLK_GLOBAL_MEM_FENCE); - } // use the global sum for updating the local histograms // each work item updates two values -__kernel void radixsort_pastehistograms( __global int* histo,__global int* globsum) +__kernel void radixsort_pastehistograms(__global int *histo, __global int *globsum) { int ig = get_global_id(0); - int gr=get_group_id(0); + int gr = get_group_id(0); int s; - s=globsum[gr]; + s = globsum[gr]; // write results to device memory - histo[2*ig] += s; - histo[2*ig+1] += s; + histo[2 * ig] += s; + histo[2 * ig + 1] += s; barrier(CLK_GLOBAL_MEM_FENCE); } diff --git a/scripts/command/format-check b/scripts/command/format-check index cd9bfaa..8224870 100644 --- a/scripts/command/format-check +++ b/scripts/command/format-check @@ -68,7 +68,7 @@ check_cpp_files() { DIRECTORIES_NOT_TO_BE_TESTED=$1 # Check c++ files - CPP_FILES_TO_CHECK=$(git ls-files '*.h' '*.cpp' '*.cc' ':!:include/NeuralNetworks.h') + CPP_FILES_TO_CHECK=$(git ls-files '*.h' '*.cpp' '*.cc' '*.cl' ':!:include/NeuralNetworks.h') ARR=($CPP_FILES_TO_CHECK) for s in ${DIRECTORIES_NOT_TO_BE_TESTED[@]}; do skip=${s#'.'/}/ -- 2.7.4