src/core/CL/cl_kernels/reduction_operation.cl

   1 /*
   2  * Copyright (c) 2016-2018 ARM Limited.
   3  *
   4  * SPDX-License-Identifier: MIT
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to
   8  * deal in the Software without restriction, including without limitation the
   9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  10  * sell copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in all
  14  * copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24 #include "helpers.h"
  25
  26 /** Calculate square sum of a vector
  27  *
  28  * @param[in] input Pointer to the first pixel.
  29  *
  30  * @return square sum of vector.
  31  */
  32 inline DATA_TYPE square_sum(__global const DATA_TYPE *input)
  33 {
  34     VEC_DATA_TYPE(DATA_TYPE, 16)
  35     in = vload16(0, input);
  36
  37     in *= in;
  38
  39     in.s01234567 += in.s89ABCDEF;
  40     in.s0123 += in.s4567;
  41     in.s01 += in.s23;
  42
  43     return (in.s0 + in.s1);
  44 }
  45
  46 /** Calculate sum of a vector
  47  *
  48  * @param[in] input Pointer to the first pixel.
  49  *
  50  * @return sum of vector.
  51  */
  52 inline DATA_TYPE sum(__global const DATA_TYPE *input)
  53 {
  54     VEC_DATA_TYPE(DATA_TYPE, 16)
  55     in = vload16(0, input);
  56
  57     in.s01234567 += in.s89ABCDEF;
  58     in.s0123 += in.s4567;
  59     in.s01 += in.s23;
  60
  61     return (in.s0 + in.s1);
  62 }
  63
  64 /** This kernel performs reduction given an operation.
  65  *
  66  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
  67  * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
  68  * @note The operation we want to perform must be passed at compile time using -DOPERATION e.g. -DOPERATION=square_sum
  69  *
  70  * @param[in] src_ptr                                   Pointer to the source tensor. Supported data types: F32
  71  * @param[in] src_stride_x                              Stride of the source tensor in X dimension (in bytes)
  72  * @param[in] src_step_x                                src_stride_x * number of elements along X processed per workitem(in bytes)
  73  * @param[in] src_stride_y                              Stride of the source tensor in Y dimension (in bytes)
  74  * @param[in] src_step_y                                src_stride_y * number of elements along Y processed per workitem(in bytes)
  75  * @param[in] src_offset_first_element_in_bytes         The offset of the first element in the source tensor
  76  * @param[in] partial_sum_ptr                           The local buffer to hold sumed values. Supported data types: same as @p src_ptt
  77  * @param[in] partial_sum_stride_x                      Stride of the output tensor in X dimension (in bytes)
  78  * @param[in] partial_sum_step_x                        partial_sum_stride_x * number of elements along X processed per workitem(in bytes)
  79  * @param[in] partial_sum_stride_y                      Stride of the output tensor in Y dimension (in bytes)
  80  * @param[in] partial_sum_step_y                        partial_sum_stride_y * number of elements along Y processed per workitem(in bytes)
  81  * @param[in] partial_sum_offset_first_element_in_bytes The offset of the first element in the source tensor
  82  * @param[in] local_sums                                Local buffer for storing the partial sum
  83  */
  84 __kernel void reduction_operation(
  85     IMAGE_DECLARATION(src),
  86     IMAGE_DECLARATION(partial_sum),
  87     __local DATA_TYPE *local_sums)
  88 {
  89     Image src         = CONVERT_TO_IMAGE_STRUCT(src);
  90     Image partial_sum = CONVERT_TO_IMAGE_STRUCT(partial_sum);
  91
  92     unsigned int lsize = get_local_size(0);
  93     unsigned int lid   = get_local_id(0);
  94
  95     for(unsigned int y = 0; y < get_local_size(1); ++y)
  96     {
  97         local_sums[lid] = OPERATION((__global DATA_TYPE *)offset(&src, 0, y));
  98         barrier(CLK_LOCAL_MEM_FENCE);
  99
 100         // Perform parallel reduction
 101         for(unsigned int i = lsize >> 1; i > 0; i >>= 1)
 102         {
 103             if(lid < i)
 104             {
 105                 local_sums[lid] += local_sums[lid + i];
 106             }
 107             barrier(CLK_LOCAL_MEM_FENCE);
 108         }
 109
 110         if(lid == 0)
 111         {
 112             ((__global DATA_TYPE *)offset(&partial_sum, get_group_id(0), y))[0] = local_sums[0];
 113         }
 114     }
 115 }