src/core/CL/cl_kernels/gaussian_pyramid.cl

   1 /*
   2  * Copyright (c) 2017 ARM Limited.
   3  *
   4  * SPDX-License-Identifier: MIT
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to
   8  * deal in the Software without restriction, including without limitation the
   9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  10  * sell copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in all
  14  * copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24 #include "helpers.h"
  25
  26 /** Computes the Gaussian Filter 1x5 + sub-sampling along the X direction
  27  *
  28  * @note Each thread computes 8 pixels
  29  *
  30  * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8
  31  * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
  32  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  33  * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
  34  * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
  35  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
  36  * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U16
  37  * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
  38  * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
  39  * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
  40  * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
  41  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
  42  */
  43 __kernel void gaussian1x5_sub_x(
  44     IMAGE_DECLARATION(src),
  45     IMAGE_DECLARATION(dst))
  46 {
  47     Image src = CONVERT_TO_IMAGE_STRUCT(src);
  48     Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
  49
  50     // Load values for the convolution (20 bytes needed)
  51     uchar16 temp0 = vload16(0, src.ptr);
  52     uchar4  temp1 = vload4(0, src.ptr + 16);
  53
  54     // Convert to USHORT8
  55     ushort8 l2_data = convert_ushort8((uchar8)(temp0.s02468ACE));
  56     ushort8 l1_data = convert_ushort8((uchar8)(temp0.s13579BDF));
  57     ushort8 m_data  = convert_ushort8((uchar8)(temp0.s2468, temp0.sACE, temp1.s0));
  58     ushort8 r1_data = convert_ushort8((uchar8)(temp0.s3579, temp0.sBDF, temp1.s1));
  59     ushort8 r2_data = convert_ushort8((uchar8)(temp0.s468A, temp0.sCE, temp1.s02));
  60
  61     // Compute convolution along the X direction
  62     ushort8 pixels = l2_data + r2_data;
  63     pixels += l1_data * (ushort8)4;
  64     pixels += m_data * (ushort8)6;
  65     pixels += r1_data * (ushort8)4;
  66
  67     // Store result
  68     vstore8(pixels, 0, (__global ushort *)dst.ptr);
  69 }
  70
  71 /** Computes the Gaussian Filter 5x1 + sub-sampling along the Y direction
  72  *
  73  * @note Each thread computes 8 pixels
  74  *
  75  * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U16
  76  * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
  77  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  78  * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
  79  * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
  80  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
  81  * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: U8
  82  * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
  83  * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
  84  * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
  85  * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
  86  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
  87  */
  88 __kernel void gaussian5x1_sub_y(
  89     IMAGE_DECLARATION(src),
  90     IMAGE_DECLARATION(dst))
  91 {
  92     Image src = CONVERT_TO_IMAGE_STRUCT(src);
  93     Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
  94
  95     // Load values
  96     ushort8 u2_data = vload8(0, (__global ushort *)offset(&src, 0, 0));
  97     ushort8 u1_data = vload8(0, (__global ushort *)offset(&src, 0, 1));
  98     ushort8 m_data  = vload8(0, (__global ushort *)offset(&src, 0, 2));
  99     ushort8 d1_data = vload8(0, (__global ushort *)offset(&src, 0, 3));
 100     ushort8 d2_data = vload8(0, (__global ushort *)offset(&src, 0, 4));
 101
 102     // Compute convolution along the Y direction
 103     ushort8 pixels = u2_data + d2_data;
 104     pixels += u1_data * (ushort8)4;
 105     pixels += m_data * (ushort8)6;
 106     pixels += d1_data * (ushort8)4;
 107
 108     // Scale result
 109     pixels >>= (ushort8)8;
 110
 111     // Store result
 112     vstore8(convert_uchar8_sat(pixels), 0, dst.ptr);
 113 }