src/core/CL/cl_kernels/permute.cl

   1 /*
   2  * Copyright (c) 2018 ARM Limited.
   3  *
   4  * SPDX-License-Identifier: MIT
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to
   8  * deal in the Software without restriction, including without limitation the
   9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  10  * sell copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in all
  14  * copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24 #include "helpers.h"
  25
  26 #if defined(DATA_TYPE) && defined(DEPTH_IN)
  27 /** Perform a DCHW -> DHWC permute operation on an input tensor.
  28  *
  29  * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
  30  * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
  31  *
  32  * @param[in]  input_ptr                            Pointer to the source image. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
  33  * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
  34  * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
  35  * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
  36  * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
  37  * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
  38  * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
  39  * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
  40  * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
  41  * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
  42  * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
  43  * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
  44  * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
  45  * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
  46  * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
  47  * @param[in]  output_stride_w                      Stride of the source tensor in W dimension (in bytes)
  48  * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
  49  * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
  50  */
  51 __kernel void permute_201(
  52     TENSOR4D_DECLARATION(input),
  53     TENSOR4D_DECLARATION(output))
  54 {
  55     Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN);
  56     Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
  57
  58     *((__global DATA_TYPE *)tensor4D_offset(&out, (get_global_id(2) % DEPTH_IN), get_global_id(0), get_global_id(1), (get_global_id(2) / DEPTH_IN))) = *((__global DATA_TYPE *)in.ptr);
  59 }
  60
  61 /** Perform a DCHW -> DWCH permute operation on an input tensor.
  62  *
  63  * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
  64  * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
  65  *
  66  * @param[in]  input_ptr                            Pointer to the source image. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
  67  * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
  68  * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
  69  * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
  70  * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
  71  * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
  72  * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
  73  * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
  74  * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
  75  * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
  76  * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
  77  * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
  78  * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
  79  * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
  80  * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
  81  * @param[in]  output_stride_w                      Stride of the source tensor in W dimension (in bytes)
  82  * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
  83  * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
  84  */
  85 __kernel void permute_120(
  86     TENSOR4D_DECLARATION(input),
  87     TENSOR4D_DECLARATION(output))
  88 {
  89     Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN);
  90     Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
  91
  92     *((__global DATA_TYPE *)tensor4D_offset(&out, get_global_id(1), (get_global_id(2) % DEPTH_IN), get_global_id(0), (get_global_id(2) / DEPTH_IN))) = *((__global DATA_TYPE *)in.ptr);
  93 }
  94
  95 /** Perform a DCHW -> HWCD permute operation on an input tensor.
  96  *
  97  * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
  98  * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
  99  *
 100  * @param[in]  input_ptr                            Pointer to the source image. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
 101  * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
 102  * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
 103  * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
 104  * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
 105  * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
 106  * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
 107  * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
 108  * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
 109  * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
 110  * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
 111  * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
 112  * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
 113  * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 114  * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
 115  * @param[in]  output_stride_w                      Stride of the source tensor in W dimension (in bytes)
 116  * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
 117  * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
 118  */
 119 __kernel void permute_3201(
 120     TENSOR4D_DECLARATION(input),
 121     TENSOR4D_DECLARATION(output))
 122 {
 123     Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN);
 124     Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
 125
 126     *((__global DATA_TYPE *)tensor4D_offset(&out, (get_global_id(2) / DEPTH_IN), (get_global_id(2) % DEPTH_IN), get_global_id(0), get_global_id(1))) = *((__global DATA_TYPE *)in.ptr);
 127 }
 128 #endif // defined(DATA_TYPE) && defined(DEPTH_IN)