src/core/CL/cl_kernels/im2col.cl

   1 /*
   2  * Copyright (c) 2018 ARM Limited.
   3  *
   4  * SPDX-License-Identifier: MIT
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to
   8  * deal in the Software without restriction, including without limitation the
   9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  10  * sell copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in all
  14  * copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24 #include "helpers.h"
  25
  26 #if defined(FIXED_POINT_POSITION)
  27 #include "fixed_point.h"
  28 #endif // FIXED_POINT_POSITION
  29
  30 #if defined(DATA_TYPE) && defined(ELEMENT_SIZE)
  31 #if !defined(FIXED_POINT_POSITION)
  32
  33 #if ELEMENT_SIZE == 1 // COND_DATA_TYPE: integer type picked from ELEMENT_SIZE (1 -> char, 2 -> short, 4 -> int); the kernels in this file use it for their select() masks
  34 #define COND_DATA_TYPE char
  35 #elif ELEMENT_SIZE == 2
  36 #define COND_DATA_TYPE short
  37 #elif ELEMENT_SIZE == 4
  38 #define COND_DATA_TYPE int
  39 #else // ELEMENT_SIZE is neither 1, 2 nor 4: stop the build, no mask type is defined for it
  40 #error "Element size not support"
  41 #endif // ELEMENT_SIZE
  42
  43 #if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(STRIDE_Y) && defined(KERNEL_DEPTH)
  44 /** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM when the kernel size is 1x1 and the stride_x = 1
  45  *
  46  * @note Each work-item processes 4 consecutive elements along X; elements at or beyond SRC_WIDTH are replaced by the first element of the group
  47  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
  48  * @note The width of the input tensor and of the output tensor after matrix multiplication must be passed at compile time using -DSRC_WIDTH and -DCONVOLVED_WIDTH: e.g. -DSRC_WIDTH=34 and -DCONVOLVED_WIDTH=34
  49  * @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3
  50  * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1
  51  * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
  52  *
  53  * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32
  54  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  55  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  56  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
  57  * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
  58  * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
  59  * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
  60  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
  61  * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
  62  * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  63  * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
  64  * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
  65  * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
  66  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
  67  * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
  68  * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
  69  */
  70 __kernel void im2col1x1_stridex1_dchw(
  71     TENSOR3D_DECLARATION(src),
  72     IMAGE_DECLARATION(dst),
  73     uint src_stride_w,
  74     uint dst_stride_w)
  75 {
  76     const uint xc    = get_global_id(0) * 4;            // x coordinate in the convolved tensor (first of the 4 handled here)
  77     const uint yc    = get_global_id(1);                // y coordinate in the convolved tensor
  78     const uint ch    = get_global_id(2) % KERNEL_DEPTH; // input feature map
  79     const uint batch = get_global_id(2) / KERNEL_DEPTH; // batch index
  80
  81     // x coordinates of the 4 elements handled by this work-item
  82     // Lanes at or beyond SRC_WIDTH are clamped back to "xc", which is expected to be a valid coordinate
  83     uint4 xc_clamped = xc + (uint4)(0, 1, 2, 3);
  84
  85     // Per-lane mask: set where the x coordinate is inside the source row
  86     const VEC_DATA_TYPE(COND_DATA_TYPE, 4) cond0 = CONVERT((xc_clamped < SRC_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, 4));
  87
  88     xc_clamped = select((uint4)xc, xc_clamped, convert_int4(cond0));
  89
  90     // Calculate input indices
  91     const uint xi = xc;
  92     const uint yi = yc * STRIDE_Y;
  93
  94     // Calculate output indices
  95     const uint  xo = ch;
  96     const uint4 yo = xc_clamped + yc * CONVOLVED_WIDTH; // Index of the convolution (one destination row per lane)
  97
  98     // Get input and output address
  99     __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_x + yi * src_stride_y + ch * src_stride_z + batch * src_stride_w;
 100
 101     __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + batch * dst_stride_w;
 102
 103     VEC_DATA_TYPE(DATA_TYPE, 4)
 104     data = vload4(0, (__global DATA_TYPE *)input_ptr);
 105
 106     // Out-of-bound lanes take the first element, so together with the clamped "yo" they rewrite the same value at the same place
 107     data = select((VEC_DATA_TYPE(DATA_TYPE, 4))data.s0, data, cond0);
 108
 109     *(__global DATA_TYPE *)(output_ptr + yo.s0 * dst_stride_y) = data.s0;
 110     *(__global DATA_TYPE *)(output_ptr + yo.s1 * dst_stride_y) = data.s1;
 111     *(__global DATA_TYPE *)(output_ptr + yo.s2 * dst_stride_y) = data.s2;
 112     *(__global DATA_TYPE *)(output_ptr + yo.s3 * dst_stride_y) = data.s3;
 113
 114 #ifdef HAS_BIAS
 115     if(ch == (KERNEL_DEPTH - 1)) // only the last feature map appends the 1 used for the bias, in the element right after its own
 116     {
 117         *((__global DATA_TYPE *)(output_ptr + yo.s0 * dst_stride_y) + 1) = 1.0f;
 118         *((__global DATA_TYPE *)(output_ptr + yo.s1 * dst_stride_y) + 1) = 1.0f;
 119         *((__global DATA_TYPE *)(output_ptr + yo.s2 * dst_stride_y) + 1) = 1.0f;
 120         *((__global DATA_TYPE *)(output_ptr + yo.s3 * dst_stride_y) + 1) = 1.0f;
 121     }
 122 #endif // HAS_BIAS
 123 }
 124 #endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(STRIDE_Y) && defined(KERNEL_DEPTH)
 125
 126 #if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)
 127 /** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM when the kernel size is 3x3
 128  *
 129  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 130  * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
 131  * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
 132  * @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3
 133  * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
 134  * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
 135  * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
 136  * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
 137  *
 138  * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32
 139  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 140  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 141  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 142  * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 143  * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 144  * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 145  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 146  * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
 147  * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 148  * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 149  * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 150  * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 151  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 152  * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
 153  * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
 154  */
 155 __kernel void im2col3x3_dchw(
 156     TENSOR3D_DECLARATION(src),
 157     IMAGE_DECLARATION(dst),
 158     uint src_stride_w,
 159     uint dst_stride_w)
 160 {
 161     const int conv_x  = get_global_id(0);                // column in the convolved tensor
 162     const int conv_y  = get_global_id(1);                // row in the convolved tensor
 163     const int plane   = get_global_id(2) % KERNEL_DEPTH; // input feature map index
 164     const int batch_n = get_global_id(2) / KERNEL_DEPTH; // batch index
 165
 166     // Top-left corner of the 3x3 window in the source plane (negative inside the left/top padding)
 167     const int in_x = conv_x * STRIDE_X - PAD_LEFT;
 168     const int in_y = conv_y * STRIDE_Y - PAD_TOP;
 169
 170     // Destination coordinates
 171     const int out_col = plane * 9;                         // 9 values (3x3) per feature map
 172     const int out_row = conv_y * CONVOLVED_WIDTH + conv_x; // linear index of the window
 173
 174     // Byte addresses of the window origin in the source and of the first value written in the destination
 175     __global uchar *in_addr = src_ptr + src_offset_first_element_in_bytes + in_x * (int)src_stride_x + in_y * (int)src_stride_y + plane * src_stride_z + batch_n * src_stride_w;
 176
 177     __global uchar *out_addr = dst_ptr + dst_offset_first_element_in_bytes + out_col * dst_stride_x + out_row * dst_stride_y + batch_n * dst_stride_w;
 178
 179     VEC_DATA_TYPE(DATA_TYPE, 3)
 180     row_top = vload3(0, (__global DATA_TYPE *)(in_addr));
 181     VEC_DATA_TYPE(DATA_TYPE, 3)
 182     row_mid = vload3(0, (__global DATA_TYPE *)(in_addr + src_stride_y));
 183     VEC_DATA_TYPE(DATA_TYPE, 3)
 184     row_bot = vload3(0, (__global DATA_TYPE *)(in_addr + 2 * src_stride_y));
 185
 186 #if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
 187     // Values whose coordinates fall outside the source plane are replaced by PAD_VALUE
 188     int3 cols = (int3)in_x + (int3)(0, 1, 2);
 189     int3 rows = (int3)in_y + (int3)(0, 1, 2);
 190     int3 cols_ok = cols >= (int3)0 && cols < (int3)SRC_WIDTH; // column test shared by the three rows
 191     VEC_DATA_TYPE(COND_DATA_TYPE, 3)
 192     top_ok = CONVERT((cols_ok && (int3)(rows.s0 >= 0 && rows.s0 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3));
 193     VEC_DATA_TYPE(COND_DATA_TYPE, 3)
 194     mid_ok = CONVERT((cols_ok && (int3)(rows.s1 >= 0 && rows.s1 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3));
 195     VEC_DATA_TYPE(COND_DATA_TYPE, 3)
 196     bot_ok = CONVERT((cols_ok && (int3)(rows.s2 >= 0 && rows.s2 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3));
 197
 198     row_top = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row_top, top_ok);
 199     row_mid = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row_mid, mid_ok);
 200     row_bot = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row_bot, bot_ok);
 201 #endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
 202
 203     vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row_top.s012, row_mid.s012, row_bot.s01), 0, (__global DATA_TYPE *)out_addr); // first 8 of the 9 values
 204     ((__global DATA_TYPE *)out_addr)[8] = row_bot.s2;                                                                    // ninth value
 205
 206 #ifdef HAS_BIAS
 207     if(plane == (KERNEL_DEPTH - 1)) // the last feature map also writes the 1 that follows its nine values
 208     {
 209         ((__global DATA_TYPE *)out_addr)[9] = 1.0f;
 210     }
 211 #endif // HAS_BIAS
 212 }
 213
 214 /** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM when the kernel size is 5x5
 215  *
 216  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 217  * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
 218  * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
 219  * @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3
 220  * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
 221  * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
 222  * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
 223  * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
 224  *
 225  * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32
 226  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 227  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 228  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 229  * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 230  * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 231  * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 232  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 233  * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
 234  * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 235  * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 236  * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 237  * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 238  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 239  * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
 240  * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
 241  */
 242 __kernel void im2col5x5_dchw(
 243     TENSOR3D_DECLARATION(src),
 244     IMAGE_DECLARATION(dst),
 245     uint src_stride_w,
 246     uint dst_stride_w)
 247 {
 248     const int conv_x  = get_global_id(0);                // column in the convolved tensor
 249     const int conv_y  = get_global_id(1);                // row in the convolved tensor
 250     const int plane   = get_global_id(2) % KERNEL_DEPTH; // input feature map index
 251     const int batch_n = get_global_id(2) / KERNEL_DEPTH; // batch index
 252
 253     // Top-left corner of the 5x5 window in the source plane (negative inside the left/top padding)
 254     const int in_x = conv_x * STRIDE_X - PAD_LEFT;
 255     const int in_y = conv_y * STRIDE_Y - PAD_TOP;
 256
 257     // Destination coordinates
 258     const int out_col = plane * 25;                        // 25 values (5x5) per feature map
 259     const int out_row = conv_y * CONVOLVED_WIDTH + conv_x; // linear index of the window
 260
 261 #if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
 262     // Coordinates covered by the window: the first four as a vector, the fifth as a scalar
 263     int4 cols_lo = (int4)in_x + (int4)(0, 1, 2, 3);
 264     int4 rows_lo = (int4)in_y + (int4)(0, 1, 2, 3);
 265     int  col_hi  = in_x + 4;
 266     int  row_hi  = in_y + 4;
 267
 268     // Validity flags: non-zero where the column (or row) lies inside the source plane
 269     VEC_DATA_TYPE(COND_DATA_TYPE, 4)
 270     cols_lo_ok = CONVERT((cols_lo >= (int4)0 && cols_lo < (int4)SRC_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, 4));
 271     VEC_DATA_TYPE(COND_DATA_TYPE, 4)
 272     rows_lo_ok = CONVERT((rows_lo >= (int4)0 && rows_lo < (int4)SRC_HEIGHT), VEC_DATA_TYPE(COND_DATA_TYPE, 4));
 273     COND_DATA_TYPE col_hi_ok = (COND_DATA_TYPE)(col_hi >= 0 && col_hi < SRC_WIDTH);
 274     COND_DATA_TYPE row_hi_ok = (COND_DATA_TYPE)(row_hi >= 0 && row_hi < SRC_HEIGHT);
 275 #endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
 276
 277     // Byte addresses of the window origin in the source and of the first value written in the destination
 278     __global uchar *in_addr = src_ptr + src_offset_first_element_in_bytes + in_x * (int)src_stride_x + in_y * (int)src_stride_y + plane * src_stride_z + batch_n * src_stride_w;
 279
 280     __global uchar *out_addr = dst_ptr + dst_offset_first_element_in_bytes + out_col * dst_stride_x + out_row * dst_stride_y + batch_n * dst_stride_w;
 281
 282     { // window rows 0 and 1
 283         VEC_DATA_TYPE(DATA_TYPE, 4)
 284         first_lo = vload4(0, (__global DATA_TYPE *)in_addr);
 285         DATA_TYPE
 286         first_hi = ((__global DATA_TYPE *)in_addr)[4];
 287
 288         in_addr += src_stride_y;
 289
 290         VEC_DATA_TYPE(DATA_TYPE, 4)
 291         second_lo = vload4(0, (__global DATA_TYPE *)in_addr);
 292         DATA_TYPE
 293         second_hi = ((__global DATA_TYPE *)in_addr)[4];
 294
 295 #if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
 296         VEC_DATA_TYPE(COND_DATA_TYPE, 4)
 297         first_lo_ok = cols_lo_ok && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))rows_lo_ok.s0;
 298         VEC_DATA_TYPE(COND_DATA_TYPE, 4)
 299         second_lo_ok = cols_lo_ok && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))rows_lo_ok.s1;
 300         COND_DATA_TYPE first_hi_ok  = (COND_DATA_TYPE)(col_hi_ok && rows_lo_ok.s0);
 301         COND_DATA_TYPE second_hi_ok = (COND_DATA_TYPE)(col_hi_ok && rows_lo_ok.s1);
 302
 303         // Invalid positions take PAD_VALUE instead of the loaded value
 304         first_lo  = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, first_lo, first_lo_ok);
 305         second_lo = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, second_lo, second_lo_ok);
 306         first_hi  = select((DATA_TYPE)PAD_VALUE, first_hi, first_hi_ok);
 307         second_hi = select((DATA_TYPE)PAD_VALUE, second_hi, second_hi_ok);
 308 #endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
 309
 310         // Ten values are written: the five of the first row followed by the five of the second,
 311         // split into a store of 8 and a store of 2
 312         vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(first_lo.s0123, first_hi, second_lo.s012), 0, (__global DATA_TYPE *)out_addr);
 313         vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(second_lo.s3, second_hi), 0, (__global DATA_TYPE *)out_addr + 8);
 314
 315         in_addr += src_stride_y;
 316         out_addr += 10 * dst_stride_x;
 317     }
 318
 319     { // window rows 2 and 3
 320         VEC_DATA_TYPE(DATA_TYPE, 4)
 321         first_lo = vload4(0, (__global DATA_TYPE *)in_addr);
 322         DATA_TYPE
 323         first_hi = ((__global DATA_TYPE *)in_addr)[4];
 324
 325         in_addr += src_stride_y;
 326
 327         VEC_DATA_TYPE(DATA_TYPE, 4)
 328         second_lo = vload4(0, (__global DATA_TYPE *)in_addr);
 329         DATA_TYPE
 330         second_hi = ((__global DATA_TYPE *)in_addr)[4];
 331
 332 #if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
 333         VEC_DATA_TYPE(COND_DATA_TYPE, 4)
 334         first_lo_ok = cols_lo_ok && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))rows_lo_ok.s2;
 335         VEC_DATA_TYPE(COND_DATA_TYPE, 4)
 336         second_lo_ok = cols_lo_ok && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))rows_lo_ok.s3;
 337         COND_DATA_TYPE first_hi_ok  = (COND_DATA_TYPE)(col_hi_ok && rows_lo_ok.s2);
 338         COND_DATA_TYPE second_hi_ok = (COND_DATA_TYPE)(col_hi_ok && rows_lo_ok.s3);
 339
 340         // Invalid positions take PAD_VALUE instead of the loaded value
 341         first_lo  = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, first_lo, first_lo_ok);
 342         second_lo = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, second_lo, second_lo_ok);
 343         first_hi  = select((DATA_TYPE)PAD_VALUE, first_hi, first_hi_ok);
 344         second_hi = select((DATA_TYPE)PAD_VALUE, second_hi, second_hi_ok);
 345 #endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
 346
 347         // Ten values are written: the five of the first row followed by the five of the second,
 348         // split into a store of 8 and a store of 2
 349         vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(first_lo.s0123, first_hi, second_lo.s012), 0, (__global DATA_TYPE *)out_addr);
 350         vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(second_lo.s3, second_hi), 0, (__global DATA_TYPE *)out_addr + 8);
 351
 352         in_addr += src_stride_y;
 353         out_addr += 10 * dst_stride_x;
 354     }
 355
 356     { // window row 4
 357         VEC_DATA_TYPE(DATA_TYPE, 4)
 358         last_lo = vload4(0, (__global DATA_TYPE *)in_addr);
 359         DATA_TYPE
 360         last_hi = ((__global DATA_TYPE *)in_addr)[4];
 361
 362         in_addr += src_stride_y;
 363
 364 #if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
 365         VEC_DATA_TYPE(COND_DATA_TYPE, 4)
 366         last_lo_ok = cols_lo_ok && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))row_hi_ok;
 367         COND_DATA_TYPE last_hi_ok = (COND_DATA_TYPE)(col_hi_ok && row_hi_ok);
 368
 369         // Invalid positions take PAD_VALUE instead of the loaded value
 370         last_lo = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, last_lo, last_lo_ok);
 371         last_hi = select((DATA_TYPE)PAD_VALUE, last_hi, last_hi_ok);
 372 #endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
 373
 374         vstore4(last_lo, 0, (__global DATA_TYPE *)out_addr);
 375         ((__global DATA_TYPE *)out_addr)[4] = last_hi;
 376
 377         out_addr += 5 * dst_stride_x;
 378     }
 379
 380 #ifdef HAS_BIAS
 381     if(plane == (KERNEL_DEPTH - 1)) // the last feature map also writes the 1 that follows its 25 values
 382     {
 383         ((__global DATA_TYPE *)out_addr)[0] = 1.0f;
 384     }
 385 #endif // HAS_BIAS
 386 }
 387 #endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)
 388
 389 #if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_DEPTH)
 390 /** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM when the kernel size is 11x11
 391  *
 392  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 393  * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
 394  * @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3
 395  * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
 396  * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
 397  *
 398  * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32
 399  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 400  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 401  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 402  * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 403  * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 404  * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 405  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 406  * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
 407  * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 408  * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 409  * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 410  * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 411  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 412  * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
 413  * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
 414  */
 415 __kernel void im2col11x11_padx0_pady0_dchw(
 416     TENSOR3D_DECLARATION(src),
 417     IMAGE_DECLARATION(dst),
 418     uint src_stride_w,
 419     uint dst_stride_w)
 420 {
 421     const int xc    = get_global_id(0);                // x coordinate in the convolved tensor
 422     const int yc    = get_global_id(1);                // y coordinate in the convolved tensor
 423     const int ch    = get_global_id(2) % KERNEL_DEPTH; // input feature map
 424     const int batch = get_global_id(2) / KERNEL_DEPTH; // batch size
 425
 426     // Calculate input indices
 427     const int xi = xc * STRIDE_X;
 428     const int yi = yc * STRIDE_Y;
 429
 430     // Calculate output indices
 431     const int xo = ch * 121;                  // 11x11
 432     const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
 433
 434     // Get input and output address
 435     __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_x + yi * src_stride_y + ch * src_stride_z + batch * src_stride_w;
 436
 437     __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;
 438     {
 439         VEC_DATA_TYPE(DATA_TYPE, 8)
 440         row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
 441         VEC_DATA_TYPE(DATA_TYPE, 3)
 442         row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
 443
 444         vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
 445         vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
 446
 447         input_ptr += src_stride_y;
 448         output_ptr += 11 * src_stride_x;
 449     }
 450
 451     {
 452         VEC_DATA_TYPE(DATA_TYPE, 8)
 453         row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
 454         VEC_DATA_TYPE(DATA_TYPE, 3)
 455         row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
 456
 457         vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
 458         vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
 459
 460         input_ptr += src_stride_y;
 461         output_ptr += 11 * src_stride_x;
 462     }
 463
 464     {
 465         VEC_DATA_TYPE(DATA_TYPE, 8)
 466         row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
 467         VEC_DATA_TYPE(DATA_TYPE, 3)
 468         row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
 469
 470         vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
 471         vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
 472
 473         input_ptr += src_stride_y;
 474         output_ptr += 11 * src_stride_x;
 475     }
 476
 477     {
 478         VEC_DATA_TYPE(DATA_TYPE, 8)
 479         row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
 480         VEC_DATA_TYPE(DATA_TYPE, 3)
 481         row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
 482
 483         vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
 484         vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
 485
 486         input_ptr += src_stride_y;
 487         output_ptr += 11 * src_stride_x;
 488     }
 489
 490     {
 491         VEC_DATA_TYPE(DATA_TYPE, 8)
 492         row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
 493         VEC_DATA_TYPE(DATA_TYPE, 3)
 494         row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
 495
 496         vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
 497         vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
 498
 499         input_ptr += src_stride_y;
 500         output_ptr += 11 * src_stride_x;
 501     }
 502
 503     {
 504         VEC_DATA_TYPE(DATA_TYPE, 8)
 505         row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
 506         VEC_DATA_TYPE(DATA_TYPE, 3)
 507         row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
 508
 509         vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
 510         vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
 511
 512         input_ptr += src_stride_y;
 513         output_ptr += 11 * src_stride_x;
 514     }
 515
 516     {
 517         VEC_DATA_TYPE(DATA_TYPE, 8)
 518         row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
 519         VEC_DATA_TYPE(DATA_TYPE, 3)
 520         row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
 521
 522         vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
 523         vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
 524
 525         input_ptr += src_stride_y;
 526         output_ptr += 11 * src_stride_x;
 527     }
 528
 529     {
 530         VEC_DATA_TYPE(DATA_TYPE, 8)
 531         row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
 532         VEC_DATA_TYPE(DATA_TYPE, 3)
 533         row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
 534
 535         vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
 536         vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
 537
 538         input_ptr += src_stride_y;
 539         output_ptr += 11 * src_stride_x;
 540     }
 541
 542     {
 543         VEC_DATA_TYPE(DATA_TYPE, 8)
 544         row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
 545         VEC_DATA_TYPE(DATA_TYPE, 3)
 546         row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
 547
 548         vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
 549         vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
 550
 551         input_ptr += src_stride_y;
 552         output_ptr += 11 * src_stride_x;
 553     }
 554
 555     {
 556         VEC_DATA_TYPE(DATA_TYPE, 8)
 557         row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
 558         VEC_DATA_TYPE(DATA_TYPE, 3)
 559         row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
 560
 561         vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
 562         vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
 563
 564         input_ptr += src_stride_y;
 565         output_ptr += 11 * src_stride_x;
 566     }
 567
 568     {
 569         VEC_DATA_TYPE(DATA_TYPE, 8)
 570         row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
 571         VEC_DATA_TYPE(DATA_TYPE, 3)
 572         row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
 573
 574         vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
 575         vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
 576
 577         output_ptr += 11 * src_stride_x;
 578     }
 579
 580 #ifdef HAS_BIAS
 581     if(ch == (KERNEL_DEPTH - 1))
 582     {
 583         *((__global DATA_TYPE *)output_ptr) = 1.0f;
 584     }
 585 #endif // HAS_BIAS
 586 }
 587 #endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_DEPTH)
 588 #endif // !defined(FIXED_POINT_POSITION)
 589
 #if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE)
 /** This kernel reshapes the input tensor to a tensor used to perform convolution using GEMM when
  * the kernel width is greater than 1 (except when the kernel size is 3x3) and pad_x == pad_y == 0.
  *
  * Each work-item copies one KERNEL_WIDTH x KERNEL_HEIGHT window of a single input feature map into
  * consecutive elements of one destination row. Every window row is copied with as many
  * VECTOR_SIZE-wide transfers as fit, followed by a single WIDTH_MOD_VECTOR_SIZE-wide transfer for
  * the elements that are left over.
  *
  * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float.
  * @note The vector size must be passed at compile time using -DVECTOR_SIZE e.g. -DVECTOR_SIZE=4.
  * @note The width modulo vector size must be passed at compile time using -DWIDTH_MOD_VECTOR_SIZE e.g. -DWIDTH_MOD_VECTOR_SIZE=3.
  *       It is expected to be equal to KERNEL_WIDTH % VECTOR_SIZE.
  * @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DKERNEL_DEPTH: e.g. -DKERNEL_WIDTH=5, -DKERNEL_HEIGHT=5 and -DKERNEL_DEPTH=64
  * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
  * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
  * @note -DSRC_WIDTH and -DSRC_HEIGHT must also be defined for this kernel to be compiled, although the kernel body does not read them.
  * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
  *
  * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
  * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
  * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
  * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
  * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
  * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
  * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
  */
 __kernel void im2col_generic_padx0_pady0_dchw(
     TENSOR3D_DECLARATION(src),
     IMAGE_DECLARATION(dst),
     uint src_stride_w,
     uint dst_stride_w)
 {
     const int xc    = get_global_id(0);                // x coordinate in the convolved tensor
     const int yc    = get_global_id(1);                // y coordinate in the convolved tensor
     const int ch    = get_global_id(2) % KERNEL_DEPTH; // input feature map
     const int batch = get_global_id(2) / KERNEL_DEPTH; // batch index

     // Calculate input indices (top-left corner of the window, no padding offset is applied)
     const int xi = xc * STRIDE_X;
     const int yi = yc * STRIDE_Y;

     // Calculate output indices
     const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT; // First destination column written for this feature map
     const int yo = xc + yc * CONVOLVED_WIDTH;         // Index of the convolution (destination row)

     __global uchar *input_ptr      = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w;
     __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo;

     // One past the last window column
     const int x_e = xi + KERNEL_WIDTH;

     // Linearize convolution elements
     for(int y = yi, y_e = yi + KERNEL_HEIGHT; y < y_e; ++y)
     {
         // x lives outside the vector loop so that, once the loop is done, it holds the first column
         // that has not been copied yet. This stays correct even if the loop body never executes
         // (KERNEL_WIDTH < VECTOR_SIZE), in which case the remainder must start at xi.
         int x = xi;
         for(; x + VECTOR_SIZE <= x_e; x += VECTOR_SIZE, output_ptr += VECTOR_SIZE)
         {
             VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
             row = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
             VSTORE(VECTOR_SIZE)
             (row, 0, output_ptr);
         }

         // Copy the remainder of the row by doing VLOAD(WIDTH_MOD_VECTOR_SIZE) and VSTORE(WIDTH_MOD_VECTOR_SIZE).
         // At this point x and output_ptr already point just past the last full vector that was copied.
 #if WIDTH_MOD_VECTOR_SIZE == 1
         *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
 #elif WIDTH_MOD_VECTOR_SIZE > 1
         VEC_DATA_TYPE(DATA_TYPE, WIDTH_MOD_VECTOR_SIZE)
         tail = VLOAD(WIDTH_MOD_VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
         VSTORE(WIDTH_MOD_VECTOR_SIZE)
         (tail, 0, output_ptr);
 #endif /* WIDTH_MOD_VECTOR_SIZE */
         output_ptr += WIDTH_MOD_VECTOR_SIZE;
     } /* End of loop over KERNEL_HEIGHT */

 #ifdef HAS_BIAS
     // Only the work-item handling the last feature map appends the bias coefficient, right after its window
     if(ch == (KERNEL_DEPTH - 1))
     {
 #ifdef FIXED_POINT_POSITION
         *output_ptr = (DATA_TYPE)(1 << FIXED_POINT_POSITION); // 1 in fixed-point representation
 #else  // FIXED_POINT_POSITION
         *output_ptr = 1.0f;
 #endif // FIXED_POINT_POSITION
     }
 #endif // HAS_BIAS
 }
 #endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE)
 672
 673 #if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)
 /** Generic im2col kernel: rearranges the input tensor into the matrix consumed by a GEMM-based convolution.
  *
  * Each work-item handles one convolution window of one input feature map. The window is walked row by row
  * and its elements are written to consecutive positions of a single destination row. When any padding is
  * configured, window positions that fall outside the source plane are filled with PAD_VALUE instead of being read.
  *
  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
  * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
  * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
  * @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DKERNEL_DEPTH: e.g. -DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DKERNEL_DEPTH=64
  * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
  * @note The value stored for window positions outside the source plane must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
  * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
  * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
  *
  * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
  * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
  * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
  * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
  * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
  * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
  * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
  */
 __kernel void im2col_generic_dchw(
     TENSOR3D_DECLARATION(src),
     IMAGE_DECLARATION(dst),
     uint src_stride_w,
     uint dst_stride_w)
 {
     // Decode the work-item position: (x, y) in the convolved tensor, feature map and batch index
     const int conv_x   = get_global_id(0);
     const int conv_y   = get_global_id(1);
     const int fmap     = get_global_id(2) % KERNEL_DEPTH;
     const int batch_id = get_global_id(2) / KERNEL_DEPTH;

     // Top-left corner of the window in source coordinates (can be negative when padding is used)
     const int x_begin = conv_x * STRIDE_X - PAD_LEFT;
     const int y_begin = conv_y * STRIDE_Y - PAD_TOP;

     // Destination position: one row per convolution window, one group of columns per feature map
     const int dst_col = fmap * KERNEL_WIDTH * KERNEL_HEIGHT;
     const int dst_row = conv_x + conv_y * CONVOLVED_WIDTH;

     __global uchar *input_ptr      = src_ptr + src_offset_first_element_in_bytes + fmap * src_stride_z + batch_id * src_stride_w;
     __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + dst_row * dst_stride_y + batch_id * dst_stride_w)) + dst_col;

     // Walk the window row by row and write its elements one after the other
     for(int ky = 0; ky < KERNEL_HEIGHT; ++ky)
     {
         const int y = y_begin + ky;

         for(int kx = 0; kx < KERNEL_WIDTH; ++kx)
         {
             const int x = x_begin + kx;

 #if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
             // With padding the window may leave the source plane: only read positions that are inside it
             if(x >= 0 && x < SRC_WIDTH && y >= 0 && y < SRC_HEIGHT)
             {
                 *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
             }
             else
             {
                 *output_ptr = PAD_VALUE;
             }
 #else  // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
             // No padding configured: the bounds test is compiled out
             *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
 #endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0

             ++output_ptr;
         }
     }

 #ifdef HAS_BIAS
     // The work-item of the last feature map appends the constant 1 right after its window
     if(ch_is_last(fmap))
     {
 #ifdef FIXED_POINT_POSITION
         *output_ptr = (DATA_TYPE)(1 << FIXED_POINT_POSITION);
 #else  // FIXED_POINT_POSITION
         *output_ptr = 1.0f;
 #endif // FIXED_POINT_POSITION
     }
 #endif // HAS_BIAS
 }
 755 #endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)
 756
 /** This kernel reshapes the input tensor to a tensor used to perform convolution using GEMM when
  * the kernel width and height are the same as the width and height of the input tensor.
  *
  * Each work-item copies a single source element to the destination vector, at the linear index
  * x + y * width + z * width * height built from its global ids.
  *
  * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
  * @note In case biases will be added in late stage, -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
  *
  * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
  * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
  * @param[out] dst_ptr                           Pointer to the destination tensor. Same as @p src_ptr
  * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
  * @param[in]  width                             The width of the input tensor
  * @param[in]  height                            The height of the input tensor
  */
 __kernel void im2col_reduced_dchw(
     TENSOR3D_DECLARATION(src),
     VECTOR_DECLARATION(dst),
     uint width,
     uint height)
 {
     Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

     // Number of elements in one width x height plane
     const uint plane_elems = width * height;

     // Position of this work-item's element in the flattened destination vector
     const size_t flat_idx = get_global_id(0) + get_global_id(1) * width + get_global_id(2) * plane_elems;

     __global uchar *tmp_out_ptr = dst_ptr + dst_offset_first_element_in_bytes + flat_idx * dst_stride_x;

     // Plain element copy from the source position of this work-item
     *((__global DATA_TYPE *)tmp_out_ptr) = *((__global DATA_TYPE *)src.ptr);

 #ifdef HAS_BIAS
     // Only the very last work-item of the 3D global range appends the bias coefficient
     const bool last_in_x = (get_global_id(0) == (get_global_size(0) - 1));
     const bool last_in_y = (get_global_id(1) == (get_global_size(1) - 1));
     const bool last_in_z = (get_global_id(2) == (get_global_size(2) - 1));

     if(last_in_x && last_in_y && last_in_z)
     {
         // The coefficient goes in the slot right after the element written above
         __global DATA_TYPE *bias_slot = (__global DATA_TYPE *)(tmp_out_ptr + dst_stride_x);
 #ifdef FIXED_POINT_POSITION
         *bias_slot = (DATA_TYPE)(1 << FIXED_POINT_POSITION);
 #else  // FIXED_POINT_POSITION
         *bias_slot = (DATA_TYPE)1.0f;
 #endif // FIXED_POINT_POSITION
     }
 #endif // HAS_BIAS
 }
 804 #endif // defined(DATA_TYPE) && defined(ELEMENT_SIZE)