src/core/CL/cl_kernels/depthwise_convolution.cl

   1 /*
   2  * Copyright (c) 2017-2018 ARM Limited.
   3  *
   4  * SPDX-License-Identifier: MIT
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to
   8  * deal in the Software without restriction, including without limitation the
   9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  10  * sell copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in all
  14  * copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24
  25 #include "helpers.h"
  26
  27 #if defined(CONV_STRIDE_X)
  28
  29 #if CONV_STRIDE_X == 1
  30 #define convolution1x3 convolution1x3_stride_1
  31 #elif CONV_STRIDE_X == 2
  32 #define convolution1x3 convolution1x3_stride_2
  33 #elif CONV_STRIDE_X == 3
  34 #define convolution1x3 convolution1x3_stride_3
  35 #else /* CONV_STRIDE_X */
  36 #error "Stride not supported"
  37 #endif /* CONV_STRIDE_X */
  38
  39 /** Compute a 1D horizontal convolution of size 3 and stride 1 for floating point type.
  40  *
  41  * @param[in] left_pixel   Pointer to the left pixel.
  42  * @param[in] left_coeff   Weight of the left pixel
  43  * @param[in] middle_coeff Weight of the middle pixel
  44  * @param[in] right_coeff  Weight of the right pixel
  45  *
  46  * @return a float2 containing 2 convoluted values.
  47  */
  48 inline float2 convolution1x3_stride_1(__global const uchar *left_pixel,
  49                                       const float           left_coeff,
  50                                       const float           middle_coeff,
  51                                       const float           right_coeff)
  52 {
  53     float4 temp = vload4(0, (__global float *)left_pixel);
  54
  55     float2 left   = CONVERT(temp.s01, float2);
  56     float2 middle = CONVERT(temp.s12, float2);
  57     float2 right  = CONVERT(temp.s23, float2);
  58
  59     return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
  60 }
  61
  62 /** Compute a 1D horizontal convolution of size 3 and stride 2 for floating point type.
  63  *
  64  * @param[in] left_pixel   Pointer to the left pixel.
  65  * @param[in] left_coeff   Weight of the left pixel
  66  * @param[in] middle_coeff Weight of the middle pixel
  67  * @param[in] right_coeff  Weight of the right pixel
  68  *
  69  * @return a float2 containing 2 convoluted values.
  70  */
  71 inline float2 convolution1x3_stride_2(__global const uchar *left_pixel,
  72                                       const float           left_coeff,
  73                                       const float           middle_coeff,
  74                                       const float           right_coeff)
  75 {
  76     float4 temp0 = vload4(0, (__global float *)left_pixel);
  77     float  temp1 = *((__global float *)(left_pixel + 4 * sizeof(float)));
  78
  79     float2 left   = CONVERT(temp0.s02, float2);
  80     float2 middle = CONVERT(temp0.s13, float2);
  81     float2 right  = CONVERT((float2)(temp0.s2, temp1), float2);
  82
  83     return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
  84 }
  85
  86 /** Compute a 1D horizontal convolution of size 3 and stride 3 for floating point type.
  87  *
  88  * @param[in] left_pixel   Pointer to the left pixel.
  89  * @param[in] left_coeff   Weight of the left pixel
  90  * @param[in] middle_coeff Weight of the middle pixel
  91  * @param[in] right_coeff  Weight of the right pixel
  92  *
  93  * @return a float2 containing 2 convoluted values.
  94  */
  95 inline float2 convolution1x3_stride_3(__global const uchar *left_pixel,
  96                                       const float           left_coeff,
  97                                       const float           middle_coeff,
  98                                       const float           right_coeff)
  99 {
 100     float4 temp0 = vload4(0, (__global float *)left_pixel);
 101     float2 temp1 = vload2(0, (__global float *)(left_pixel + 4 * sizeof(float)));
 102
 103     float2 left   = CONVERT(temp0.s03, float2);
 104     float2 middle = CONVERT((float2)(temp0.s1, temp1.s0), float2);
 105     float2 right  = CONVERT((float2)(temp0.s2, temp1.s1), float2);
 106
 107     return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
 108 }
 109
 110 /** Apply a 3x3 convolution matrix to a single channel F32 input image and return the result.
 111  *
 112  * Convolution matrix layout:
 113  *
 114  * [ mat0, mat1, mat2 ]\n
 115  * [ mat3, mat4, mat5 ]\n
 116  * [ mat6, mat7, mat8 ]\n
 117  *
 118  * @param[in] src  A pointer to source Image structure
 119  * @param[in] mat0 Coefficient from the convolution matrix
 120  * @param[in] mat1 Coefficient from the convolution matrix
 121  * @param[in] mat2 Coefficient from the convolution matrix
 122  * @param[in] mat3 Coefficient from the convolution matrix
 123  * @param[in] mat4 Coefficient from the convolution matrix
 124  * @param[in] mat5 Coefficient from the convolution matrix
 125  * @param[in] mat6 Coefficient from the convolution matrix
 126  * @param[in] mat0 Coefficient from the convolution matrix
 127  * @param[in] mat7 Coefficient from the convolution matrix
 128  * @param[in] mat8 Coefficient from the convolution matrix
 129  *
 130  * @return a float2 containing 2 convoluted values.
 131  */
 132 inline float2 convolution3x3(
 133     Image      *src,
 134     const float mat0, const float mat1, const float mat2,
 135     const float mat3, const float mat4, const float mat5,
 136     const float mat6, const float mat7, const float mat8)
 137 {
 138     float2 pixels;
 139
 140     pixels = convolution1x3(offset(src, 0, 0), mat0, mat1, mat2);
 141     pixels += convolution1x3(offset(src, 0, 1), mat3, mat4, mat5);
 142     pixels += convolution1x3(offset(src, 0, 2), mat6, mat7, mat8);
 143
 144     return pixels;
 145 }
 146
 147 /** This OpenCL kernel computes the depthwise convolution 3x3
 148  *
 149  * @param[in] src_ptr                               Pointer to the source image. Supported data types: F32
 150  * @param[in] src_stride_x                          Stride of the source image in X dimension (in bytes)
 151  * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
 152  * @param[in] src_stride_y                          Stride of the source image in Y dimension (in bytes)
 153  * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
 154  * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source image
 155  * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
 156  * @param[in] src_step_z                            src_stride_z * number of elements along Y processed per workitem(in bytes)
 157  * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: F32
 158  * @param[in] dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
 159  * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
 160  * @param[in] dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
 161  * @param[in] dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
 162  * @param[in] dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
 163  * @param[in] dst_step_z                            dst_stride_z * number of elements along Y processed per workitem(in bytes)
 164  * @param[in] dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
 165  * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: F32
 166  * @param[in] weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
 167  * @param[in] weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
 168  * @param[in] weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
 169  * @param[in] weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
 170  * @param[in] weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
 171  * @param[in] weights_step_z                        weights_stride_z * number of elements along Y processed per workitem(in bytes)
 172  * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
 173  * @param[in] biases_ptr                            (Optional) Pointer to the biases vector. Supported data types: F16/F32
 174  * @param[in] biases_stride_x                       (Optional) Stride of the biases vector in X dimension (in bytes)
 175  * @param[in] biases_step_x                         (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
 176  * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
 177  */
 178 __kernel void depthwise_convolution_3x3(
 179     TENSOR3D_DECLARATION(src),
 180     TENSOR3D_DECLARATION(dst),
 181     TENSOR3D_DECLARATION(weights)
 182 #if defined(HAS_BIAS)
 183     ,
 184     VECTOR_DECLARATION(biases)
 185 #endif //defined(HAS_BIAS)
 186 )
 187 {
 188     Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
 189     Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
 190     Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
 191 #if defined(HAS_BIAS)
 192     Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
 193 #endif //defined(HAS_BIAS)
 194
 195     uchar3 offset          = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
 196     float3 weights_values0 = vload3(0, (__global float *)(weights.ptr + offset.s0));
 197     float3 weights_values1 = vload3(0, (__global float *)(weights.ptr + offset.s1));
 198     float3 weights_values2 = vload3(0, (__global float *)(weights.ptr + offset.s2));
 199
 200     float2 pixels = convolution3x3(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
 201                                    weights_values1.s0, weights_values1.s1, weights_values1.s2,
 202                                    weights_values2.s0, weights_values2.s1, weights_values2.s2);
 203 #if defined(HAS_BIAS)
 204     pixels += (float2)(*((__global float *)(biases.ptr + get_global_id(2) * biases_stride_x)));
 205 #endif //defined(HAS_BIAS)
 206
 207     vstore2(pixels, 0, (__global float *)dst.ptr);
 208 }
 209 #endif //defined(CONV_STRIDE_X)
 210
 211 #define CONVOLUTION1x3_BIFROST2X1_STRIDE1(acc, src0, weights_row0) \
 212     ({                                                             \
 213         acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0);            \
 214         acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0);            \
 215         acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0);            \
 216         acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1);            \
 217         acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1);            \
 218         acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1);            \
 219     })
 220
 221 #define CONVOLUTION1x3_BIFROST2X1_STRIDE2(acc, src0, src1, weights_row0) \
 222     ({                                                                   \
 223         acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0);                  \
 224         acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0);                  \
 225         acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0);                  \
 226         acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1);                  \
 227         acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1);                  \
 228         acc.s1 = fma(src1.s0, weights_row0.s2, acc.s1);                  \
 229     })
 230
 231 /** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
 232  * stride_x and stride_y are equal to 1
 233  *
 234  * @param[in] src_ptr                               Pointer to the source image. Supported data types: F32
 235  * @param[in] src_stride_x                          Stride of the source image in X dimension (in bytes)
 236  * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
 237  * @param[in] src_stride_y                          Stride of the source image in Y dimension (in bytes)
 238  * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
 239  * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source image
 240  * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
 241  * @param[in] src_step_z                            src_stride_z * number of elements along Y processed per workitem(in bytes)
 242  * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: F32
 243  * @param[in] dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
 244  * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
 245  * @param[in] dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
 246  * @param[in] dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
 247  * @param[in] dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
 248  * @param[in] dst_step_z                            dst_stride_z * number of elements along Y processed per workitem(in bytes)
 249  * @param[in] dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
 250  * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: F32
 251  * @param[in] weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
 252  * @param[in] weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
 253  * @param[in] weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
 254  * @param[in] weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
 255  * @param[in] weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
 256  * @param[in] weights_step_z                        weights_stride_z * number of elements along Y processed per workitem(in bytes)
 257  * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
 258  * @param[in] biases_ptr                            (Optional) Pointer to the biases vector. Supported data types: F32
 259  * @param[in] biases_stride_x                       (Optional) Stride of the biases vector in X dimension (in bytes)
 260  * @param[in] biases_step_x                         (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
 261  * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
 262  */
 263 __kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost(
 264     TENSOR3D_DECLARATION(src),
 265     TENSOR3D_DECLARATION(dst),
 266     TENSOR3D_DECLARATION(weights)
 267 #if defined(HAS_BIAS)
 268     ,
 269     VECTOR_DECLARATION(biases)
 270 #endif //defined(HAS_BIAS)
 271 )
 272 {
 273     Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
 274     Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
 275     Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
 276
 277     float2 pixels0 = 0.0f;
 278     float2 pixels1 = 0.0f;
 279     float2 pixels2 = 0.0f;
 280     float2 pixels3 = 0.0f;
 281
 282     __global uchar *weights_addr = (__global uchar *)weights.ptr;
 283     __global uchar *src_addr     = (__global uchar *)offset(&src, 0, 0);
 284
 285     // Load the weights
 286     float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
 287     float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
 288     float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
 289
 290     // Note: Since each work-item computes 4x2 elements, we need to load 4 rows from the input tensor
 291     float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
 292     float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
 293     float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
 294     float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
 295     float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row3
 296     float4 src50 = vload4(0, (__global float *)(src_addr + 5 * src_stride_y)); // Row3
 297
 298     CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src00, weights_row0);
 299     CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src10, weights_row1);
 300     CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src20, weights_row2);
 301     CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src10, weights_row0);
 302     CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src20, weights_row1);
 303     CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src30, weights_row2);
 304     CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src20, weights_row0);
 305     CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src30, weights_row1);
 306     CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src40, weights_row2);
 307     CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src30, weights_row0);
 308     CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src40, weights_row1);
 309     CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src50, weights_row2);
 310
 311 #ifdef HAS_BIAS
 312     Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
 313
 314     float bias = *((__global float *)(vector_offset(&biases, get_global_id(2))));
 315
 316     pixels0 += (float2)bias;
 317     pixels1 += (float2)bias;
 318     pixels2 += (float2)bias;
 319     pixels3 += (float2)bias;
 320 #endif /* defined(HAS_BIAS) */
 321
 322     vstore2(pixels0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
 323     vstore2(pixels1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
 324     vstore2(pixels2, 0, (__global float *)(dst.ptr + 2 * dst_stride_y));
 325     vstore2(pixels3, 0, (__global float *)(dst.ptr + 3 * dst_stride_y));
 326 }
 327
 328 /** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
 329  * stride_x and stride_y are equal to 2
 330  *
 331  * @param[in] src_ptr                               Pointer to the source image. Supported data types: F32
 332  * @param[in] src_stride_x                          Stride of the source image in X dimension (in bytes)
 333  * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
 334  * @param[in] src_stride_y                          Stride of the source image in Y dimension (in bytes)
 335  * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
 336  * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source image
 337  * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
 338  * @param[in] src_step_z                            src_stride_z * number of elements along Y processed per workitem(in bytes)
 339  * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: F32
 340  * @param[in] dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
 341  * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
 342  * @param[in] dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
 343  * @param[in] dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
 344  * @param[in] dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
 345  * @param[in] dst_step_z                            dst_stride_z * number of elements along Y processed per workitem(in bytes)
 346  * @param[in] dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
 347  * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: F32
 348  * @param[in] weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
 349  * @param[in] weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
 350  * @param[in] weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
 351  * @param[in] weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
 352  * @param[in] weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
 353  * @param[in] weights_step_z                        weights_stride_z * number of elements along Y processed per workitem(in bytes)
 354  * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
 355  * @param[in] biases_ptr                            (Optional) Pointer to the biases vector. Supported data types: F32
 356  * @param[in] biases_stride_x                       (Optional) Stride of the biases vector in X dimension (in bytes)
 357  * @param[in] biases_step_x                         (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
 358  * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
 359  */
 360 __kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost(
 361     TENSOR3D_DECLARATION(src),
 362     TENSOR3D_DECLARATION(dst),
 363     TENSOR3D_DECLARATION(weights)
 364 #if defined(HAS_BIAS)
 365     ,
 366     VECTOR_DECLARATION(biases)
 367 #endif //defined(HAS_BIAS)
 368 )
 369 {
 370     Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
 371     Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
 372     Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
 373
 374     float2 pixels0 = 0.0f;
 375     float2 pixels1 = 0.0f;
 376
 377     __global uchar *weights_addr = (__global uchar *)weights.ptr;
 378     __global uchar *src_addr     = (__global uchar *)offset(&src, 0, 0);
 379
 380     // Load the weights
 381     float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
 382     float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
 383     float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
 384
 385     // Note: Since each work-item computes 4x2 elements, we need to load 5 rows from the input tensor
 386     float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
 387     float2 src01 = vload2(2, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
 388     float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
 389     float2 src11 = vload2(2, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
 390     float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
 391     float2 src21 = vload2(2, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
 392     float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
 393     float2 src31 = vload2(2, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
 394     float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
 395     float2 src41 = vload2(2, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
 396
 397     CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src00, src01, weights_row0);
 398     CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src10, src11, weights_row1);
 399     CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src20, src21, weights_row2);
 400     CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src20, src21, weights_row0);
 401     CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src30, src31, weights_row1);
 402     CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src40, src41, weights_row2);
 403
 404 #ifdef HAS_BIAS
 405     Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
 406
 407     float bias = *((__global float *)(vector_offset(&biases, get_global_id(2))));
 408
 409     pixels0 += (float2)bias;
 410     pixels1 += (float2)bias;
 411 #endif /* defined(HAS_BIAS) */
 412
 413     vstore2(pixels0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
 414     vstore2(pixels1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
 415 }
 416
 417 #if defined(SRC_WIDTH) && defined(DATA_TYPE)
 418 /** This kernel reshapes each of the tensor's low three dimensions to single rows.
 419  *
 420  * @note Datatype and source width should be given as a preprocessor argument using -DDATA_TYPE=type and -DSRC_WIDTH=width. e.g. -DSRC_WIDTH=128
 421  *
 422  * @param[in]  src_ptr                              Pointer to the source tensor. Supported data types: F16/F32
 423  * @param[in]  src_stride_x                         Stride of the source tensor in X dimension (in bytes)
 424  * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
 425  * @param[in]  src_stride_y                         Stride of the source tensor in Y dimension (in bytes)
 426  * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
 427  * @param[in]  src_stride_z                         Stride of the source tensor in Z dimension (in bytes)
 428  * @param[in]  src_step_z                           src_stride_z * number of elements along Y processed per workitem(in bytes)
 429  * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source tensor
 430  * @param[out] dst_ptr                              Pointer to the destination tensor. Same as @p src_ptr
 431  * @param[in]  dst_stride_x                         Stride of the destination tensor in X dimension (in bytes)
 432  * @param[in]  dst_step_x                           dst_stride_x * number of elements along X processed per workitem(in bytes)
 433  * @param[in]  dst_stride_y                         Stride of the destination tensor in Y dimension (in bytes)
 434  * @param[in]  dst_step_y                           dst_stride_y * number of elements along Y processed per workitem(in bytes)
 435  * @param[in]  dst_offset_first_element_in_bytes    The offset of the first element in the destination tensor
 436  * @param[in]  biases_ptr                           (Optional) Pointer to the biases vector. Supported data types: F16/F32
 437  * @param[in]  biases_stride_x                      (Optional) Stride of the biases vector in X dimension (in bytes)
 438  * @param[in]  biases_step_x                        (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
 439  * @param[in]  biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
 440  */
 441 __kernel void depthwise_weights_reshape(
 442     TENSOR3D_DECLARATION(src),
 443     IMAGE_DECLARATION(dst)
 444 #ifdef HAS_BIAS
 445     ,
 446     VECTOR_DECLARATION(biases)
 447 #endif /* HAS_BIAS */
 448 )
 449 {
 450     Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
 451 #ifdef HAS_BIAS
 452     Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
 453 #endif /* HAS_BIAS */
 454
 455     __global DATA_TYPE *input_ptr = (__global DATA_TYPE *)src.ptr;
 456     __global uchar *output_ptr    = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * SRC_WIDTH * dst_stride_x + get_global_id(2) * dst_stride_y;
 457
 458     for(int i = 0; i < SRC_WIDTH; ++i, ++input_ptr)
 459     {
 460         *((__global DATA_TYPE *)(output_ptr + i * dst_stride_x)) = *input_ptr;
 461     }
 462
 463 #if defined(HAS_BIAS)
 464     if(get_global_id(1) == 0)
 465     {
 466         *((__global DATA_TYPE *)(output_ptr + SRC_WIDTH * get_global_size(1) * dst_stride_x)) = *((__global float *)(biases.ptr + get_global_id(2) * biases_stride_x));
 467     }
 468 #endif // defined(HAS_BIAS)
 469 }
 470 #endif //defined(SRC_WIDTH) && defined(DATA_TYPE)
 471
 472 #if defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE) && defined(PAD_VALUE)
 473 /** This kernel performs a reshaping of the input tensor to a tensor used to perform depthwise convolution using vector to matrix multiplication.
 474  *
 475  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 476  * @note The convolution information must be passed at compile time using -DSTRIDE_X, -DSTRIDE_Y, -DPAD_LEFT, -DPAD_TOP, -DPAD_RIGHT, -DPAD_BOTTOM, -DKERNEL_WIDHT, -DKERNEL_HEIGHT, -DSRC_WIDTH, -DSRC_HEIGHT
 477  *
 478  * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
 479  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 480  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 481  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 482  * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 483  * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 484  * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 485  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 486  * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
 487  * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 488  * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 489  * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 490  * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 491  * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 492  * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 493  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 494  */
 495 __kernel void depthwise_im2col(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst))
 496 {
 497     Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
 498
 499     const int src_pixel_linear = get_global_id(1) * STRIDE_X;
 500     const int full_length      = SRC_WIDTH + PAD_LEFT + PAD_RIGHT;
 501     const int max_initial_x    = STRIDE_X * (((full_length - KERNEL_WIDTH) / STRIDE_X) + 1);
 502
 503     const int src_x = -PAD_LEFT + src_pixel_linear % max_initial_x;
 504     const int src_y = -PAD_TOP + src_pixel_linear / max_initial_x * STRIDE_Y;
 505     const int src_z = get_global_id(2);
 506
 507     __global uchar *input_ptr      = src_ptr + src_offset_first_element_in_bytes + src_z * src_stride_z;
 508     __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst.ptr));
 509
 510     for(int y = src_y; y < src_y + KERNEL_HEIGHT; ++y)
 511     {
 512         for(int x = src_x; x < src_x + KERNEL_WIDTH; ++x, ++output_ptr)
 513         {
 514             if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
 515             {
 516                 *output_ptr = PAD_VALUE;
 517             }
 518             else
 519             {
 520                 *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
 521             }
 522         }
 523     }
 524 #if defined(HAS_BIAS)
 525     *output_ptr = (DATA_TYPE)(1);
 526 #endif // defined(HAS_BIAS)
 527 }
 528
 529 #endif //defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(DATA_TYPE) && defined(PAD_VALUE)
 530
 531 #if defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)
 532
 533 /** This kernel performs a reshaping of the output of the depthwise generic convolution.
 534  *
 535  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 536  * @note The convolution information must be passed at compile time using -DCONV_WIDTH, -DCONV_HEIGHT, e.g -DCONV_WIDTH=32, -DCONV_HEIGHT=42
 537  *
 538  * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
 539  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 540  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 541  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 542  * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
 543  * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 544  * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 545  * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 546  * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 547  * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 548  * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 549  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 550  */
 551 __kernel void depthwise_vector_to_tensor(
 552     VECTOR_DECLARATION(src),
 553     TENSOR3D_DECLARATION(dst))
 554 {
 555     Vector src = CONVERT_TO_VECTOR_STRUCT(src);
 556
 557     const int patch_size = CONV_WIDTH * CONV_HEIGHT;
 558     const int id0        = get_global_id(0);
 559     const int z          = id0 / patch_size;
 560     const int index2D    = id0 - z * patch_size;
 561
 562     __global uchar *out_ptr          = dst_ptr + dst_offset_first_element_in_bytes + index2D % CONV_WIDTH * dst_stride_x + index2D / CONV_WIDTH * dst_stride_y + z * dst_stride_z;
 563     *((__global DATA_TYPE *)out_ptr) = *((__global DATA_TYPE *)src.ptr);
 564 }
 565
 566 #endif //defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)
 567
 568 #if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
 569 #if defined(CONV_STRIDE_X)
 570 #if CONV_STRIDE_X == 1
 571 #define convolution1x3_f16 convolution1x3_stride_1_f16
 572 #elif CONV_STRIDE_X == 2
 573 #define convolution1x3_f16 convolution1x3_stride_2_f16
 574 #elif CONV_STRIDE_X == 3
 575 #define convolution1x3_f16 convolution1x3_stride_3_f16
 576 #else /* CONV_STRIDE_X */
 577 #error "Stride not supported"
 578 #endif /* CONV_STRIDE_X */
 579
 580 /** Compute a 1D horizontal convolution of size 3 and stride 1 for 16bit floating point type.
 581  *
 582  * @param[in] left_pixel   Pointer to the left pixel.
 583  * @param[in] left_coeff   Weight of the left pixel
 584  * @param[in] middle_coeff Weight of the middle pixel
 585  * @param[in] right_coeff  Weight of the right pixel
 586  *
 587  * @return a half4 containing 4 convoluted values.
 588  */
 589 inline half4 convolution1x3_stride_1_f16(__global const uchar *left_pixel,
 590                                          const half            left_coeff,
 591                                          const half            middle_coeff,
 592                                          const half            right_coeff)
 593 {
 594     half8 temp = vload8(0, (__global half *)left_pixel);
 595
 596     half4 left   = CONVERT(temp.s0123, half4);
 597     half4 middle = CONVERT(temp.s1234, half4);
 598     half4 right  = CONVERT(temp.s2345, half4);
 599
 600     return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
 601 }
 602
 603 /** Compute a 1D horizontal convolution of size 3 and stride 2 for 16bit floating point type.
 604  *
 605  * @param[in] left_pixel   Pointer to the left pixel.
 606  * @param[in] left_coeff   Weight of the left pixel
 607  * @param[in] middle_coeff Weight of the middle pixel
 608  * @param[in] right_coeff  Weight of the right pixel
 609  *
 610  * @return a half4 containing 4 convoluted values.
 611  */
 612 inline half4 convolution1x3_stride_2_f16(__global const uchar *left_pixel,
 613                                          const half            left_coeff,
 614                                          const half            middle_coeff,
 615                                          const half            right_coeff)
 616 {
 617     half8 temp0 = vload8(0, (__global half *)left_pixel);
 618     half temp1  = *((__global half *)(left_pixel + 8 * sizeof(half)));
 619
 620     half4 left   = CONVERT(temp0.s0246, half4);
 621     half4 middle = CONVERT(temp0.s1357, half4);
 622     half4 right  = CONVERT((half4)(temp0.s246, temp1), half4);
 623
 624     return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
 625 }
 626
 627 /** Compute a 1D horizontal convolution of size 3 and stride 3 for 16bit floating point type.
 628  *
 629  * @param[in] left_pixel   Pointer to the left pixel.
 630  * @param[in] left_coeff   Weight of the left pixel
 631  * @param[in] middle_coeff Weight of the middle pixel
 632  * @param[in] right_coeff  Weight of the right pixel
 633  *
 634  * @return a half4 containing 4 convoluted values.
 635  */
 636 inline half4 convolution1x3_stride_3_f16(__global const uchar *left_pixel,
 637                                          const half            left_coeff,
 638                                          const half            middle_coeff,
 639                                          const half            right_coeff)
 640 {
 641     half16 temp0 = vload16(0, (__global half *)left_pixel);
 642
 643     half4 left   = CONVERT(temp0.s0369, half4);
 644     half4 middle = CONVERT(temp0.s147A, half4);
 645     half4 right  = CONVERT(temp0.s258B, half4);
 646
 647     return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
 648 }
 649
 650 /** Apply a 3x3 convolution matrix to a single channel F16 input image and return the result.
 651  *
 652  * Convolution matrix layout:
 653  *
 654  * [ mat0, mat1, mat2 ]\n
 655  * [ mat3, mat4, mat5 ]\n
 656  * [ mat6, mat7, mat8 ]\n
 657  *
 658  * @param[in] src  A pointer to source Image structure
 659  * @param[in] mat0 Coefficient from the convolution matrix
 660  * @param[in] mat1 Coefficient from the convolution matrix
 661  * @param[in] mat2 Coefficient from the convolution matrix
 662  * @param[in] mat3 Coefficient from the convolution matrix
 663  * @param[in] mat4 Coefficient from the convolution matrix
 664  * @param[in] mat5 Coefficient from the convolution matrix
 665  * @param[in] mat6 Coefficient from the convolution matrix
 666  * @param[in] mat0 Coefficient from the convolution matrix
 667  * @param[in] mat7 Coefficient from the convolution matrix
 668  * @param[in] mat8 Coefficient from the convolution matrix
 669  *
 670  * @return a half4 containing 4 convoluted values.
 671  */
 672 inline half4 convolution3x3_f16(
 673     Image     *src,
 674     const half mat0, const half mat1, const half mat2,
 675     const half mat3, const half mat4, const half mat5,
 676     const half mat6, const half mat7, const half mat8)
 677 {
 678     half4 pixels;
 679
 680     pixels = convolution1x3_f16(offset(src, 0, 0), mat0, mat1, mat2);
 681     pixels += convolution1x3_f16(offset(src, 0, 1), mat3, mat4, mat5);
 682     pixels += convolution1x3_f16(offset(src, 0, 2), mat6, mat7, mat8);
 683
 684     return pixels;
 685 }
 686
 687 /** This OpenCL kernel computes the depthwise convolution 3x3
 688  *
 689  * @param[in] src_ptr                               Pointer to the source image. Supported data types: F16
 690  * @param[in] src_stride_x                          Stride of the source image in X dimension (in bytes)
 691  * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
 692  * @param[in] src_stride_y                          Stride of the source image in Y dimension (in bytes)
 693  * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
 694  * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source image
 695  * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
 696  * @param[in] src_step_z                            src_stride_z * number of elements along Y processed per workitem(in bytes)
 697  * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: F32
 698  * @param[in] dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
 699  * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
 700  * @param[in] dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
 701  * @param[in] dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
 702  * @param[in] dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
 703  * @param[in] dst_step_z                            dst_stride_z * number of elements along Y processed per workitem(in bytes)
 704  * @param[in] dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
 705  * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: F32
 706  * @param[in] weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
 707  * @param[in] weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
 708  * @param[in] weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
 709  * @param[in] weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
 710  * @param[in] weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
 711  * @param[in] weights_step_z                        weights_stride_z * number of elements along Y processed per workitem(in bytes)
 712  * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
 713  * @param[in] biases_ptr                            (Optional) Pointer to the biases vector. Supported data types: F16/F32
 714  * @param[in] biases_stride_x                       (Optional) Stride of the biases vector in X dimension (in bytes)
 715  * @param[in] biases_step_x                         (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
 716  * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
 717  */
 718 __kernel void depthwise_convolution_3x3_f16(
 719     TENSOR3D_DECLARATION(src),
 720     TENSOR3D_DECLARATION(dst),
 721     TENSOR3D_DECLARATION(weights)
 722 #if defined(HAS_BIAS)
 723     ,
 724     VECTOR_DECLARATION(biases)
 725 #endif //defined(HAS_BIAS)
 726 )
 727 {
 728     Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
 729     Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
 730     Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
 731 #if defined(HAS_BIAS)
 732     Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
 733 #endif //defined(HAS_BIAS)
 734
 735     uchar3 offset         = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
 736     half3 weights_values0 = vload3(0, (__global half *)(weights.ptr + offset.s0));
 737     half3 weights_values1 = vload3(0, (__global half *)(weights.ptr + offset.s1));
 738     half3 weights_values2 = vload3(0, (__global half *)(weights.ptr + offset.s2));
 739
 740     half4 pixels = convolution3x3_f16(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
 741                                       weights_values1.s0, weights_values1.s1, weights_values1.s2,
 742                                       weights_values2.s0, weights_values2.s1, weights_values2.s2);
 743 #if defined(HAS_BIAS)
 744     pixels += (half4)(*((__global half *)(biases.ptr + get_global_id(2) * biases_stride_x)));
 745 #endif //defined(HAS_BIAS)
 746
 747     vstore4(pixels, 0, (__global half *)dst.ptr);
 748 }
 749 #endif // defined(CONV_STRIDE_X)
 750 #endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)