2 * Copyright (c) 2017-2018 ARM Limited.
4 * SPDX-License-Identifier: MIT
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27 #if defined(DEPTH_MULTIPLIER)
28 #if defined(CONV_STRIDE_X)
30 #if CONV_STRIDE_X == 1
31 #define convolution1x3 convolution1x3_stride_1
32 #elif CONV_STRIDE_X == 2
33 #define convolution1x3 convolution1x3_stride_2
34 #elif CONV_STRIDE_X == 3
35 #define convolution1x3 convolution1x3_stride_3
36 #else /* CONV_STRIDE_X */
37 #error "Stride not supported"
38 #endif /* CONV_STRIDE_X */
40 /** Compute a 1D horizontal convolution of size 3 and stride 1 for floating point type.
42 * @param[in] left_pixel Pointer to the left pixel.
43 * @param[in] left_coeff Weight of the left pixel
44 * @param[in] middle_coeff Weight of the middle pixel
45 * @param[in] right_coeff Weight of the right pixel
47 * @return a float2 containing 2 convoluted values.
49 inline float2 convolution1x3_stride_1(__global const uchar *left_pixel,
50 const float left_coeff,
51 const float middle_coeff,
52 const float right_coeff)
54 float4 temp = vload4(0, (__global float *)left_pixel);
56 float2 left = CONVERT(temp.s01, float2);
57 float2 middle = CONVERT(temp.s12, float2);
58 float2 right = CONVERT(temp.s23, float2);
60 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
63 /** Compute a 1D horizontal convolution of size 3 and stride 2 for floating point type.
65 * @param[in] left_pixel Pointer to the left pixel.
66 * @param[in] left_coeff Weight of the left pixel
67 * @param[in] middle_coeff Weight of the middle pixel
68 * @param[in] right_coeff Weight of the right pixel
70 * @return a float2 containing 2 convoluted values.
72 inline float2 convolution1x3_stride_2(__global const uchar *left_pixel,
73 const float left_coeff,
74 const float middle_coeff,
75 const float right_coeff)
77 float4 temp0 = vload4(0, (__global float *)left_pixel);
78 float temp1 = *((__global float *)(left_pixel + 4 * sizeof(float)));
80 float2 left = CONVERT(temp0.s02, float2);
81 float2 middle = CONVERT(temp0.s13, float2);
82 float2 right = CONVERT((float2)(temp0.s2, temp1), float2);
84 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
87 /** Compute a 1D horizontal convolution of size 3 and stride 3 for floating point type.
89 * @param[in] left_pixel Pointer to the left pixel.
90 * @param[in] left_coeff Weight of the left pixel
91 * @param[in] middle_coeff Weight of the middle pixel
92 * @param[in] right_coeff Weight of the right pixel
94 * @return a float2 containing 2 convoluted values.
96 inline float2 convolution1x3_stride_3(__global const uchar *left_pixel,
97 const float left_coeff,
98 const float middle_coeff,
99 const float right_coeff)
101 float4 temp0 = vload4(0, (__global float *)left_pixel);
102 float2 temp1 = vload2(0, (__global float *)(left_pixel + 4 * sizeof(float)));
104 float2 left = CONVERT(temp0.s03, float2);
105 float2 middle = CONVERT((float2)(temp0.s1, temp1.s0), float2);
106 float2 right = CONVERT((float2)(temp0.s2, temp1.s1), float2);
108 return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
111 /** Apply a 3x3 convolution matrix to a single channel F32 input image and return the result.
113 * Convolution matrix layout:
115 * [ mat0, mat1, mat2 ]\n
116 * [ mat3, mat4, mat5 ]\n
117 * [ mat6, mat7, mat8 ]\n
119 * @param[in] src A pointer to source Image structure
120 * @param[in] mat0 Coefficient from the convolution matrix
121 * @param[in] mat1 Coefficient from the convolution matrix
122 * @param[in] mat2 Coefficient from the convolution matrix
123 * @param[in] mat3 Coefficient from the convolution matrix
124 * @param[in] mat4 Coefficient from the convolution matrix
125 * @param[in] mat5 Coefficient from the convolution matrix
126 * @param[in] mat6 Coefficient from the convolution matrix
127 * @param[in] mat0 Coefficient from the convolution matrix
128 * @param[in] mat7 Coefficient from the convolution matrix
129 * @param[in] mat8 Coefficient from the convolution matrix
131 * @return a float2 containing 2 convoluted values.
133 inline float2 convolution3x3(
135 const float mat0, const float mat1, const float mat2,
136 const float mat3, const float mat4, const float mat5,
137 const float mat6, const float mat7, const float mat8)
141 pixels = convolution1x3(offset(src, 0, 0), mat0, mat1, mat2);
142 pixels += convolution1x3(offset(src, 0, 1), mat3, mat4, mat5);
143 pixels += convolution1x3(offset(src, 0, 2), mat6, mat7, mat8);
148 /** This OpenCL kernel computes the depthwise convolution 3x3
150 * @param[in] src_ptr Pointer to the source image. Supported data types: F32
151 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
152 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
153 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
154 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
155 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
156 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
157 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
158 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
159 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
160 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
161 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
162 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
163 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
164 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
165 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
166 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
167 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
168 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
169 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
170 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
171 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
172 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
173 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
174 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16/F32
175 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
176 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
177 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
179 __kernel void depthwise_convolution_3x3(
180 TENSOR3D_DECLARATION(src),
181 TENSOR3D_DECLARATION(dst),
182 TENSOR3D_DECLARATION(weights)
183 #if defined(HAS_BIAS)
185 VECTOR_DECLARATION(biases)
186 #endif //defined(HAS_BIAS)
189 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
190 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
191 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
192 #if defined(HAS_BIAS)
193 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
194 #endif //defined(HAS_BIAS)
196 src.ptr -= (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;
198 uchar3 offset = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
199 float3 weights_values0 = vload3(0, (__global float *)(weights.ptr + offset.s0));
200 float3 weights_values1 = vload3(0, (__global float *)(weights.ptr + offset.s1));
201 float3 weights_values2 = vload3(0, (__global float *)(weights.ptr + offset.s2));
203 float2 pixels = convolution3x3(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
204 weights_values1.s0, weights_values1.s1, weights_values1.s2,
205 weights_values2.s0, weights_values2.s1, weights_values2.s2);
206 #if defined(HAS_BIAS)
207 pixels += (float2)(*((__global float *)(biases.ptr + get_global_id(2) * biases_stride_x)));
208 #endif //defined(HAS_BIAS)
210 vstore2(pixels, 0, (__global float *)dst.ptr);
212 #endif //defined(CONV_STRIDE_X)
214 #define CONVOLUTION1x3_BIFROST2X1_STRIDE1(acc, src0, weights_row0) \
216 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
217 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
218 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
219 acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1); \
220 acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1); \
221 acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1); \
224 #define CONVOLUTION1x3_BIFROST4X1_STRIDE1(acc, src0, weights_row0) \
226 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
227 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
228 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
229 acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1); \
230 acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1); \
231 acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1); \
232 acc.s2 = fma(src0.s2, weights_row0.s0, acc.s2); \
233 acc.s2 = fma(src0.s3, weights_row0.s1, acc.s2); \
234 acc.s2 = fma(src0.s4, weights_row0.s2, acc.s2); \
235 acc.s3 = fma(src0.s3, weights_row0.s0, acc.s3); \
236 acc.s3 = fma(src0.s4, weights_row0.s1, acc.s3); \
237 acc.s3 = fma(src0.s5, weights_row0.s2, acc.s3); \
240 #define CONVOLUTION1x3_BIFROST2X1_STRIDE2(acc, src0, src1, weights_row0) \
242 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
243 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
244 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
245 acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1); \
246 acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1); \
247 acc.s1 = fma(src1.s0, weights_row0.s2, acc.s1); \
250 #define CONVOLUTION1x3_BIFROST4X1_STRIDE2(acc, src0, src1, weights_row0) \
252 acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0); \
253 acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0); \
254 acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0); \
255 acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1); \
256 acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1); \
257 acc.s1 = fma(src0.s4, weights_row0.s2, acc.s1); \
258 acc.s2 = fma(src0.s4, weights_row0.s0, acc.s2); \
259 acc.s2 = fma(src0.s5, weights_row0.s1, acc.s2); \
260 acc.s2 = fma(src0.s6, weights_row0.s2, acc.s2); \
261 acc.s3 = fma(src0.s6, weights_row0.s0, acc.s3); \
262 acc.s3 = fma(src0.s7, weights_row0.s1, acc.s3); \
263 acc.s3 = fma(src1.s0, weights_row0.s2, acc.s3); \
266 /** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
267 * stride_x and stride_y are equal to 1
269 * @param[in] src_ptr Pointer to the source image. Supported data types: F32
270 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
271 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
272 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
273 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
274 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
275 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
276 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
277 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
278 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
279 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
280 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
281 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
282 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
283 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
284 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
285 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
286 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
287 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
288 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
289 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
290 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
291 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
292 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
293 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F32
294 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
295 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
296 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
298 __kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost_f32(
299 TENSOR3D_DECLARATION(src),
300 TENSOR3D_DECLARATION(dst),
301 TENSOR3D_DECLARATION(weights)
302 #if defined(HAS_BIAS)
304 VECTOR_DECLARATION(biases)
305 #endif //defined(HAS_BIAS)
308 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
309 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
310 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
312 float2 pixels0 = 0.0f;
313 float2 pixels1 = 0.0f;
314 float2 pixels2 = 0.0f;
315 float2 pixels3 = 0.0f;
317 __global uchar *weights_addr = (__global uchar *)weights.ptr;
318 __global uchar *src_addr = src.ptr - (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;
321 float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
322 float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
323 float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
325 // Note: Since each work-item computes 4x2 elements, we need to load 6 rows from the input tensor
326 float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
327 float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
328 float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
329 float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
330 float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
331 float4 src50 = vload4(0, (__global float *)(src_addr + 5 * src_stride_y)); // Row5
333 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src00, weights_row0);
334 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src10, weights_row1);
335 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src20, weights_row2);
336 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src10, weights_row0);
337 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src20, weights_row1);
338 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src30, weights_row2);
339 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src20, weights_row0);
340 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src30, weights_row1);
341 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src40, weights_row2);
342 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src30, weights_row0);
343 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src40, weights_row1);
344 CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src50, weights_row2);
347 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
349 float bias = *((__global float *)(vector_offset(&biases, get_global_id(2))));
351 pixels0 += (float2)bias;
352 pixels1 += (float2)bias;
353 pixels2 += (float2)bias;
354 pixels3 += (float2)bias;
355 #endif /* defined(HAS_BIAS) */
357 vstore2(pixels0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
358 vstore2(pixels1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
359 vstore2(pixels2, 0, (__global float *)(dst.ptr + 2 * dst_stride_y));
360 vstore2(pixels3, 0, (__global float *)(dst.ptr + 3 * dst_stride_y));
363 /** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
364 * stride_x and stride_y are equal to 2
366 * @param[in] src_ptr Pointer to the source image. Supported data types: F32
367 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
368 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
369 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
370 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
371 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
372 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
373 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
374 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32
375 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
376 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
377 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
378 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
379 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
380 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
381 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
382 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
383 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
384 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
385 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
386 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
387 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
388 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
389 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
390 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F32
391 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
392 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
393 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
395 __kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f32(
396 TENSOR3D_DECLARATION(src),
397 TENSOR3D_DECLARATION(dst),
398 TENSOR3D_DECLARATION(weights)
399 #if defined(HAS_BIAS)
401 VECTOR_DECLARATION(biases)
402 #endif //defined(HAS_BIAS)
405 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
406 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
407 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
409 float2 pixels0 = 0.0f;
410 float2 pixels1 = 0.0f;
412 __global uchar *weights_addr = (__global uchar *)weights.ptr;
413 __global uchar *src_addr = src.ptr - (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;
416 float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
417 float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
418 float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
420 // Note: Since each work-item computes 4x2 elements, we need to load 5 rows from the input tensor
421 float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
422 float2 src01 = vload2(2, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
423 float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
424 float2 src11 = vload2(2, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
425 float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
426 float2 src21 = vload2(2, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
427 float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
428 float2 src31 = vload2(2, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
429 float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
430 float2 src41 = vload2(2, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
432 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src00, src01, weights_row0);
433 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src10, src11, weights_row1);
434 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src20, src21, weights_row2);
435 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src20, src21, weights_row0);
436 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src30, src31, weights_row1);
437 CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src40, src41, weights_row2);
440 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
442 float bias = *((__global float *)(vector_offset(&biases, get_global_id(2))));
444 pixels0 += (float2)bias;
445 pixels1 += (float2)bias;
446 #endif /* defined(HAS_BIAS) */
448 vstore2(pixels0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
449 vstore2(pixels1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
452 #endif // defined(DEPTH_MULTIPLIER)
454 #if defined(SRC_WIDTH) && defined(DATA_TYPE)
455 /** This kernel reshapes each of the tensor's low three dimensions to single rows.
457 * @note Datatype and source width should be given as a preprocessor argument using -DDATA_TYPE=type and -DSRC_WIDTH=width. e.g. -DSRC_WIDTH=128
459 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
460 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
461 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
462 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
463 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
464 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
465 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
466 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
467 * @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr
468 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
469 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
470 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
471 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
472 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
473 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16/F32
474 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
475 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
476 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
478 __kernel void depthwise_weights_reshape(
479 TENSOR3D_DECLARATION(src),
480 IMAGE_DECLARATION(dst)
483 VECTOR_DECLARATION(biases)
484 #endif /* HAS_BIAS */
487 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
489 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
490 #endif /* HAS_BIAS */
492 __global DATA_TYPE *input_ptr = (__global DATA_TYPE *)src.ptr;
493 __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * SRC_WIDTH * dst_stride_x + get_global_id(2) * dst_stride_y;
495 for(int i = 0; i < SRC_WIDTH; ++i, ++input_ptr)
497 *((__global DATA_TYPE *)(output_ptr + i * dst_stride_x)) = *input_ptr;
500 #if defined(HAS_BIAS)
501 if(get_global_id(1) == 0)
503 *((__global DATA_TYPE *)(output_ptr + SRC_WIDTH * get_global_size(1) * dst_stride_x)) = *((__global DATA_TYPE *)(biases.ptr + get_global_id(2) * biases_stride_x));
505 #endif // defined(HAS_BIAS)
507 #endif //defined(SRC_WIDTH) && defined(DATA_TYPE)
509 #if defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE) && defined(PAD_VALUE) && defined(DEPTH_MULTIPLIER)
510 /** This kernel performs a reshaping of the input tensor to a tensor used to perform depthwise convolution using vector to matrix multiplication.
512 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
513 * @note The convolution information must be passed at compile time using -DSTRIDE_X, -DSTRIDE_Y, -DPAD_LEFT, -DPAD_TOP, -DPAD_RIGHT, -DPAD_BOTTOM, -DKERNEL_WIDHT, -DKERNEL_HEIGHT, -DSRC_WIDTH, -DSRC_HEIGHT, -DDEPTH_MULTIPLIER
515 * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
516 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
517 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
518 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
519 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
520 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
521 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
522 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
523 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
524 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
525 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
526 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
527 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
528 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
529 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
530 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
532 __kernel void depthwise_im2col(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst))
534 Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
536 const int src_pixel_linear = get_global_id(1) * STRIDE_X;
537 const int full_length = SRC_WIDTH + PAD_LEFT + PAD_RIGHT;
538 const int max_initial_x = STRIDE_X * (((full_length - KERNEL_WIDTH) / STRIDE_X) + 1);
540 const int src_x = -PAD_LEFT + src_pixel_linear % max_initial_x;
541 const int src_y = -PAD_TOP + src_pixel_linear / max_initial_x * STRIDE_Y;
542 const int src_z = get_global_id(2) / DEPTH_MULTIPLIER;
544 __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + src_z * src_stride_z;
545 __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst.ptr));
547 for(int y = src_y; y < src_y + KERNEL_HEIGHT; ++y)
549 for(int x = src_x; x < src_x + KERNEL_WIDTH; ++x, ++output_ptr)
551 if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
553 *output_ptr = PAD_VALUE;
557 *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
561 #if defined(HAS_BIAS)
562 *output_ptr = (DATA_TYPE)(1);
563 #endif // defined(HAS_BIAS)
566 #endif //defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(DATA_TYPE) && defined(PAD_VALUE) && defined(DEPTH_MULTIPLIER)
568 #if defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)
570 /** This kernel performs a reshaping of the output of the depthwise generic convolution.
572 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
573 * @note The convolution information must be passed at compile time using -DCONV_WIDTH, -DCONV_HEIGHT, e.g -DCONV_WIDTH=32, -DCONV_HEIGHT=42
575 * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
576 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
577 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
578 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
579 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
580 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
581 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
582 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
583 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
584 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
585 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
586 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
588 __kernel void depthwise_vector_to_tensor(
589 VECTOR_DECLARATION(src),
590 TENSOR3D_DECLARATION(dst))
592 Vector src = CONVERT_TO_VECTOR_STRUCT(src);
594 const int patch_size = CONV_WIDTH * CONV_HEIGHT;
595 const int id0 = get_global_id(0);
596 const int z = id0 / patch_size;
597 const int index2D = id0 - z * patch_size;
599 __global uchar *out_ptr = dst_ptr + dst_offset_first_element_in_bytes + index2D % CONV_WIDTH * dst_stride_x + index2D / CONV_WIDTH * dst_stride_y + z * dst_stride_z;
600 *((__global DATA_TYPE *)out_ptr) = *((__global DATA_TYPE *)src.ptr);
603 #endif //defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)
605 #if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER)
606 #if defined(CONV_STRIDE_X)
607 #if CONV_STRIDE_X == 1
608 #define convolution1x3_f16 convolution1x3_stride_1_f16
609 #elif CONV_STRIDE_X == 2
610 #define convolution1x3_f16 convolution1x3_stride_2_f16
611 #elif CONV_STRIDE_X == 3
612 #define convolution1x3_f16 convolution1x3_stride_3_f16
613 #else /* CONV_STRIDE_X */
614 #error "Stride not supported"
615 #endif /* CONV_STRIDE_X */
617 /** Compute a 1D horizontal convolution of size 3 and stride 1 for 16bit floating point type.
619 * @param[in] left_pixel Pointer to the left pixel.
620 * @param[in] left_coeff Weight of the left pixel
621 * @param[in] middle_coeff Weight of the middle pixel
622 * @param[in] right_coeff Weight of the right pixel
624 * @return a half4 containing 4 convoluted values.
626 inline half4 convolution1x3_stride_1_f16(__global const uchar *left_pixel,
627 const half left_coeff,
628 const half middle_coeff,
629 const half right_coeff)
631 half8 temp = vload8(0, (__global half *)left_pixel);
633 half4 left = CONVERT(temp.s0123, half4);
634 half4 middle = CONVERT(temp.s1234, half4);
635 half4 right = CONVERT(temp.s2345, half4);
637 return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
640 /** Compute a 1D horizontal convolution of size 3 and stride 2 for 16bit floating point type.
642 * @param[in] left_pixel Pointer to the left pixel.
643 * @param[in] left_coeff Weight of the left pixel
644 * @param[in] middle_coeff Weight of the middle pixel
645 * @param[in] right_coeff Weight of the right pixel
647 * @return a half4 containing 4 convoluted values.
649 inline half4 convolution1x3_stride_2_f16(__global const uchar *left_pixel,
650 const half left_coeff,
651 const half middle_coeff,
652 const half right_coeff)
654 half8 temp0 = vload8(0, (__global half *)left_pixel);
655 half temp1 = *((__global half *)(left_pixel + 8 * sizeof(half)));
657 half4 left = CONVERT(temp0.s0246, half4);
658 half4 middle = CONVERT(temp0.s1357, half4);
659 half4 right = CONVERT((half4)(temp0.s246, temp1), half4);
661 return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
664 /** Compute a 1D horizontal convolution of size 3 and stride 3 for 16bit floating point type.
666 * @param[in] left_pixel Pointer to the left pixel.
667 * @param[in] left_coeff Weight of the left pixel
668 * @param[in] middle_coeff Weight of the middle pixel
669 * @param[in] right_coeff Weight of the right pixel
671 * @return a half4 containing 4 convoluted values.
673 inline half4 convolution1x3_stride_3_f16(__global const uchar *left_pixel,
674 const half left_coeff,
675 const half middle_coeff,
676 const half right_coeff)
678 half16 temp0 = vload16(0, (__global half *)left_pixel);
680 half4 left = CONVERT(temp0.s0369, half4);
681 half4 middle = CONVERT(temp0.s147A, half4);
682 half4 right = CONVERT(temp0.s258B, half4);
684 return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
687 /** Apply a 3x3 convolution matrix to a single channel F16 input image and return the result.
689 * Convolution matrix layout:
691 * [ mat0, mat1, mat2 ]\n
692 * [ mat3, mat4, mat5 ]\n
693 * [ mat6, mat7, mat8 ]\n
695 * @param[in] src A pointer to source Image structure
696 * @param[in] mat0 Coefficient from the convolution matrix
697 * @param[in] mat1 Coefficient from the convolution matrix
698 * @param[in] mat2 Coefficient from the convolution matrix
699 * @param[in] mat3 Coefficient from the convolution matrix
700 * @param[in] mat4 Coefficient from the convolution matrix
701 * @param[in] mat5 Coefficient from the convolution matrix
702 * @param[in] mat6 Coefficient from the convolution matrix
703 * @param[in] mat0 Coefficient from the convolution matrix
704 * @param[in] mat7 Coefficient from the convolution matrix
705 * @param[in] mat8 Coefficient from the convolution matrix
707 * @return a half4 containing 4 convoluted values.
709 inline half4 convolution3x3_f16(
711 const half mat0, const half mat1, const half mat2,
712 const half mat3, const half mat4, const half mat5,
713 const half mat6, const half mat7, const half mat8)
717 pixels = convolution1x3_f16(offset(src, 0, 0), mat0, mat1, mat2);
718 pixels += convolution1x3_f16(offset(src, 0, 1), mat3, mat4, mat5);
719 pixels += convolution1x3_f16(offset(src, 0, 2), mat6, mat7, mat8);
724 #if defined(DEPTH_MULTIPLIER)
726 /** This OpenCL kernel computes the depthwise convolution 3x3
728 * @param[in] src_ptr Pointer to the source image. Supported data types: F16
729 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
730 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
731 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
732 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
733 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
734 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
735 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
736 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
737 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
738 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
739 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
740 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
741 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
742 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
743 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
744 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
745 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
746 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
747 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
748 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
749 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
750 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
751 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
752 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16/F32
753 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
754 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
755 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
757 __kernel void depthwise_convolution_3x3_f16(
758 TENSOR3D_DECLARATION(src),
759 TENSOR3D_DECLARATION(dst),
760 TENSOR3D_DECLARATION(weights)
761 #if defined(HAS_BIAS)
763 VECTOR_DECLARATION(biases)
764 #endif //defined(HAS_BIAS)
767 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
768 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
769 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
770 #if defined(HAS_BIAS)
771 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
772 #endif //defined(HAS_BIAS)
774 src.ptr -= (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;
776 uchar3 offset = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
777 half3 weights_values0 = vload3(0, (__global half *)(weights.ptr + offset.s0));
778 half3 weights_values1 = vload3(0, (__global half *)(weights.ptr + offset.s1));
779 half3 weights_values2 = vload3(0, (__global half *)(weights.ptr + offset.s2));
781 half4 pixels = convolution3x3_f16(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
782 weights_values1.s0, weights_values1.s1, weights_values1.s2,
783 weights_values2.s0, weights_values2.s1, weights_values2.s2);
784 #if defined(HAS_BIAS)
785 pixels += (half4)(*((__global half *)(biases.ptr + get_global_id(2) * biases_stride_x)));
786 #endif //defined(HAS_BIAS)
788 vstore4(pixels, 0, (__global half *)dst.ptr);
790 #endif // defined(DEPTH_MULTIPLIER)
791 #endif // defined(CONV_STRIDE_X)
793 /** This OpenCL kernel is optimized for Bifrost architectures and computes the 16bit floating point depthwise convolution 3x3
794 * when both stride_x and stride_y are equal to 1
796 * @param[in] src_ptr Pointer to the source image. Supported data types: F16
797 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
798 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
799 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
800 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
801 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
802 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
803 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
804 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
805 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
806 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
807 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
808 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
809 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
810 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
811 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
812 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
813 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
814 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
815 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
816 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
817 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
818 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
819 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
820 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
821 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
822 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
823 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
825 __kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost_f16(
826 TENSOR3D_DECLARATION(src),
827 TENSOR3D_DECLARATION(dst),
828 TENSOR3D_DECLARATION(weights)
829 #if defined(HAS_BIAS)
831 VECTOR_DECLARATION(biases)
832 #endif //defined(HAS_BIAS)
835 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
836 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
837 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
840 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
842 half bias = *((__global half *)(vector_offset(&biases, get_global_id(2))));
843 #endif /* defined(HAS_BIAS) */
845 half4 pixels0 = 0.0f;
846 half4 pixels1 = 0.0f;
847 half4 pixels2 = 0.0f;
848 half4 pixels3 = 0.0f;
850 __global uchar *weights_addr = (__global uchar *)weights.ptr;
851 __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0) - (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;
854 half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
855 half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
856 half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
858 // Note: Since each work-item computes 4x4 elements, we need to load 6 rows from the input tensor
859 half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
860 half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
861 half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
862 half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
863 half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
864 half8 src50 = vload8(0, (__global half *)(src_addr + 5 * src_stride_y)); // Row5
866 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src00, weights_row0);
867 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src10, weights_row1);
868 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src20, weights_row2);
869 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src10, weights_row0);
870 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src20, weights_row1);
871 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src30, weights_row2);
872 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src20, weights_row0);
873 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src30, weights_row1);
874 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src40, weights_row2);
875 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src30, weights_row0);
876 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src40, weights_row1);
877 CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src50, weights_row2);
880 pixels0 += (half4)bias;
881 pixels1 += (half4)bias;
882 pixels2 += (half4)bias;
883 pixels3 += (half4)bias;
884 #endif /* defined(HAS_BIAS) */
886 vstore4(pixels0, 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
887 vstore4(pixels1, 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
888 vstore4(pixels2, 0, (__global half *)(dst.ptr + 2 * dst_stride_y));
889 vstore4(pixels3, 0, (__global half *)(dst.ptr + 3 * dst_stride_y));
892 /** This OpenCL kernel is optimized for Bifrost architectures and computes 16bit floating point the depthwise convolution 3x3
893 * when both stride_x and stride_y are equal to 2
895 * @param[in] src_ptr Pointer to the source image. Supported data types: F16
896 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
897 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
898 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
899 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
900 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
901 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
902 * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
903 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
904 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
905 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
906 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
907 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
908 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
909 * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
910 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
911 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
912 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
913 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
914 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
915 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
916 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
917 * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
918 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector
919 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
920 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
921 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
922 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
924 __kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16(
925 TENSOR3D_DECLARATION(src),
926 TENSOR3D_DECLARATION(dst),
927 TENSOR3D_DECLARATION(weights)
928 #if defined(HAS_BIAS)
930 VECTOR_DECLARATION(biases)
931 #endif //defined(HAS_BIAS)
934 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
935 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
936 Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
939 Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
941 half bias = *((__global half *)(vector_offset(&biases, get_global_id(2))));
942 #endif /* defined(HAS_BIAS) */
944 half4 pixels0 = 0.0f;
945 half4 pixels1 = 0.0f;
947 __global uchar *weights_addr = (__global uchar *)weights.ptr;
948 __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0) - (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;
951 half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
952 half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
953 half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
955 // Note: Since each work-item computes 2x4 elements, we need to load 5 rows from the input tensor
956 half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
957 half2 src01 = vload2(4, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
958 half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
959 half2 src11 = vload2(4, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
960 half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
961 half2 src21 = vload2(4, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
962 half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
963 half2 src31 = vload2(4, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
964 half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
965 half2 src41 = vload2(4, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
967 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src00, src01, weights_row0);
968 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src10, src11, weights_row1);
969 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src20, src21, weights_row2);
970 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src20, src21, weights_row0);
971 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src30, src31, weights_row1);
972 CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src40, src41, weights_row2);
975 pixels0 += (half4)bias;
976 pixels1 += (half4)bias;
977 #endif /* defined(HAS_BIAS) */
979 vstore4(pixels0, 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
980 vstore4(pixels1, 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
982 #endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER)