2 * Copyright (c) 2016, 2017 ARM Limited.
4 * SPDX-License-Identifier: MIT
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 #include "arm_compute/core/NEON/kernels/NEConvolutionKernel.h"
26 #include "arm_compute/core/Coordinates.h"
27 #include "arm_compute/core/Error.h"
28 #include "arm_compute/core/Helpers.h"
29 #include "arm_compute/core/ITensor.h"
30 #include "arm_compute/core/TensorInfo.h"
31 #include "arm_compute/core/Types.h"
32 #include "arm_compute/core/Utils.h"
33 #include "arm_compute/core/Validate.h"
34 #include "arm_compute/core/Window.h"
43 using namespace arm_compute;
// Upper clamp applied when narrowing unsigned 16-bit results into a signed 16-bit output.
const uint16x8_t max_int16 = vdupq_n_u16(INT16_MAX);
/** Saturate-narrow two s32 accumulators (pixels [0,3] and [4,7]) and store 8 s16 results. */
inline void store_results(const int32x4_t &out, const int32x4_t &out2, int16_t *output)
{
    const int16x4_t lo = vqmovn_s32(out);
    const int16x4_t hi = vqmovn_s32(out2);
    vst1q_s16(output, vcombine_s16(lo, hi));
}
/** Saturate-narrow two s32 accumulators to unsigned and store 8 u8 results. */
inline void store_results(const int32x4_t &out, const int32x4_t &out2, uint8_t *output)
{
    const uint16x4_t lo        = vqmovun_s32(out);
    const uint16x4_t hi        = vqmovun_s32(out2);
    const uint8x8_t  u8results = vqmovn_u16(vcombine_u16(lo, hi));
    vst1_u8(output, u8results);
}
/** Narrow two u32 accumulators, clamp to INT16_MAX, and store 8 s16 results. */
inline void store_results(const uint32x4_t &out, const uint32x4_t &out2, int16_t *output)
{
    const uint16x8_t narrowed = vcombine_u16(vqmovn_u32(out), vqmovn_u32(out2));
    const uint16x8_t clamped  = vminq_u16(narrowed, max_int16);
    vst1q_s16(output, vreinterpretq_s16_u16(clamped));
}
/** Saturate-narrow two u32 accumulators and store 8 u8 results. */
inline void store_results(const uint32x4_t &out, const uint32x4_t &out2, uint8_t *output)
{
    const uint16x4_t lo        = vqmovn_u32(out);
    const uint16x4_t hi        = vqmovn_u32(out2);
    const uint8x8_t  u8results = vqmovn_u16(vcombine_u16(lo, hi));
    vst1_u8(output, u8results);
}
/** Store 16 s16 results (two 8-lane registers) back to back. */
inline void store_results(const int16x8_t &out, const int16x8_t &out2, int16_t *output)
{
    vst1q_s16(output + 0, out);
    vst1q_s16(output + 8, out2);
}
/** Saturate-narrow two s16 registers to unsigned and store 16 u8 results. */
inline void store_results(const int16x8_t &out, const int16x8_t &out2, uint8_t *output)
{
    const uint8x8_t  lo        = vqmovun_s16(out);
    const uint8x8_t  hi        = vqmovun_s16(out2);
    const uint8x16_t u8results = vcombine_u8(lo, hi);
    vst1q_u8(output, u8results);
}
/** Saturate-narrow two u16 registers and store 16 u8 results. */
inline void store_results(const uint16x8_t &out, const uint16x8_t &out2, uint8_t *output)
{
    const uint8x8_t  lo        = vqmovn_u16(out);
    const uint8x8_t  hi        = vqmovn_u16(out2);
    const uint8x16_t u8results = vcombine_u8(lo, hi);
    vst1q_u8(output, u8results);
}
/** Clamp two u16 registers to INT16_MAX and store 16 s16 results. */
inline void store_results(const uint16x8_t &out, const uint16x8_t &out2, int16_t *output)
{
    vst1q_s16(output + 0, vreinterpretq_s16_u16(vminq_u16(out, max_int16)));
    vst1q_s16(output + 8, vreinterpretq_s16_u16(vminq_u16(out2, max_int16)));
}
/** Accumulate one image row against a 3-tap kernel row whose taps are pre-broadcast.
 *
 * @param[in,out] out      Accumulator for output pixels [0,3]
 * @param[in,out] out2     Accumulator for output pixels [4,7]
 * @param[in]     row_data 16 input pixels (u8); lanes 0..9 are consumed
 * @param[in]     mat0-2   Left/middle/right coefficient, each broadcast over 4 lanes
 */
inline void convolve_row3x1_unrolled(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16x4_t &mat0, const int16x4_t &mat1, const int16x4_t &mat2)
{
    // Widen u8 pixels to s16 and slice into consecutive groups of 4 lanes.
    const int16x8_t wide_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data)));
    const int16x8_t wide_hi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data)));

    const int16x4_t r0 = vget_low_s16(wide_lo);  // pixels 0..3
    const int16x4_t r1 = vget_high_s16(wide_lo); // pixels 4..7
    const int16x4_t r2 = vget_low_s16(wide_hi);  // pixels 8..11

    // Pixels [0,3]: left, middle, right taps.
    out = vmlal_s16(out, r0, mat0);
    out = vmlal_s16(out, vext_s16(r0, r1, 1), mat1);
    out = vmlal_s16(out, vext_s16(r0, r1, 2), mat2);

    // Pixels [4,7]: same taps, shifted window.
    out2 = vmlal_s16(out2, r1, mat0);
    out2 = vmlal_s16(out2, vext_s16(r1, r2, 1), mat1);
    out2 = vmlal_s16(out2, vext_s16(r1, r2, 2), mat2);
}
/** Accumulate one image row against a 3-tap kernel row read from memory. */
inline void convolve_row3x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution)
{
    // Broadcast each tap over a 4-lane register, then reuse the unrolled kernel.
    const int16x4_t tap0 = vld1_dup_s16(convolution + 0);
    const int16x4_t tap1 = vld1_dup_s16(convolution + 1);
    const int16x4_t tap2 = vld1_dup_s16(convolution + 2);

    convolve_row3x1_unrolled(out, out2, row_data, tap0, tap1, tap2);
}
/** Accumulate one image row against a 5-tap kernel row.
 *
 * @param[in,out] out         Accumulator for output pixels [0,3]
 * @param[in,out] out2        Accumulator for output pixels [4,7]
 * @param[in]     row_data    16 input pixels (u8); lanes 0..11 are consumed
 * @param[in]     convolution Pointer to the 5 row coefficients
 */
inline void convolve_row5x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution)
{
    const int16x4_t tap0 = vld1_dup_s16(convolution + 0);
    const int16x4_t tap1 = vld1_dup_s16(convolution + 1);
    const int16x4_t tap2 = vld1_dup_s16(convolution + 2);
    const int16x4_t tap3 = vld1_dup_s16(convolution + 3);
    const int16x4_t tap4 = vld1_dup_s16(convolution + 4);

    // Widen u8 pixels to s16 and slice into consecutive groups of 4 lanes.
    const int16x8_t wide_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data)));
    const int16x8_t wide_hi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data)));

    const int16x4_t r0 = vget_low_s16(wide_lo);  // pixels 0..3
    const int16x4_t r1 = vget_high_s16(wide_lo); // pixels 4..7
    const int16x4_t r2 = vget_low_s16(wide_hi);  // pixels 8..11

    // Pixels [0,3]: taps at offsets -2..+2.
    out = vmlal_s16(out, r0, tap0);
    out = vmlal_s16(out, vext_s16(r0, r1, 1), tap1);
    out = vmlal_s16(out, vext_s16(r0, r1, 2), tap2);
    out = vmlal_s16(out, vext_s16(r0, r1, 3), tap3);
    out = vmlal_s16(out, r1, tap4);

    // Pixels [4,7]: same taps, shifted window.
    out2 = vmlal_s16(out2, r1, tap0);
    out2 = vmlal_s16(out2, vext_s16(r1, r2, 1), tap1);
    out2 = vmlal_s16(out2, vext_s16(r1, r2, 2), tap2);
    out2 = vmlal_s16(out2, vext_s16(r1, r2, 3), tap3);
    out2 = vmlal_s16(out2, r2, tap4);
}
/** Accumulate one image row against a 7-tap kernel row.
 *
 * @param[in,out] out         Accumulator for output pixels [0,3]
 * @param[in,out] out2        Accumulator for output pixels [4,7]
 * @param[in]     row_data    16 input pixels (u8); lanes 0..13 are consumed
 * @param[in]     convolution Pointer to the 7 row coefficients
 */
inline void convolve_row7x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution)
{
    const int16x4_t tap0 = vld1_dup_s16(convolution + 0);
    const int16x4_t tap1 = vld1_dup_s16(convolution + 1);
    const int16x4_t tap2 = vld1_dup_s16(convolution + 2);
    const int16x4_t tap3 = vld1_dup_s16(convolution + 3);
    const int16x4_t tap4 = vld1_dup_s16(convolution + 4);
    const int16x4_t tap5 = vld1_dup_s16(convolution + 5);
    const int16x4_t tap6 = vld1_dup_s16(convolution + 6);

    // Widen u8 pixels to s16 and slice into consecutive groups of 4 lanes.
    const int16x8_t wide_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data)));
    const int16x8_t wide_hi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data)));

    const int16x4_t r0 = vget_low_s16(wide_lo);  // pixels 0..3
    const int16x4_t r1 = vget_high_s16(wide_lo); // pixels 4..7
    const int16x4_t r2 = vget_low_s16(wide_hi);  // pixels 8..11
    const int16x4_t r3 = vget_high_s16(wide_hi); // pixels 12..15

    // Pixels [0,3]: taps at offsets -3..+3.
    out = vmlal_s16(out, r0, tap0);
    out = vmlal_s16(out, vext_s16(r0, r1, 1), tap1);
    out = vmlal_s16(out, vext_s16(r0, r1, 2), tap2);
    out = vmlal_s16(out, vext_s16(r0, r1, 3), tap3);
    out = vmlal_s16(out, r1, tap4);
    out = vmlal_s16(out, vext_s16(r1, r2, 1), tap5);
    out = vmlal_s16(out, vext_s16(r1, r2, 2), tap6);

    // Pixels [4,7]: same taps, shifted window.
    out2 = vmlal_s16(out2, r1, tap0);
    out2 = vmlal_s16(out2, vext_s16(r1, r2, 1), tap1);
    out2 = vmlal_s16(out2, vext_s16(r1, r2, 2), tap2);
    out2 = vmlal_s16(out2, vext_s16(r1, r2, 3), tap3);
    out2 = vmlal_s16(out2, r2, tap4);
    out2 = vmlal_s16(out2, vext_s16(r2, r3, 1), tap5);
    out2 = vmlal_s16(out2, vext_s16(r2, r3, 2), tap6);
}
/** Accumulate one image row against a 9-tap kernel row.
 *
 * @param[in,out] out         Accumulator for output pixels [0,3]
 * @param[in,out] out2        Accumulator for output pixels [4,7]
 * @param[in]     row_data    16 input pixels (u8)
 * @param[in]     convolution Pointer to the 9 row coefficients
 */
inline void convolve_row9x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution)
{
    const int16x4_t tap0 = vld1_dup_s16(convolution + 0);
    const int16x4_t tap1 = vld1_dup_s16(convolution + 1);
    const int16x4_t tap2 = vld1_dup_s16(convolution + 2);
    const int16x4_t tap3 = vld1_dup_s16(convolution + 3);
    const int16x4_t tap4 = vld1_dup_s16(convolution + 4);
    const int16x4_t tap5 = vld1_dup_s16(convolution + 5);
    const int16x4_t tap6 = vld1_dup_s16(convolution + 6);
    const int16x4_t tap7 = vld1_dup_s16(convolution + 7);
    const int16x4_t tap8 = vld1_dup_s16(convolution + 8);

    // Widen u8 pixels to s16 and slice into consecutive groups of 4 lanes.
    const int16x8_t wide_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data)));
    const int16x8_t wide_hi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data)));

    const int16x4_t r0 = vget_low_s16(wide_lo);  // pixels 0..3
    const int16x4_t r1 = vget_high_s16(wide_lo); // pixels 4..7
    const int16x4_t r2 = vget_low_s16(wide_hi);  // pixels 8..11
    const int16x4_t r3 = vget_high_s16(wide_hi); // pixels 12..15

    // Pixels [0,3]: taps at offsets -4..+4.
    out = vmlal_s16(out, r0, tap0);
    out = vmlal_s16(out, vext_s16(r0, r1, 1), tap1);
    out = vmlal_s16(out, vext_s16(r0, r1, 2), tap2);
    out = vmlal_s16(out, vext_s16(r0, r1, 3), tap3);
    out = vmlal_s16(out, r1, tap4);
    out = vmlal_s16(out, vext_s16(r1, r2, 1), tap5);
    out = vmlal_s16(out, vext_s16(r1, r2, 2), tap6);
    out = vmlal_s16(out, vext_s16(r1, r2, 3), tap7);
    out = vmlal_s16(out, r2, tap8);

    // Pixels [4,7]: same taps, shifted window.
    // (NOTE: the original comments here said "[0,3]" — copy-paste slip; the code operates on [4,7].)
    out2 = vmlal_s16(out2, r1, tap0);
    out2 = vmlal_s16(out2, vext_s16(r1, r2, 1), tap1);
    out2 = vmlal_s16(out2, vext_s16(r1, r2, 2), tap2);
    out2 = vmlal_s16(out2, vext_s16(r1, r2, 3), tap3);
    out2 = vmlal_s16(out2, r2, tap4);
    out2 = vmlal_s16(out2, vext_s16(r2, r3, 1), tap5);
    out2 = vmlal_s16(out2, vext_s16(r2, r3, 2), tap6);
    out2 = vmlal_s16(out2, vext_s16(r2, r3, 3), tap7);
    out2 = vmlal_s16(out2, r3, tap8);
}
307 /****************************************************************************************\
308 * Square Convolution *
309 \****************************************************************************************/
311 template <unsigned int matrix_size>
312 NEConvolutionKernel<matrix_size>::NEConvolutionKernel()
313 : INESimpleKernel(), _scale(0), _convolution{ {} }
317 template <unsigned int matrix_size>
318 BorderSize NEConvolutionKernel<matrix_size>::border_size() const
320 return BorderSize(matrix_size / 2);
323 template <unsigned int matrix_size>
324 void NEConvolutionKernel<matrix_size>::configure(const ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, bool border_undefined)
326 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
327 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
328 ARM_COMPUTE_ERROR_ON(conv == nullptr);
333 std::copy_n(conv, _convolution.size(), _convolution.begin());
337 _scale = calculate_matrix_scale(_convolution.data(), matrix_size);
344 // Configure kernel window
345 constexpr unsigned int processed_elements(8);
346 constexpr unsigned int read_elements(16);
347 constexpr unsigned int written_elements(8);
349 Window win = calculate_max_window(*input->info(), Steps(processed_elements), border_undefined, border_size());
350 AccessWindowHorizontal output_access(output->info(), 0, written_elements);
352 update_window_and_padding(win,
353 AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, read_elements, matrix_size),
356 output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
358 INEKernel::configure(win);
361 #ifndef DOXYGEN_SKIP_THIS /* Doxygen gets confused by the templates and can't match the implementation to the declaration */
/** 3x3 convolution: 8 output pixels per iteration, three row passes per pixel block. */
template <>
template <typename OutputType>
void NEConvolutionKernel<3>::convolution(const Window &win)
{
    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
    ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);

    Iterator input(_input, win);
    Iterator output(_output, win);

    // Broadcast each of the nine coefficients into its own 4-lane register once, outside the loop.
    const int16x4_t mat00 = vld1_dup_s16(_convolution.data() + 0);
    const int16x4_t mat01 = vld1_dup_s16(_convolution.data() + 1);
    const int16x4_t mat02 = vld1_dup_s16(_convolution.data() + 2);
    const int16x4_t mat10 = vld1_dup_s16(_convolution.data() + 3);
    const int16x4_t mat11 = vld1_dup_s16(_convolution.data() + 4);
    const int16x4_t mat12 = vld1_dup_s16(_convolution.data() + 5);
    const int16x4_t mat20 = vld1_dup_s16(_convolution.data() + 6);
    const int16x4_t mat21 = vld1_dup_s16(_convolution.data() + 7);
    const int16x4_t mat22 = vld1_dup_s16(_convolution.data() + 8);

    const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);

    // Row base pointers, shifted one pixel left/up so the 3x3 neighbourhood is centred.
    const unsigned char *input_top_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-1, -1));
    const unsigned char *input_mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-1, 0));
    const unsigned char *input_low_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-1, 1));

    execute_window_loop(win, [&](const Coordinates & id)
    {
        int32x4_t acc0 = vdupq_n_s32(0); // output pixels [0,3]
        int32x4_t acc1 = vdupq_n_s32(0); // output pixels [4,7]

        // Top row
        const uint8x16_t row_t = vld1q_u8(input_top_ptr + input.offset());
        convolve_row3x1_unrolled(acc0, acc1, row_t, mat00, mat01, mat02);

        // Middle row
        const uint8x16_t row_m = vld1q_u8(input_mid_ptr + input.offset());
        convolve_row3x1_unrolled(acc0, acc1, row_m, mat10, mat11, mat12);

        // Bottom row (original comment said "middle row" — copy-paste slip)
        const uint8x16_t row_b = vld1q_u8(input_low_ptr + input.offset());
        convolve_row3x1_unrolled(acc0, acc1, row_b, mat20, mat21, mat22);

        // Normalize, skipping the no-op unit scale.
        if(_scale != 1)
        {
            // Convert to F32, scale and convert back to S32
            acc0 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(acc0), scale_val));
            acc1 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(acc1), scale_val));
        }

        // Clamp and store as U8 or S16:
        store_results(acc0, acc1, reinterpret_cast<OutputType *>(output.ptr()));
    },
    input, output);
}
/** 5x5 convolution: 8 output pixels per iteration, five row passes per pixel block. */
template <>
template <typename OutputType>
void NEConvolutionKernel<5>::convolution(const Window &win)
{
    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
    ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);

    Iterator input(_input, win);
    Iterator output(_output, win);

    const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);

    // Row base pointers, shifted two pixels left/up so the 5x5 neighbourhood is centred.
    const unsigned char *input_top2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, -2));
    const unsigned char *input_top1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, -1));
    const unsigned char *input_mid_ptr  = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, 0));
    const unsigned char *input_low1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, 1));
    const unsigned char *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, 2));

    execute_window_loop(win, [&](const Coordinates & id)
    {
        int32x4_t acc0 = vdupq_n_s32(0); // output pixels [0,3]
        int32x4_t acc1 = vdupq_n_s32(0); // output pixels [4,7]

        // Each row is convolved with its own 5 coefficients (rows are stored contiguously).
        const uint8x16_t row_t2 = vld1q_u8(input_top2_ptr + input.offset());
        convolve_row5x1(acc0, acc1, row_t2, _convolution.data());

        const uint8x16_t row_t1 = vld1q_u8(input_top1_ptr + input.offset());
        convolve_row5x1(acc0, acc1, row_t1, _convolution.data() + 5);

        const uint8x16_t row_m = vld1q_u8(input_mid_ptr + input.offset());
        convolve_row5x1(acc0, acc1, row_m, _convolution.data() + 10);

        const uint8x16_t row_b1 = vld1q_u8(input_low1_ptr + input.offset());
        convolve_row5x1(acc0, acc1, row_b1, _convolution.data() + 15);

        const uint8x16_t row_b2 = vld1q_u8(input_low2_ptr + input.offset());
        convolve_row5x1(acc0, acc1, row_b2, _convolution.data() + 20);

        // Normalize, skipping the no-op unit scale.
        if(_scale != 1)
        {
            // Convert to F32, scale and convert back to S32
            acc0 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(acc0), scale_val));
            acc1 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(acc1), scale_val));
        }

        // Clamp and store as U8 or S16:
        store_results(acc0, acc1, reinterpret_cast<OutputType *>(output.ptr()));
    },
    input, output);
}
/** 7x7 convolution: 8 output pixels per iteration, seven row passes per pixel block. */
template <>
template <typename OutputType>
void NEConvolutionKernel<7>::convolution(const Window &win)
{
    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
    ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);

    Iterator input(_input, win);
    Iterator output(_output, win);

    const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);

    // Row base pointers, shifted three pixels left/up so the 7x7 neighbourhood is centred.
    const unsigned char *input_top3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, -3));
    const unsigned char *input_top2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, -2));
    const unsigned char *input_top1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, -1));
    const unsigned char *input_mid_ptr  = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 0));
    const unsigned char *input_low1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 1));
    const unsigned char *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 2));
    const unsigned char *input_low3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 3));

    execute_window_loop(win, [&](const Coordinates & id)
    {
        int32x4_t acc0 = vdupq_n_s32(0); // output pixels [0,3]
        int32x4_t acc1 = vdupq_n_s32(0); // output pixels [4,7]

        // Each row is convolved with its own 7 coefficients (rows are stored contiguously).
        const uint8x16_t row_t3 = vld1q_u8(input_top3_ptr + input.offset());
        convolve_row7x1(acc0, acc1, row_t3, _convolution.data());

        const uint8x16_t row_t2 = vld1q_u8(input_top2_ptr + input.offset());
        convolve_row7x1(acc0, acc1, row_t2, _convolution.data() + 7);

        const uint8x16_t row_t1 = vld1q_u8(input_top1_ptr + input.offset());
        convolve_row7x1(acc0, acc1, row_t1, _convolution.data() + 14);

        const uint8x16_t row_m = vld1q_u8(input_mid_ptr + input.offset());
        convolve_row7x1(acc0, acc1, row_m, _convolution.data() + 21);

        const uint8x16_t row_b1 = vld1q_u8(input_low1_ptr + input.offset());
        convolve_row7x1(acc0, acc1, row_b1, _convolution.data() + 28);

        const uint8x16_t row_b2 = vld1q_u8(input_low2_ptr + input.offset());
        convolve_row7x1(acc0, acc1, row_b2, _convolution.data() + 35);

        const uint8x16_t row_b3 = vld1q_u8(input_low3_ptr + input.offset());
        convolve_row7x1(acc0, acc1, row_b3, _convolution.data() + 42);

        // Normalize, skipping the no-op unit scale.
        if(_scale != 1)
        {
            // Convert to F32, scale and convert back to S32
            acc0 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(acc0), scale_val));
            acc1 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(acc1), scale_val));
        }

        // Clamp and store as U8 or S16:
        store_results(acc0, acc1, reinterpret_cast<OutputType *>(output.ptr()));
    },
    input, output);
}
/** 9x9 convolution: 8 output pixels per iteration, nine row passes per pixel block. */
template <>
template <typename OutputType>
void NEConvolutionKernel<9>::convolution(const Window &win)
{
    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
    ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);

    Iterator input(_input, win);
    Iterator output(_output, win);

    const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);

    // Row base pointers, shifted four pixels left/up so the 9x9 neighbourhood is centred.
    const unsigned char *input_top4_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -4));
    const unsigned char *input_top3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -3));
    const unsigned char *input_top2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -2));
    const unsigned char *input_top1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -1));
    const unsigned char *input_mid_ptr  = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 0));
    const unsigned char *input_low1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 1));
    const unsigned char *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 2));
    const unsigned char *input_low3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 3));
    const unsigned char *input_low4_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 4));

    execute_window_loop(win, [&](const Coordinates & id)
    {
        int32x4_t acc0 = vdupq_n_s32(0); // output pixels [0,3]
        int32x4_t acc1 = vdupq_n_s32(0); // output pixels [4,7]

        // Each row is convolved with its own 9 coefficients (rows are stored contiguously).
        const uint8x16_t row_t4 = vld1q_u8(input_top4_ptr + input.offset());
        convolve_row9x1(acc0, acc1, row_t4, _convolution.data());

        const uint8x16_t row_t3 = vld1q_u8(input_top3_ptr + input.offset());
        convolve_row9x1(acc0, acc1, row_t3, _convolution.data() + 9);

        const uint8x16_t row_t2 = vld1q_u8(input_top2_ptr + input.offset());
        convolve_row9x1(acc0, acc1, row_t2, _convolution.data() + 18);

        const uint8x16_t row_t1 = vld1q_u8(input_top1_ptr + input.offset());
        convolve_row9x1(acc0, acc1, row_t1, _convolution.data() + 27);

        const uint8x16_t row_m = vld1q_u8(input_mid_ptr + input.offset());
        convolve_row9x1(acc0, acc1, row_m, _convolution.data() + 36);

        const uint8x16_t row_b1 = vld1q_u8(input_low1_ptr + input.offset());
        convolve_row9x1(acc0, acc1, row_b1, _convolution.data() + 45);

        const uint8x16_t row_b2 = vld1q_u8(input_low2_ptr + input.offset());
        convolve_row9x1(acc0, acc1, row_b2, _convolution.data() + 54);

        const uint8x16_t row_b3 = vld1q_u8(input_low3_ptr + input.offset());
        convolve_row9x1(acc0, acc1, row_b3, _convolution.data() + 63);

        const uint8x16_t row_b4 = vld1q_u8(input_low4_ptr + input.offset());
        convolve_row9x1(acc0, acc1, row_b4, _convolution.data() + 72);

        // Normalize, skipping the no-op unit scale.
        if(_scale != 1)
        {
            // Convert to F32, scale and convert back to S32
            acc0 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(acc0), scale_val));
            acc1 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(acc1), scale_val));
        }

        // Clamp and store as U8 or S16:
        store_results(acc0, acc1, reinterpret_cast<OutputType *>(output.ptr()));
    },
    input, output);
}
619 #endif /* DOXYGEN_SKIP_THIS */
621 template <unsigned int matrix_size>
622 void NEConvolutionKernel<matrix_size>::run(const Window &window)
624 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
625 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
627 switch(_output->info()->format())
630 convolution<uint8_t>(window);
633 convolution<int16_t>(window);
636 ARM_COMPUTE_ERROR("Not supported");
// Explicit instantiations: the only square convolution sizes the library supports.
template class arm_compute::NEConvolutionKernel<3>;
template class arm_compute::NEConvolutionKernel<5>;
template class arm_compute::NEConvolutionKernel<7>;
template class arm_compute::NEConvolutionKernel<9>;
645 /****************************************************************************************\
646 * Separable Square Convolution *
647 \****************************************************************************************/
649 template <unsigned int matrix_size>
650 NESeparableConvolutionHorKernel<matrix_size>::NESeparableConvolutionHorKernel()
651 : _conv_row{ { 0 } }, _border_size(0)
655 template <unsigned int matrix_size>
656 BorderSize NESeparableConvolutionHorKernel<matrix_size>::border_size() const
661 template <unsigned int matrix_size>
662 void NESeparableConvolutionHorKernel<matrix_size>::configure(const ITensor *input, ITensor *output, const int16_t *conv_row, bool border_undefined)
664 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
665 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U16, DataType::S16, DataType::S32);
666 ARM_COMPUTE_ERROR_ON(conv_row == nullptr);
670 std::copy_n(conv_row, _conv_row.size(), _conv_row.begin());
671 _border_size = BorderSize(border_undefined ? 0 : matrix_size / 2, matrix_size / 2);
673 // Configure kernel window
674 constexpr unsigned int processed_elements(8);
675 constexpr unsigned int read_elements(16);
676 constexpr unsigned int written_elements(8);
678 Window win = calculate_max_window_horizontal(*input->info(), Steps(processed_elements), border_undefined, border_size());
679 AccessWindowHorizontal output_access(output->info(), 0, written_elements);
681 update_window_and_padding(win,
682 AccessWindowHorizontal(input->info(), -border_size().left, read_elements),
685 output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
687 INEKernel::configure(win);
690 template <unsigned int matrix_size>
691 void NESeparableConvolutionHorKernel<matrix_size>::run(const Window &window)
693 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
694 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
695 switch(_output->info()->data_type())
698 convolve<uint16_t>(window);
701 convolve<int16_t>(window);
704 convolve<int32_t>(window);
707 ARM_COMPUTE_ERROR("Unsupported intermediate data type!");
712 #ifndef DOXYGEN_SKIP_THIS /* Doxygen gets confused by the templates and can't match the implementation to the declaration */
713 namespace arm_compute
/** 5-tap horizontal pass producing u16 intermediates (all arithmetic stays in u16). */
template <>
template <>
inline void NESeparableConvolutionHorKernel<5>::convolve<uint16_t>(const Window &window)
{
    // Read window is shifted left by the kernel radius (2) to centre the taps.
    Window win_in(window);
    win_in.shift(Window::DimX, -2);

    Iterator input(_input, win_in);
    Iterator output(_output, window);

    execute_window_loop(window, [&](const Coordinates & id)
    {
        const uint8x16_t pixels = vld1q_u8(input.ptr());

        // Widen the 16 u8 pixels to two u16 halves.
        const uint16x8x2_t data_u16 =
        {
            {
                vmovl_u8(vget_low_u8(pixels)),
                vmovl_u8(vget_high_u8(pixels))
            }
        };

        // Multiply-accumulate the 5 taps over shifted windows of the widened row.
        uint16x8_t sum = vmulq_n_u16(data_u16.val[0], _conv_row[0]);
        sum = vmlaq_n_u16(sum, vextq_u16(data_u16.val[0], data_u16.val[1], 1), _conv_row[1]);
        sum = vmlaq_n_u16(sum, vextq_u16(data_u16.val[0], data_u16.val[1], 2), _conv_row[2]);
        sum = vmlaq_n_u16(sum, vextq_u16(data_u16.val[0], data_u16.val[1], 3), _conv_row[3]);
        sum = vmlaq_n_u16(sum, vextq_u16(data_u16.val[0], data_u16.val[1], 4), _conv_row[4]);

        vst1q_u16(reinterpret_cast<uint16_t *>(output.ptr()), sum);
    },
    input, output);
}
/** 5-tap horizontal pass producing s16 intermediates (all arithmetic stays in s16). */
template <>
template <>
inline void NESeparableConvolutionHorKernel<5>::convolve<int16_t>(const Window &window)
{
    // Read window is shifted left by the kernel radius (2) to centre the taps.
    Window win_in(window);
    win_in.shift(Window::DimX, -2);

    Iterator input(_input, win_in);
    Iterator output(_output, window);

    execute_window_loop(window, [&](const Coordinates & id)
    {
        const uint8x16_t pixels = vld1q_u8(input.ptr());

        // Widen the 16 u8 pixels to two s16 halves.
        const int16x8x2_t data_s16 =
        {
            {
                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels))),
                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)))
            }
        };

        // Multiply-accumulate the 5 taps over shifted windows of the widened row.
        int16x8_t sum = vmulq_n_s16(data_s16.val[0], _conv_row[0]);
        sum = vmlaq_n_s16(sum, vextq_s16(data_s16.val[0], data_s16.val[1], 1), _conv_row[1]);
        sum = vmlaq_n_s16(sum, vextq_s16(data_s16.val[0], data_s16.val[1], 2), _conv_row[2]);
        sum = vmlaq_n_s16(sum, vextq_s16(data_s16.val[0], data_s16.val[1], 3), _conv_row[3]);
        sum = vmlaq_n_s16(sum, vextq_s16(data_s16.val[0], data_s16.val[1], 4), _conv_row[4]);

        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), sum);
    },
    input, output);
}
/** 5-tap horizontal pass producing s32 intermediates (widening multiply-accumulate). */
template <>
template <>
void NESeparableConvolutionHorKernel<5>::convolve<int32_t>(const Window &window)
{
    // Read window is shifted left by the kernel radius (2) to centre the taps.
    Window win_in(window);
    win_in.shift(Window::DimX, -2);

    Iterator input(_input, win_in);
    Iterator output(_output, window);

    execute_window_loop(window, [&](const Coordinates & id)
    {
        const uint8x16_t pixels = vld1q_u8(input.ptr());

        // Widen the 16 u8 pixels to two s16 halves.
        const int16x8x2_t data_s16 =
        {
            {
                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels))),
                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)))
            }
        };

        // Shifted views of the row for taps -1, 0(centre+2 offset), +1, +2.
        const int16x8_t data_s16_l1 = vextq_s16(data_s16.val[0], data_s16.val[1], 1);
        const int16x8_t data_s16_m  = vextq_s16(data_s16.val[0], data_s16.val[1], 2);
        const int16x8_t data_s16_r1 = vextq_s16(data_s16.val[0], data_s16.val[1], 3);
        const int16x8_t data_s16_r2 = vextq_s16(data_s16.val[0], data_s16.val[1], 4);

        // Pixels [0,3]: widening multiply-accumulate into s32.
        int32x4_t out_low = vmull_n_s16(vget_low_s16(data_s16.val[0]), _conv_row[0]);
        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_l1), _conv_row[1]);
        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_m), _conv_row[2]);
        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_r1), _conv_row[3]);
        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_r2), _conv_row[4]);

        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), out_low);

        // Pixels [4,7].
        int32x4_t out_high = vmull_n_s16(vget_high_s16(data_s16.val[0]), _conv_row[0]);
        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_l1), _conv_row[1]);
        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_m), _conv_row[2]);
        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_r1), _conv_row[3]);
        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_r2), _conv_row[4]);

        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, out_high);
    },
    input, output);
}
// Horizontal pass of a 7-tap separable convolution producing U16 intermediates:
// U8 pixels are widened to U16 and each of the 7 taps is applied via
// vmlaq_n_u16 on a vext-shifted copy of the loaded data.
829 inline void NESeparableConvolutionHorKernel<7>::convolve<uint16_t>(const Window &window)
// Shift the read window left by the kernel radius (7 / 2 = 3).
831     Window win_in(window);
832     win_in.shift(Window::DimX, -3);
834     Iterator input(_input, win_in);
835     Iterator output(_output, window);
837     execute_window_loop(window, [&](const Coordinates & id)
839         const uint8x16_t data = vld1q_u8(input.ptr());
// Widen the 16 loaded U8 pixels into two U16 vectors (low / high halves).
841         const uint16x8x2_t data_u16 =
844             vmovl_u8(vget_low_u8(data)),
845             vmovl_u8(vget_high_u8(data))
// Tap k reads the data shifted by k pixels: vext extracts lanes k..k+7.
849         uint16x8_t out = vmulq_n_u16(data_u16.val[0], _conv_row[0]);
850         out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 1), _conv_row[1]);
851         out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 2), _conv_row[2]);
852         out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 3), _conv_row[3]);
853         out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 4), _conv_row[4]);
854         out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 5), _conv_row[5]);
855         out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 6), _conv_row[6]);
857         vst1q_u16(reinterpret_cast<uint16_t *>(output.ptr()), out);
// Horizontal pass of a 7-tap separable convolution producing S16 intermediates.
// Identical structure to the U16 variant, but the widened pixels are
// reinterpreted as signed so negative coefficients accumulate correctly.
// NOTE(review): vmlaq_n_s16 does not saturate — S16 intermediates can wrap for
// large coefficient sums; the S32 variant below exists for that case.
864 inline void NESeparableConvolutionHorKernel<7>::convolve<int16_t>(const Window &window)
// Shift the read window left by the kernel radius (7 / 2 = 3).
866     Window win_in(window);
867     win_in.shift(Window::DimX, -3);
869     Iterator input(_input, win_in);
870     Iterator output(_output, window);
872     execute_window_loop(window, [&](const Coordinates & id)
874         const uint8x16_t data = vld1q_u8(input.ptr());
// Widen the 16 loaded U8 pixels into two S16 vectors (low / high halves).
876         const int16x8x2_t data_s16 =
879             vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
880             vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
// Tap k reads the data shifted by k pixels via vext.
884         int16x8_t out = vmulq_n_s16(data_s16.val[0], _conv_row[0]);
885         out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), _conv_row[1]);
886         out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), _conv_row[2]);
887         out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), _conv_row[3]);
888         out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4), _conv_row[4]);
889         out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 5), _conv_row[5]);
890         out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 6), _conv_row[6]);
892         vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), out);
// Horizontal pass of a 7-tap separable convolution producing S32 intermediates.
// Uses widening multiply-accumulate (vmull/vmlal) so the 7-tap sum cannot
// overflow before the vertical pass applies the scale.
899 void NESeparableConvolutionHorKernel<7>::convolve<int32_t>(const Window &window)
// Shift the read window left by the kernel radius (7 / 2 = 3).
901     Window win_in(window);
902     win_in.shift(Window::DimX, -3);
904     Iterator input(_input, win_in);
905     Iterator output(_output, window);
907     execute_window_loop(window, [&](const Coordinates & id)
909         const uint8x16_t data = vld1q_u8(input.ptr());
// Widen the 16 loaded U8 pixels into two S16 vectors (low / high halves).
911         const int16x8x2_t data_s16 =
914             vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
915             vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
// Shifted neighbour vectors for taps -3..+3 (val[0] itself is tap -3).
919         const int16x8_t data_s16_l2 = vextq_s16(data_s16.val[0], data_s16.val[1], 1);
920         const int16x8_t data_s16_l1 = vextq_s16(data_s16.val[0], data_s16.val[1], 2);
921         const int16x8_t data_s16_m = vextq_s16(data_s16.val[0], data_s16.val[1], 3);
922         const int16x8_t data_s16_r1 = vextq_s16(data_s16.val[0], data_s16.val[1], 4);
923         const int16x8_t data_s16_r2 = vextq_s16(data_s16.val[0], data_s16.val[1], 5);
924         const int16x8_t data_s16_r3 = vextq_s16(data_s16.val[0], data_s16.val[1], 6);
// First 4 output pixels from the low halves of the shifted vectors.
926         int32x4_t out_low = vmull_n_s16(vget_low_s16(data_s16.val[0]), _conv_row[0]);
927         out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l2), _conv_row[1]);
928         out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l1), _conv_row[2]);
929         out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_m), _conv_row[3]);
930         out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r1), _conv_row[4]);
931         out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r2), _conv_row[5]);
932         out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r3), _conv_row[6]);
934         vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), out_low);
// Next 4 output pixels from the high halves.
936         int32x4_t out_high = vmull_n_s16(vget_high_s16(data_s16.val[0]), _conv_row[0]);
937         out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l2), _conv_row[1]);
938         out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l1), _conv_row[2]);
939         out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_m), _conv_row[3]);
940         out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r1), _conv_row[4]);
941         out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r2), _conv_row[5]);
942         out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r3), _conv_row[6]);
944         vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, out_high);
// Horizontal pass of a 9-tap separable convolution producing U16 intermediates.
// With a radius of 4 the 9th tap lands exactly on data_u16.val[1], so the last
// accumulate uses val[1] directly instead of a vext.
951 inline void NESeparableConvolutionHorKernel<9>::convolve<uint16_t>(const Window &window)
// Shift the read window left by the kernel radius (9 / 2 = 4).
953     Window win_in(window);
954     win_in.shift(Window::DimX, -4);
956     Iterator input(_input, win_in);
957     Iterator output(_output, window);
959     execute_window_loop(window, [&](const Coordinates & id)
961         const uint8x16_t data = vld1q_u8(input.ptr());
// Widen the 16 loaded U8 pixels into two U16 vectors (low / high halves).
963         const uint16x8x2_t data_u16 =
966             vmovl_u8(vget_low_u8(data)),
967             vmovl_u8(vget_high_u8(data))
// Tap k reads the data shifted by k pixels via vext (k = 1..7).
971         uint16x8_t out = vmulq_n_u16(data_u16.val[0], _conv_row[0]);
972         out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 1), _conv_row[1]);
973         out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 2), _conv_row[2]);
974         out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 3), _conv_row[3]);
975         out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 4), _conv_row[4]);
976         out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 5), _conv_row[5]);
977         out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 6), _conv_row[6]);
978         out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 7), _conv_row[7]);
// Shift of 8 == the upper half itself; no vext needed for the final tap.
979         out = vmlaq_n_u16(out, data_u16.val[1], _conv_row[8]);
981         vst1q_u16(reinterpret_cast<uint16_t *>(output.ptr()), out);
// Horizontal pass of a 9-tap separable convolution producing S16 intermediates.
// Same structure as the 9-tap U16 variant, with the pixels reinterpreted as
// signed so negative coefficients accumulate correctly (non-saturating).
988 inline void NESeparableConvolutionHorKernel<9>::convolve<int16_t>(const Window &window)
// Shift the read window left by the kernel radius (9 / 2 = 4).
990     Window win_in(window);
991     win_in.shift(Window::DimX, -4);
993     Iterator input(_input, win_in);
994     Iterator output(_output, window);
996     execute_window_loop(window, [&](const Coordinates & id)
998         const uint8x16_t data = vld1q_u8(input.ptr());
// Widen the 16 loaded U8 pixels into two S16 vectors (low / high halves).
1000         const int16x8x2_t data_s16 =
1003             vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
1004             vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
// Tap k reads the data shifted by k pixels via vext (k = 1..7).
1008         int16x8_t out = vmulq_n_s16(data_s16.val[0], _conv_row[0]);
1009         out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), _conv_row[1]);
1010         out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), _conv_row[2]);
1011         out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), _conv_row[3]);
1012         out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4), _conv_row[4]);
1013         out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 5), _conv_row[5]);
1014         out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 6), _conv_row[6]);
1015         out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 7), _conv_row[7]);
// Shift of 8 == the upper half itself; no vext needed for the final tap.
1016         out = vmlaq_n_s16(out, data_s16.val[1], _conv_row[8]);
1018         vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), out);
// Horizontal pass of a 9-tap separable convolution producing S32 intermediates.
// Widening multiply-accumulate (vmull/vmlal) keeps the 9-tap sum overflow-free;
// the 9th tap uses data_s16.val[1] directly since the radius-4 shift makes the
// last neighbour coincide with the upper half of the load.
1025 void NESeparableConvolutionHorKernel<9>::convolve<int32_t>(const Window &window)
// Shift the read window left by the kernel radius (9 / 2 = 4).
1027     Window win_in(window);
1028     win_in.shift(Window::DimX, -4);
1030     Iterator input(_input, win_in);
1031     Iterator output(_output, window);
1033     execute_window_loop(window, [&](const Coordinates & id)
1035         const uint8x16_t data = vld1q_u8(input.ptr());
// Widen the 16 loaded U8 pixels into two S16 vectors (low / high halves).
1037         const int16x8x2_t data_s16 =
1040             vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
1041             vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
// Shifted neighbour vectors for taps -4..+4 (val[0] is tap -4, val[1] is +4).
1045         const int16x8_t data_s16_l3 = vextq_s16(data_s16.val[0], data_s16.val[1], 1);
1046         const int16x8_t data_s16_l2 = vextq_s16(data_s16.val[0], data_s16.val[1], 2);
1047         const int16x8_t data_s16_l1 = vextq_s16(data_s16.val[0], data_s16.val[1], 3);
1048         const int16x8_t data_s16_m = vextq_s16(data_s16.val[0], data_s16.val[1], 4);
1049         const int16x8_t data_s16_r1 = vextq_s16(data_s16.val[0], data_s16.val[1], 5);
1050         const int16x8_t data_s16_r2 = vextq_s16(data_s16.val[0], data_s16.val[1], 6);
1051         const int16x8_t data_s16_r3 = vextq_s16(data_s16.val[0], data_s16.val[1], 7);
// First 4 output pixels from the low halves.
1053         int32x4_t out_low = vmull_n_s16(vget_low_s16(data_s16.val[0]), _conv_row[0]);
1054         out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l3), _conv_row[1]);
1055         out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l2), _conv_row[2]);
1056         out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l1), _conv_row[3]);
1057         out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_m), _conv_row[4]);
1058         out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r1), _conv_row[5]);
1059         out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r2), _conv_row[6]);
1060         out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r3), _conv_row[7]);
1061         out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16.val[1]), _conv_row[8]);
1063         vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), out_low);
// Next 4 output pixels from the high halves.
1065         int32x4_t out_high = vmull_n_s16(vget_high_s16(data_s16.val[0]), _conv_row[0]);
1066         out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l3), _conv_row[1]);
1067         out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l2), _conv_row[2]);
1068         out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l1), _conv_row[3]);
1069         out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_m), _conv_row[4]);
1070         out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r1), _conv_row[5]);
1071         out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r2), _conv_row[6]);
1072         out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r3), _conv_row[7]);
1073         out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16.val[1]), _conv_row[8]);
1075         vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, out_high);
1079 } // namespace arm_compute
// Explicit instantiations for the supported separable kernel sizes; the
// template definitions above live in this translation unit only.
1082 template class arm_compute::NESeparableConvolutionHorKernel<5>;
1083 template class arm_compute::NESeparableConvolutionHorKernel<7>;
1084 template class arm_compute::NESeparableConvolutionHorKernel<9>;
// Default constructor: zero-filled column coefficients and zero scale until
// configure() supplies real values (configure() rejects scale == 0).
1086 template <unsigned int matrix_size>
1087 NESeparableConvolutionVertKernel<matrix_size>::NESeparableConvolutionVertKernel()
1088     : _conv_col{ { 0 } }, _scale(0)
1092 template <unsigned int matrix_size>
1093 BorderSize NESeparableConvolutionVertKernel<matrix_size>::border_size() const
// The vertical pass only reads rows above/below the output row, so the border
// is matrix_size/2 top and bottom, and 0 left/right.
1095     return BorderSize(matrix_size / 2, 0);
// Configures the vertical pass: validates the intermediate (input) and final
// (output) data types, copies the column coefficients, and sets up a window
// processing 16 intermediate values per iteration with a rectangular input
// access spanning matrix_size rows.
1098 template <unsigned int matrix_size>
1099 void NESeparableConvolutionVertKernel<matrix_size>::configure(const ITensor *input, ITensor *output, const int16_t *conv_col, uint32_t scale, bool border_undefined)
// Input here is the intermediate buffer written by the horizontal pass.
1101     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16, DataType::S16, DataType::S32);
1102     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
1103     ARM_COMPUTE_ERROR_ON(conv_col == nullptr);
1104     ARM_COMPUTE_ERROR_ON(scale == 0);
1108     std::copy_n(conv_col, _conv_col.size(), _conv_col.begin());
1111     // Configure kernel window
1112     constexpr unsigned int processed_elements(16);
1113     constexpr unsigned int read_elements(16);
1114     constexpr unsigned int written_elements(16);
1116     Window win = calculate_max_window(*input->info(), Steps(processed_elements), border_undefined, border_size());
1117     AccessWindowHorizontal output_access(output->info(), 0, written_elements);
// Each output row needs matrix_size input rows starting border_size().top above it.
1119     update_window_and_padding(win,
1120                               AccessWindowRectangle(input->info(), 0, -border_size().top, read_elements, matrix_size),
1123     output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
1125     INEKernel::configure(win);
// Dispatches on the intermediate (input) data type — U16, S16 or S32 — and then
// on the output data type (U8 or S16), invoking the matching convolution_*
// specialisation. Any other combination is rejected with an error.
1128 template <unsigned int matrix_size>
1129 void NESeparableConvolutionVertKernel<matrix_size>::run(const Window &window)
1131     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
1132     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
// Outer switch: intermediate data type produced by the horizontal pass.
1134     switch(_input->info()->data_type())
// Inner switch: final output type; the template argument selects the store path.
1137             switch(_output->info()->data_type())
1140                     convolution_u16<uint8_t>(window);
1143                     convolution_u16<int16_t>(window);
1146                     ARM_COMPUTE_ERROR("Not supported");
1150             switch(_output->info()->data_type())
1153                     convolution_s16<uint8_t>(window);
1156                     convolution_s16<int16_t>(window);
1159                     ARM_COMPUTE_ERROR("Not supported");
1163             switch(_output->info()->data_type())
1166                     convolution_s32<uint8_t>(window);
1169                     convolution_s32<int16_t>(window);
1172                     ARM_COMPUTE_ERROR("Not supported");
1176             ARM_COMPUTE_ERROR("Unsupported intermediate data type!");
// Vertical pass over U16 intermediates: for each output pixel, accumulates
// matrix_size rows weighted by _conv_col, optionally rescales by 1/_scale in
// F32, and stores 16 results per iteration via store_results() (which
// saturates/narrows to U8 or S16 depending on OutputType).
1181 template <unsigned int matrix_size>
1182 template <typename OutputType>
1183 void NESeparableConvolutionVertKernel<matrix_size>::convolution_u16(const Window &win)
1185     static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
// The input iterator advances 8 elements at a time: two steps per 16-element
// output iteration (out0 then out1).
1188     win_in.set_dimension_step(Window::DimX, 8);
1190     Iterator in(_input, win_in);
1191     Iterator out(_output, win);
// Precomputed base pointers for the matrix_size rows centred on the output row.
1193     std::array<unsigned char *, matrix_size> input_ptrs{ {} };
1194     const float32x4_t oneoverscale = vdupq_n_f32(1.0f / _scale);
1195     const int        k_half       = matrix_size / 2;
1198     for(int i = -k_half; i <= k_half; ++i)
1200         input_ptrs[k_half + i] = _input->ptr_to_element(Coordinates(0, i));
1203     execute_window_loop(win, [&](const Coordinates & id)
1205         uint16x8_t out0 = vdupq_n_u16(0);
1206         uint16x8_t out1 = vdupq_n_u16(0);
// First batch of 8: column-wise multiply-accumulate across all kernel rows.
1209         for(unsigned int r = 0; r < matrix_size; ++r)
1211             const uint16x8_t data = vld1q_u16(reinterpret_cast<const uint16_t *>(input_ptrs[r] + in.offset()));
1212             out0                  = vmlaq_n_u16(out0, data, _conv_col[r]);
// Advance the input iterator to the next 8 elements for the second batch.
1215         in.increment(Window::DimX);
1218         for(unsigned int r = 0; r < matrix_size; ++r)
1220             const uint16x8_t data = vld1q_u16(reinterpret_cast<const uint16_t *>(input_ptrs[r] + in.offset()));
1221             out1                  = vmlaq_n_u16(out1, data, _conv_col[r]);
1224         //scale the result if needed
// Scaling path: widen U16 -> U32 -> F32, multiply by 1/scale, round back down.
1227             float32x4_t out0_f32_high = vcvtq_f32_u32(vmovl_u16(vget_high_u16(out0)));
1228             float32x4_t out0_f32_low  = vcvtq_f32_u32(vmovl_u16(vget_low_u16(out0)));
1229             out0_f32_high             = vmulq_f32(out0_f32_high, oneoverscale);
1230             out0_f32_low              = vmulq_f32(out0_f32_low, oneoverscale);
1231             store_results(vcvtq_u32_f32(out0_f32_low), vcvtq_u32_f32(out0_f32_high), reinterpret_cast<OutputType *>(out.ptr()));
1233             float32x4_t out1_f32_high = vcvtq_f32_u32(vmovl_u16(vget_high_u16(out1)));
1234             float32x4_t out1_f32_low  = vcvtq_f32_u32(vmovl_u16(vget_low_u16(out1)));
1235             out1_f32_high             = vmulq_f32(out1_f32_high, oneoverscale);
1236             out1_f32_low              = vmulq_f32(out1_f32_low, oneoverscale);
1237             store_results(vcvtq_u32_f32(out1_f32_low), vcvtq_u32_f32(out1_f32_high), reinterpret_cast<OutputType *>(out.ptr()) + 8);
// Unscaled path: store the raw U16 accumulators directly.
1241             store_results(out0, out1, reinterpret_cast<OutputType *>(out.ptr()));
// Vertical pass over S16 intermediates: identical structure to convolution_u16
// but with signed loads/accumulation, so negative column coefficients and
// negative intermediates are handled correctly.
1247 template <unsigned int matrix_size>
1248 template <typename OutputType>
1249 void NESeparableConvolutionVertKernel<matrix_size>::convolution_s16(const Window &win)
1251     static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
// Input iterator advances 8 elements per step; two steps per output iteration.
1254     win_in.set_dimension_step(Window::DimX, 8);
1256     Iterator in(_input, win_in);
1257     Iterator out(_output, win);
// Precomputed base pointers for the matrix_size rows centred on the output row.
1259     std::array<unsigned char *, matrix_size> input_ptrs{ {} };
1260     const float32x4_t oneoverscale = vdupq_n_f32(1.0f / _scale);
1261     const int        k_half       = matrix_size / 2;
1264     for(int i = -k_half; i <= k_half; ++i)
1266         input_ptrs[k_half + i] = _input->ptr_to_element(Coordinates(0, i));
1269     execute_window_loop(win, [&](const Coordinates & id)
1271         int16x8_t out0 = vdupq_n_s16(0);
1272         int16x8_t out1 = vdupq_n_s16(0);
// First batch of 8: column-wise multiply-accumulate across all kernel rows.
1275         for(unsigned int r = 0; r < matrix_size; ++r)
1277             const int16x8_t data = vld1q_s16(reinterpret_cast<const int16_t *>(input_ptrs[r] + in.offset()));
1278             out0                 = vmlaq_n_s16(out0, data, _conv_col[r]);
// Advance the input iterator to the next 8 elements for the second batch.
1281         in.increment(Window::DimX);
1284         for(unsigned int r = 0; r < matrix_size; ++r)
1286             const int16x8_t data = vld1q_s16(reinterpret_cast<const int16_t *>(input_ptrs[r] + in.offset()));
1287             out1                 = vmlaq_n_s16(out1, data, _conv_col[r]);
1290         //scale the result if needed
// Scaling path: widen S16 -> S32 -> F32, multiply by 1/scale, convert back.
1293             float32x4_t out0_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(out0)));
1294             float32x4_t out0_f32_low  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(out0)));
1295             out0_f32_high             = vmulq_f32(out0_f32_high, oneoverscale);
1296             out0_f32_low              = vmulq_f32(out0_f32_low, oneoverscale);
1297             store_results(vcvtq_s32_f32(out0_f32_low), vcvtq_s32_f32(out0_f32_high), reinterpret_cast<OutputType *>(out.ptr()));
1299             float32x4_t out1_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(out1)));
1300             float32x4_t out1_f32_low  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(out1)));
1301             out1_f32_high             = vmulq_f32(out1_f32_high, oneoverscale);
1302             out1_f32_low              = vmulq_f32(out1_f32_low, oneoverscale);
1303             store_results(vcvtq_s32_f32(out1_f32_low), vcvtq_s32_f32(out1_f32_high), reinterpret_cast<OutputType *>(out.ptr()) + 8);
// Unscaled path: store the raw S16 accumulators directly.
1307             store_results(out0, out1, reinterpret_cast<OutputType *>(out.ptr()));
// Vertical pass over S32 intermediates. Data is loaded de-interleaved with
// vld2q_s32 (val[0]/val[1] hold alternating lanes), accumulated per kernel row,
// optionally rescaled in F32, then re-interleaved with vzipq_s32 before
// store_results() narrows to the requested OutputType.
1313 template <unsigned int matrix_size>
1314 template <typename OutputType>
1315 void NESeparableConvolutionVertKernel<matrix_size>::convolution_s32(const Window &win)
1317     static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
// Input iterator advances 8 elements per step; two steps per output iteration.
1320     win_in.set_dimension_step(Window::DimX, 8);
1322     Iterator in(_input, win_in);
1323     Iterator out(_output, win);
// Precomputed base pointers for the matrix_size rows centred on the output row.
1325     std::array<unsigned char *, matrix_size> input_ptrs{ {} };
1326     const float32x4_t oneoverscale = vdupq_n_f32(1.0f / _scale);
1327     const int        k_half       = matrix_size / 2;
1330     for(int i = -k_half; i <= k_half; ++i)
1332         input_ptrs[k_half + i] = _input->ptr_to_element(Coordinates(0, i));
1335     const int32x4_t zero = vdupq_n_s32(0);
1337     execute_window_loop(win, [&](const Coordinates & id)
// out0/out1 are int32x4x2_t accumulators for the two batches of 8 elements;
// NOTE(review): their zero-initialisation (from `zero` above) precedes this
// loop — confirm against the full source.
1356         for(unsigned int r = 0; r < matrix_size; ++r)
// vld2q de-interleaves: val[0] gets lanes 0,2,4,6 and val[1] lanes 1,3,5,7.
1358             const int32x4x2_t data = vld2q_s32(reinterpret_cast<const int32_t *>(input_ptrs[r] + in.offset()));
1359             out0.val[0]            = vmlaq_n_s32(out0.val[0], data.val[0], _conv_col[r]);
1360             out0.val[1]            = vmlaq_n_s32(out0.val[1], data.val[1], _conv_col[r]);
// Advance the input iterator to the next 8 elements for the second batch.
1363         in.increment(Window::DimX);
1366         for(unsigned int r = 0; r < matrix_size; ++r)
1368             const int32x4x2_t data = vld2q_s32(reinterpret_cast<const int32_t *>(input_ptrs[r] + in.offset()));
1369             out1.val[0]            = vmlaq_n_s32(out1.val[0], data.val[0], _conv_col[r]);
1370             out1.val[1]            = vmlaq_n_s32(out1.val[1], data.val[1], _conv_col[r]);
1373         //scale the result if needed
// Scaling path: convert each de-interleaved half to F32, multiply by 1/scale,
// convert back to S32 in place.
1376             float32x4_t out0_f32_odd  = vcvtq_f32_s32(out0.val[0]);
1377             float32x4_t out0_f32_even = vcvtq_f32_s32(out0.val[1]);
1378             out0_f32_odd              = vmulq_f32(out0_f32_odd, oneoverscale);
1379             out0_f32_even             = vmulq_f32(out0_f32_even, oneoverscale);
1380             out0.val[0]               = vcvtq_s32_f32(out0_f32_odd);
1381             out0.val[1]               = vcvtq_s32_f32(out0_f32_even);
1383             float32x4_t out1_f32_odd  = vcvtq_f32_s32(out1.val[0]);
1384             float32x4_t out1_f32_even = vcvtq_f32_s32(out1.val[1]);
1385             out1_f32_odd              = vmulq_f32(out1_f32_odd, oneoverscale);
1386             out1_f32_even             = vmulq_f32(out1_f32_even, oneoverscale);
1387             out1.val[0]               = vcvtq_s32_f32(out1_f32_odd);
1388             out1.val[1]               = vcvtq_s32_f32(out1_f32_even);
// Re-interleave the halves back into element order before storing.
1391         const int32x4x2_t out0_s32 = vzipq_s32(out0.val[0], out0.val[1]);
1392         store_results(out0_s32.val[0], out0_s32.val[1], reinterpret_cast<OutputType *>(out.ptr()));
1394         const int32x4x2_t out1_s32 = vzipq_s32(out1.val[0], out1.val[1]);
1395         store_results(out1_s32.val[0], out1_s32.val[1], reinterpret_cast<OutputType *>(out.ptr()) + 8);
// Explicit instantiations matching the horizontal kernel's supported sizes.
1400 template class arm_compute::NESeparableConvolutionVertKernel<5>;
1401 template class arm_compute::NESeparableConvolutionVertKernel<7>;
1402 template class arm_compute::NESeparableConvolutionVertKernel<9>;
1404 /****************************************************************************************\
1405 * Rectangle Convolution *
1406 \****************************************************************************************/
// Default constructor: null tensors, zero scale, empty coefficient vector and
// zero border/function index until configure() supplies real values.
1408 NEConvolutionRectangleKernel::NEConvolutionRectangleKernel()
1409     : _input(nullptr), _output(nullptr), _scale(0), _convolution(), _border_size(), _func_idx(0)
// Border depends on the configured rectangle: height/2 rows and width/2 columns
// (set in configure()).
1413 BorderSize NEConvolutionRectangleKernel::border_size() const
1415     return _border_size;
// Configures a non-separable width x height rectangle convolution over U8
// input. Width and height must each be 3, 5, 7 or 9 (independently, so
// non-square rectangles are allowed). Copies the width*height coefficients and
// computes _func_idx, the row-major index into the 4x4 function tables in run().
1418 void NEConvolutionRectangleKernel::configure(const ITensor *input, ITensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined)
1420     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
1421     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
1422     ARM_COMPUTE_ERROR_ON(nullptr == conv);
1423     ARM_COMPUTE_ERROR_ON(3 != width && 5 != width && 7 != width && 9 != width);
1424     ARM_COMPUTE_ERROR_ON(3 != height && 5 != height && 7 != height && 9 != height);
1425     ARM_COMPUTE_ERROR_ON(0 == scale);
1430     _border_size = BorderSize(height / 2, width / 2);
1432     // Setup the convolution matrix
1433     const uint32_t nr_elements = width * height;
1434     _convolution.resize(nr_elements);
1435     std::copy_n(conv, nr_elements, _convolution.begin());
1437     // Set function index to help choose appropriate function in run()
// get_index maps 3/5/7/9 to 0..3; row-major into a 4x4 table of <rows, cols>.
1438     _func_idx = get_index(height) * 4 + get_index(width);
1439     ARM_COMPUTE_ERROR_ON(_func_idx > (_nr_supported_sizes * _nr_supported_sizes));
1441     // Configure kernel window
1442     constexpr unsigned int processed_elements(8);
1443     constexpr unsigned int read_elements(16);
1444     constexpr unsigned int written_elements(8);
1446     Window win = calculate_max_window(*input->info(), Steps(processed_elements), border_undefined, _border_size);
1447     AccessWindowHorizontal output_access = AccessWindowHorizontal(output->info(), 0, written_elements);
// Each output pixel reads a height-row window starting at the top-left border offset.
1449     update_window_and_padding(win,
1450                               AccessWindowRectangle(input->info(), -_border_size.left, -_border_size.top, read_elements, height),
1453     output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, _border_size);
1455     INEKernel::configure(win);
// Dispatches the templated convolution<OutputType, rows, cols> matching the
// configured rectangle: _func_idx (computed in configure() as
// get_index(height) * 4 + get_index(width)) selects the entry in a 16-element
// member-function-pointer table, chosen per output format (U8 or S16).
1458 void NEConvolutionRectangleKernel::run(const Window &window)
1460     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
1461     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
1463     using ConvolutionRectangleFunction = void (NEConvolutionRectangleKernel::*)(const Window & window);
1465     // uint8_t function table
// Row-major <rows, cols> over {3,5,7,9} x {3,5,7,9}; must stay in sync with
// get_index() and the s16 table below.
1466     static const std::array<ConvolutionRectangleFunction, 16> func_table_u8 =
1469         &NEConvolutionRectangleKernel::convolution<uint8_t, 3, 3>,
1470         &NEConvolutionRectangleKernel::convolution<uint8_t, 3, 5>,
1471         &NEConvolutionRectangleKernel::convolution<uint8_t, 3, 7>,
1472         &NEConvolutionRectangleKernel::convolution<uint8_t, 3, 9>,
1473         &NEConvolutionRectangleKernel::convolution<uint8_t, 5, 3>,
1474         &NEConvolutionRectangleKernel::convolution<uint8_t, 5, 5>,
1475         &NEConvolutionRectangleKernel::convolution<uint8_t, 5, 7>,
1476         &NEConvolutionRectangleKernel::convolution<uint8_t, 5, 9>,
1477         &NEConvolutionRectangleKernel::convolution<uint8_t, 7, 3>,
1478         &NEConvolutionRectangleKernel::convolution<uint8_t, 7, 5>,
1479         &NEConvolutionRectangleKernel::convolution<uint8_t, 7, 7>,
1480         &NEConvolutionRectangleKernel::convolution<uint8_t, 7, 9>,
1481         &NEConvolutionRectangleKernel::convolution<uint8_t, 9, 3>,
1482         &NEConvolutionRectangleKernel::convolution<uint8_t, 9, 5>,
1483         &NEConvolutionRectangleKernel::convolution<uint8_t, 9, 7>,
1484         &NEConvolutionRectangleKernel::convolution<uint8_t, 9, 9>
1487     // int16_t function table
1488     static const std::array<ConvolutionRectangleFunction, 16> func_table_s16 =
1491         &NEConvolutionRectangleKernel::convolution<int16_t, 3, 3>,
1492         &NEConvolutionRectangleKernel::convolution<int16_t, 3, 5>,
1493         &NEConvolutionRectangleKernel::convolution<int16_t, 3, 7>,
1494         &NEConvolutionRectangleKernel::convolution<int16_t, 3, 9>,
1495         &NEConvolutionRectangleKernel::convolution<int16_t, 5, 3>,
1496         &NEConvolutionRectangleKernel::convolution<int16_t, 5, 5>,
1497         &NEConvolutionRectangleKernel::convolution<int16_t, 5, 7>,
1498         &NEConvolutionRectangleKernel::convolution<int16_t, 5, 9>,
1499         &NEConvolutionRectangleKernel::convolution<int16_t, 7, 3>,
1500         &NEConvolutionRectangleKernel::convolution<int16_t, 7, 5>,
1501         &NEConvolutionRectangleKernel::convolution<int16_t, 7, 7>,
1502         &NEConvolutionRectangleKernel::convolution<int16_t, 7, 9>,
1503         &NEConvolutionRectangleKernel::convolution<int16_t, 9, 3>,
1504         &NEConvolutionRectangleKernel::convolution<int16_t, 9, 5>,
1505         &NEConvolutionRectangleKernel::convolution<int16_t, 9, 7>,
1506         &NEConvolutionRectangleKernel::convolution<int16_t, 9, 9>
1510     // Run appropriate function
1511     switch(_output->info()->format())
1514             ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_u8.size());
1515             (this->*func_table_u8[_func_idx])(window);
1518             ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_s16.size());
1519             (this->*func_table_s16[_func_idx])(window);
1522             ARM_COMPUTE_ERROR("Not supported");
// Maps a supported rectangle dimension to its position in the 4x4 function
// tables used by run(); any other value is rejected below.
// NOTE(review): the mapping body is expected to return 0/1/2/3 for 3/5/7/9
// respectively — confirm against the full source.
1526 unsigned int NEConvolutionRectangleKernel::get_index(uint32_t val)
1539     ARM_COMPUTE_ERROR("Not supported dimension size");
1544 template <typename OutputType, unsigned int rows, unsigned int cols>
1545 void NEConvolutionRectangleKernel::convolution(const Window &win)
1547 static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
1548 ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
1550 Iterator input(_input, win);
1551 Iterator output(_output, win);
1553 std::array<unsigned char *, rows> input_ptrs{ {} };
1554 const int16_t *conv = _convolution.data();
1555 const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);
1556 const int k_row_half = rows / 2;
1557 const int k_col_half = cols / 2;
1560 for(int i = -k_row_half; i <= k_row_half; ++i)
1562 input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, i));
1565 execute_window_loop(win, [&](const Coordinates & id)
1567 int32x4_t out = vdupq_n_s32(0);
1568 int32x4_t out2 = vdupq_n_s32(0);
1570 // Perform appropriate convolution
1571 for(unsigned int r = 0; r < rows; ++r)
1573 const uint8x16_t data = vld1q_u8(input_ptrs[r] + input.offset());
1576 convolve_row3x1(out, out2, data, conv + r * cols);
1580 convolve_row5x1(out, out2, data, conv + r * cols);
1584 convolve_row7x1(out, out2, data, conv + r * cols);
1588 convolve_row9x1(out, out2, data, conv + r * cols);
1592 ARM_COMPUTE_ERROR("Unsupported number of columns");
1599 // Convert to F32, scale and convert back to S32
1600 out = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
1601 out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));
1604 // Clamp and store as U8 or S16:
1605 store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));