be6fac5ed586e75772e817f46f58be845a21da24
[platform/upstream/armcl.git] / src / core / NEON / kernels / NEHarrisCornersKernel.cpp
1 /*
2  * Copyright (c) 2016, 2017 ARM Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #include "arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h"
25
26 #include "arm_compute/core/AccessWindowAutoPadding.h"
27 #include "arm_compute/core/Coordinates.h"
28 #include "arm_compute/core/Error.h"
29 #include "arm_compute/core/Helpers.h"
30 #include "arm_compute/core/TensorInfo.h"
31 #include "arm_compute/core/Types.h"
32 #include "arm_compute/core/Utils.h"
33 #include "arm_compute/core/Validate.h"
34 #include "arm_compute/core/Window.h"
35
36 #include <algorithm>
37 #include <arm_neon.h>
38 #include <cmath>
39 #include <cstddef>
40
41 using namespace arm_compute;
42
43 #ifdef ARM_COMPUTE_ENABLE_FP16
44
/* Explicit instantiation of the FP16 Harris score kernel for block sizes 3, 5 and 7.
 * NOTE(review): these instantiations appear before the member function definitions further
 * down in this file - confirm that every supported toolchain still emits the members. */
45 template class arm_compute::NEHarrisScoreFP16Kernel<3>;
46 template class arm_compute::NEHarrisScoreFP16Kernel<5>;
47 template class arm_compute::NEHarrisScoreFP16Kernel<7>;
48
49 namespace fp16
50 {
51 inline float16x8_t harris_score(float16x8_t gx2, float16x8_t gy2, float16x8_t gxgy, float sensitivity, float strength_thresh)
52 {
53     static const float16x8_t zero = vdupq_n_f16(0.f);
54
55     /* Trace^2 */
56     float16x8_t trace2 = vaddq_f16(gx2, gy2);
57     trace2             = vmulq_f16(trace2, trace2);
58
59     /* Det(A) */
60     float16x8_t det = vmulq_f16(gx2, gy2);
61     det             = vfmsq_f16(det, gxgy, gxgy);
62
63     /* Det(A) - sensitivity * trace^2 */
64     const float16x8_t mc = vfmsq_f16(det, vdupq_n_f16(sensitivity), trace2);
65
66     /* mc > strength_thresh */
67     const uint16x8_t mask = vcgtq_f16(mc, vdupq_n_f16(strength_thresh));
68
69     return vbslq_f16(mask, mc, zero);
70 }
71
72 template <size_t block_size>
73 inline void harris_score_FLOAT_FLOAT_FLOAT(float16x8_t low_gx, float16x8_t low_gy, float16x8_t high_gx, float16x8_t high_gy, float16x8_t &gx2, float16x8_t &gy2, float16x8_t &gxgy,
74                                            float norm_factor)
75 {
76     const float16x8_t norm_factor_fp16 = vdupq_n_f16(norm_factor);
77
78     /* Normalize */
79     low_gx  = vmulq_f16(low_gx, norm_factor_fp16);
80     low_gy  = vmulq_f16(low_gy, norm_factor_fp16);
81     high_gx = vmulq_f16(high_gx, norm_factor_fp16);
82     high_gy = vmulq_f16(high_gy, norm_factor_fp16);
83
84     for(size_t i = 0; i < block_size; ++i)
85     {
86         const float16x8_t gx = vextq_f16(low_gx, high_gx, i);
87         const float16x8_t gy = vextq_f16(low_gy, high_gy, i);
88
89         gx2  = vfmaq_f16(gx2, gx, gx);
90         gy2  = vfmaq_f16(gy2, gy, gy);
91         gxgy = vfmaq_f16(gxgy, gx, gy);
92     }
93 }
94
95 template <size_t block_size>
96 inline void harris_score_S16_S16_FLOAT(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out_ptr, int32_t in_stride, float norm_factor, float sensitivity,
97                                        float strength_thresh)
98 {
99     auto           gx_ptr_0 = static_cast<const int16_t *__restrict>(in1_ptr) - (block_size / 2) * (in_stride + 1);
100     auto           gy_ptr_0 = static_cast<const int16_t *__restrict>(in2_ptr) - (block_size / 2) * (in_stride + 1);
101     const int16_t *gx_ptr_1 = gx_ptr_0 + 8;
102     const int16_t *gy_ptr_1 = gy_ptr_0 + 8;
103     const auto     output   = static_cast<float *__restrict>(out_ptr);
104
105     /* Gx^2, Gy^2 and Gx*Gy */
106     float16x8_t gx2  = vdupq_n_f16(0.0f);
107     float16x8_t gy2  = vdupq_n_f16(0.0f);
108     float16x8_t gxgy = vdupq_n_f16(0.0f);
109
110     for(size_t i = 0; i < block_size; ++i)
111     {
112         const float16x8_t low_gx  = vcvtq_f16_s16(vld1q_s16(gx_ptr_0));
113         const float16x8_t high_gx = vcvtq_f16_s16(vld1q_s16(gx_ptr_1));
114         const float16x8_t low_gy  = vcvtq_f16_s16(vld1q_s16(gy_ptr_0));
115         const float16x8_t high_gy = vcvtq_f16_s16(vld1q_s16(gy_ptr_1));
116         harris_score_FLOAT_FLOAT_FLOAT<block_size>(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor);
117
118         /* Update gx and gy pointer */
119         gx_ptr_0 += in_stride;
120         gy_ptr_0 += in_stride;
121         gx_ptr_1 += in_stride;
122         gy_ptr_1 += in_stride;
123     }
124
125     /* Calculate harris score */
126     const float16x8_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
127
128     /* Store score */
129     vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(mc)));
130     vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(mc)));
131 }
132
133 template <size_t block_size>
134 inline void harris_score_S32_S32_FLOAT(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out_ptr, int32_t in_stride, float norm_factor, float sensitivity,
135                                        float strength_thresh)
136 {
137     static const float16x8_t zero = vdupq_n_f16(0.0f);
138
139     auto           gx_ptr_0 = static_cast<const int32_t *__restrict>(in1_ptr) - (block_size / 2) * (in_stride + 1);
140     auto           gy_ptr_0 = static_cast<const int32_t *__restrict>(in2_ptr) - (block_size / 2) * (in_stride + 1);
141     const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
142     const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
143     const int32_t *gx_ptr_2 = gx_ptr_0 + 8;
144     const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
145     const auto     output   = static_cast<float *__restrict>(out_ptr);
146
147     /* Gx^2, Gy^2 and Gx*Gy */
148     float16x8_t gx2  = zero;
149     float16x8_t gy2  = zero;
150     float16x8_t gxgy = zero;
151
152     for(size_t i = 0; i < block_size; ++i)
153     {
154         const float16x8_t low_gx = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_0))),
155                                                 vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_1))));
156         const float16x8_t high_gx = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_2))),
157                                                  vget_low_f16(zero));
158         const float16x8_t low_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_0))),
159                                                 vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_1))));
160         const float16x8_t high_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_2))),
161                                                  vget_low_f16(zero));
162         harris_score_FLOAT_FLOAT_FLOAT<block_size>(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor);
163
164         /* Update gx and gy pointer */
165         gx_ptr_0 += in_stride;
166         gy_ptr_0 += in_stride;
167         gx_ptr_1 += in_stride;
168         gy_ptr_1 += in_stride;
169         gx_ptr_2 += in_stride;
170         gy_ptr_2 += in_stride;
171     }
172
173     /* Calculate harris score */
174     const float16x8_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
175
176     /* Store score */
177     vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(mc)));
178     vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(mc)));
179 }
180
181 template <>
182 inline void harris_score_S32_S32_FLOAT<7>(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out_ptr, int32_t in_stride, float norm_factor, float sensitivity,
183                                           float strength_thresh)
184 {
185     static const float16x8_t zero = vdupq_n_f16(0.0f);
186
187     auto           gx_ptr_0 = static_cast<const int32_t *__restrict>(in1_ptr) - 3 * (in_stride + 1);
188     auto           gy_ptr_0 = static_cast<const int32_t *__restrict>(in2_ptr) - 3 * (in_stride + 1);
189     const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
190     const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
191     const int32_t *gx_ptr_2 = gx_ptr_0 + 8;
192     const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
193     const int32_t *gx_ptr_3 = gx_ptr_0 + 12;
194     const int32_t *gy_ptr_3 = gy_ptr_0 + 12;
195     const auto     output   = static_cast<float *__restrict>(out_ptr);
196
197     /* Gx^2, Gy^2 and Gx*Gy */
198     float16x8_t gx2  = zero;
199     float16x8_t gy2  = zero;
200     float16x8_t gxgy = zero;
201
202     for(size_t i = 0; i < 7; ++i)
203     {
204         const float16x8_t low_gx = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_0))),
205                                                 vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_1))));
206         const float16x8_t high_gx = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_2))),
207                                                  vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_3))));
208         const float16x8_t low_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_0))),
209                                                 vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_1))));
210         const float16x8_t high_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_2))),
211                                                  vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_3))));
212         harris_score_FLOAT_FLOAT_FLOAT<7>(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor);
213
214         /* Update gx and gy pointer */
215         gx_ptr_0 += in_stride;
216         gy_ptr_0 += in_stride;
217         gx_ptr_1 += in_stride;
218         gy_ptr_1 += in_stride;
219         gx_ptr_2 += in_stride;
220         gy_ptr_2 += in_stride;
221     }
222
223     /* Calculate harris score */
224     const float16x8_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
225
226     /* Store score */
227     vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(mc)));
228     vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(mc)));
229 }
230
231 } // namespace fp16
232
233 template <int32_t block_size>
234 BorderSize        NEHarrisScoreFP16Kernel<block_size>::border_size() const
235 {
236     return BorderSize(block_size / 2);
237 }
238
239 template <int32_t block_size>
240 NEHarrisScoreFP16Kernel<block_size>::NEHarrisScoreFP16Kernel()
241     : INEHarrisScoreKernel(), _func(nullptr)
242 {
243 }
244
245 template <int32_t block_size>
246 void NEHarrisScoreFP16Kernel<block_size>::run(const Window &window)
247 {
248     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
249     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
250     ARM_COMPUTE_ERROR_ON(_func == nullptr);
251
252     Iterator input1(_input1, window);
253     Iterator input2(_input2, window);
254     Iterator output(_output, window);
255
256     const size_t input_stride = _input1->info()->strides_in_bytes()[1] / element_size_from_data_type(_input1->info()->data_type());
257
258     execute_window_loop(window, [&](const Coordinates & id)
259     {
260         (*_func)(input1.ptr(), input2.ptr(), output.ptr(), input_stride, _norm_factor, _sensitivity, _strength_thresh);
261     },
262     input1, input2, output);
263 }
264
265 template <int32_t block_size>
266 void NEHarrisScoreFP16Kernel<block_size>::configure(const IImage *input1, const IImage *input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity,
267                                                     bool border_undefined)
268 {
269     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input1);
270     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input2);
271     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
272     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::S16, DataType::S32);
273     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S16, DataType::S32);
274     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
275     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
276     ARM_COMPUTE_ERROR_ON(0.0f == norm_factor);
277
278     _input1          = input1;
279     _input2          = input2;
280     _output          = output;
281     _sensitivity     = sensitivity;
282     _strength_thresh = strength_thresh;
283     _norm_factor     = norm_factor;
284
285     if(input1->info()->data_type() == DataType::S16)
286     {
287         _func = &fp16::harris_score_S16_S16_FLOAT<block_size>;
288     }
289     else
290     {
291         _func = &fp16::harris_score_S32_S32_FLOAT<block_size>;
292     }
293
294     ARM_COMPUTE_ERROR_ON(nullptr == _func);
295
296     const unsigned int processed_elements = 8;
297
298     // Configure kernel window
299     Window                  win = calculate_max_window(*input1->info(), Steps(processed_elements), border_undefined, border_size());
300     AccessWindowAutoPadding output_access(output->info());
301
302     update_window_and_padding(win,
303                               AccessWindowAutoPadding(input1->info()),
304                               AccessWindowAutoPadding(input2->info()),
305                               output_access);
306
307     output_access.set_valid_region();
308
309     INEKernel::configure(win);
310 }
311
312 #endif
313
/* Explicit instantiation of the Harris score kernel for block sizes 3, 5 and 7, followed by
 * explicit instantiations of the matching default constructors.
 * NOTE(review): the separate constructor instantiations presumably exist because the class
 * instantiations precede the constructor definition in this file - confirm before removing. */
314 template class arm_compute::NEHarrisScoreKernel<3>;
315 template class arm_compute::NEHarrisScoreKernel<5>;
316 template class arm_compute::NEHarrisScoreKernel<7>;
317 template arm_compute::NEHarrisScoreKernel<3>::NEHarrisScoreKernel();
318 template arm_compute::NEHarrisScoreKernel<5>::NEHarrisScoreKernel();
319 template arm_compute::NEHarrisScoreKernel<7>::NEHarrisScoreKernel();
320
321 namespace
322 {
323 inline float32x4_t harris_score(float32x4_t gx2, float32x4_t gy2, float32x4_t gxgy, float32x4_t sensitivity, float32x4_t strength_thresh)
324 {
325     /* Trace^2 */
326     float32x4_t trace2 = vaddq_f32(gx2, gy2);
327     trace2             = vmulq_f32(trace2, trace2);
328
329     /* Det(A) */
330     float32x4_t det = vmulq_f32(gx2, gy2);
331     det             = vmlsq_f32(det, gxgy, gxgy);
332
333     /* Det(A) - sensitivity * trace^2 */
334     const float32x4_t mc = vmlsq_f32(det, sensitivity, trace2);
335
336     /* mc > strength_thresh */
337     const uint32x4_t mask = vcgtq_f32(mc, strength_thresh);
338
339     return vbslq_f32(mask, mc, vdupq_n_f32(0.0f));
340 }
341
342 inline void harris_score1x3_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t &gx2, float32x4_t &gy2, float32x4_t &gxgy,
343                                               float32x4_t norm_factor)
344 {
345     /* Normalize */
346     low_gx  = vmulq_f32(low_gx, norm_factor);
347     low_gy  = vmulq_f32(low_gy, norm_factor);
348     high_gx = vmulq_f32(high_gx, norm_factor);
349     high_gy = vmulq_f32(high_gy, norm_factor);
350
351     const float32x4_t l_gx = low_gx;
352     const float32x4_t l_gy = low_gy;
353     const float32x4_t m_gx = vextq_f32(low_gx, high_gx, 1);
354     const float32x4_t m_gy = vextq_f32(low_gy, high_gy, 1);
355     const float32x4_t r_gx = vextq_f32(low_gx, high_gx, 2);
356     const float32x4_t r_gy = vextq_f32(low_gy, high_gy, 2);
357
358     /* Gx*Gx*/
359     gx2 = vmlaq_f32(gx2, l_gx, l_gx);
360     gx2 = vmlaq_f32(gx2, m_gx, m_gx);
361     gx2 = vmlaq_f32(gx2, r_gx, r_gx);
362
363     /* Gy*Gy*/
364     gy2 = vmlaq_f32(gy2, l_gy, l_gy);
365     gy2 = vmlaq_f32(gy2, m_gy, m_gy);
366     gy2 = vmlaq_f32(gy2, r_gy, r_gy);
367
368     /* Gx*Gy */
369     gxgy = vmlaq_f32(gxgy, l_gx, l_gy);
370     gxgy = vmlaq_f32(gxgy, m_gx, m_gy);
371     gxgy = vmlaq_f32(gxgy, r_gx, r_gy);
372 }
373
374 inline void harris_score1x5_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t &gx2, float32x4_t &gy2, float32x4_t &gxgy,
375                                               float32x4_t norm_factor)
376 {
377     /* Normalize */
378     low_gx  = vmulq_f32(low_gx, norm_factor);
379     low_gy  = vmulq_f32(low_gy, norm_factor);
380     high_gx = vmulq_f32(high_gx, norm_factor);
381     high_gy = vmulq_f32(high_gy, norm_factor);
382
383     /* L2 values  */
384     float32x4_t gx = low_gx;
385     float32x4_t gy = low_gy;
386
387     /* Accumulate */
388     gx2  = vmlaq_f32(gx2, gx, gx);
389     gy2  = vmlaq_f32(gy2, gy, gy);
390     gxgy = vmlaq_f32(gxgy, gx, gy);
391
392     /* L1 values  */
393     gx = vextq_f32(low_gx, high_gx, 1);
394     gy = vextq_f32(low_gy, high_gy, 1);
395
396     /* Accumulate */
397     gx2  = vmlaq_f32(gx2, gx, gx);
398     gy2  = vmlaq_f32(gy2, gy, gy);
399     gxgy = vmlaq_f32(gxgy, gx, gy);
400
401     /* M values  */
402     gx = vextq_f32(low_gx, high_gx, 2);
403     gy = vextq_f32(low_gy, high_gy, 2);
404
405     /* Accumulate */
406     gx2  = vmlaq_f32(gx2, gx, gx);
407     gy2  = vmlaq_f32(gy2, gy, gy);
408     gxgy = vmlaq_f32(gxgy, gx, gy);
409
410     /* R1 values  */
411     gx = vextq_f32(low_gx, high_gx, 3);
412     gy = vextq_f32(low_gy, high_gy, 3);
413
414     /* Accumulate */
415     gx2  = vmlaq_f32(gx2, gx, gx);
416     gy2  = vmlaq_f32(gy2, gy, gy);
417     gxgy = vmlaq_f32(gxgy, gx, gy);
418
419     /* R2 values  */
420     gx = high_gx;
421     gy = high_gy;
422
423     /* Accumulate */
424     gx2  = vmlaq_f32(gx2, gx, gx);
425     gy2  = vmlaq_f32(gy2, gy, gy);
426     gxgy = vmlaq_f32(gxgy, gx, gy);
427 }
428
429 inline void harris_score1x7_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t high_gx1, float32x4_t high_gy1, float32x4_t &gx2,
430                                               float32x4_t &gy2, float32x4_t &gxgy, float32x4_t norm_factor)
431 {
432     /* Normalize */
433     low_gx  = vmulq_f32(low_gx, norm_factor);
434     low_gy  = vmulq_f32(low_gy, norm_factor);
435     high_gx = vmulq_f32(high_gx, norm_factor);
436     high_gy = vmulq_f32(high_gy, norm_factor);
437
438     /* L3 values  */
439     float32x4_t gx = low_gx;
440     float32x4_t gy = low_gy;
441
442     /* Accumulate */
443     gx2  = vmlaq_f32(gx2, gx, gx);
444     gy2  = vmlaq_f32(gy2, gy, gy);
445     gxgy = vmlaq_f32(gxgy, gx, gy);
446
447     /* L2 values  */
448     gx = vextq_f32(low_gx, high_gx, 1);
449     gy = vextq_f32(low_gy, high_gy, 1);
450
451     /* Accumulate */
452     gx2  = vmlaq_f32(gx2, gx, gx);
453     gy2  = vmlaq_f32(gy2, gy, gy);
454     gxgy = vmlaq_f32(gxgy, gx, gy);
455
456     /* L1 values  */
457     gx = vextq_f32(low_gx, high_gx, 2);
458     gy = vextq_f32(low_gy, high_gy, 2);
459
460     /* Accumulate */
461     gx2  = vmlaq_f32(gx2, gx, gx);
462     gy2  = vmlaq_f32(gy2, gy, gy);
463     gxgy = vmlaq_f32(gxgy, gx, gy);
464
465     /* M values  */
466     gx = vextq_f32(low_gx, high_gx, 3);
467     gy = vextq_f32(low_gy, high_gy, 3);
468
469     /* Accumulate */
470     gx2  = vmlaq_f32(gx2, gx, gx);
471     gy2  = vmlaq_f32(gy2, gy, gy);
472     gxgy = vmlaq_f32(gxgy, gx, gy);
473
474     /* R1 values  */
475     gx = high_gx;
476     gy = high_gy;
477
478     /* Accumulate */
479     gx2  = vmlaq_f32(gx2, gx, gx);
480     gy2  = vmlaq_f32(gy2, gy, gy);
481     gxgy = vmlaq_f32(gxgy, gx, gy);
482
483     /* Change tmp_low and tmp_high for calculating R2 and R3 values */
484     low_gx  = high_gx;
485     low_gy  = high_gy;
486     high_gx = high_gx1;
487     high_gy = high_gy1;
488
489     /* Normalize */
490     high_gx = vmulq_f32(high_gx, norm_factor);
491     high_gy = vmulq_f32(high_gy, norm_factor);
492
493     /* R2 values  */
494     gx = vextq_f32(low_gx, high_gx, 1);
495     gy = vextq_f32(low_gy, high_gy, 1);
496
497     /* Accumulate */
498     gx2  = vmlaq_f32(gx2, gx, gx);
499     gy2  = vmlaq_f32(gy2, gy, gy);
500     gxgy = vmlaq_f32(gxgy, gx, gy);
501
502     /* R3 values  */
503     gx = vextq_f32(low_gx, high_gx, 2);
504     gy = vextq_f32(low_gy, high_gy, 2);
505
506     /* Accumulate */
507     gx2  = vmlaq_f32(gx2, gx, gx);
508     gy2  = vmlaq_f32(gy2, gy, gy);
509     gxgy = vmlaq_f32(gxgy, gx, gy);
510 }
511
512 inline void harris_score3x3_S16_S16_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
513                                           float in_norm_factor, float in_sensitivity, float in_strength_thresh)
514
515 {
516     const auto     gx_ptr_0 = static_cast<const int16_t *__restrict>(input1_ptr) - 1;
517     const auto     gy_ptr_0 = static_cast<const int16_t *__restrict>(input2_ptr) - 1;
518     const int16_t *gx_ptr_1 = gx_ptr_0 + 4;
519     const int16_t *gy_ptr_1 = gy_ptr_0 + 4;
520     const auto     output   = static_cast<float *__restrict>(output_ptr);
521
522     /* Gx^2, Gy^2 and Gx*Gy */
523     float32x4x2_t gx2 =
524     {
525         {
526             vdupq_n_f32(0.0f),
527             vdupq_n_f32(0.0f)
528         }
529     };
530     float32x4x2_t gy2 =
531     {
532         {
533             vdupq_n_f32(0.0f),
534             vdupq_n_f32(0.0f)
535         }
536     };
537     float32x4x2_t gxgy =
538     {
539         {
540             vdupq_n_f32(0.0f),
541             vdupq_n_f32(0.0f)
542         }
543     };
544
545     /* Row0 */
546     int16x8x2_t tmp_gx =
547     {
548         {
549             vld1q_s16(gx_ptr_0 - input_stride),
550             vld1q_s16(gx_ptr_1 - input_stride)
551         }
552     };
553     int16x8x2_t tmp_gy =
554     {
555         {
556             vld1q_s16(gy_ptr_0 - input_stride),
557             vld1q_s16(gy_ptr_1 - input_stride)
558         }
559     };
560     float32x4_t sensitivity     = vdupq_n_f32(in_sensitivity);
561     float32x4_t norm_factor     = vdupq_n_f32(in_norm_factor);
562     float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
563
564     float32x4_t low_gx  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
565     float32x4_t low_gy  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
566     float32x4_t high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
567     float32x4_t high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
568     harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
569
570     low_gx  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
571     low_gy  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
572     high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
573     high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
574     harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
575
576     /* Row1 */
577     tmp_gx.val[0] = vld1q_s16(gx_ptr_0);
578     tmp_gy.val[0] = vld1q_s16(gy_ptr_0);
579     tmp_gx.val[1] = vld1q_s16(gx_ptr_1);
580     tmp_gy.val[1] = vld1q_s16(gy_ptr_1);
581
582     low_gx  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
583     low_gy  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
584     high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
585     high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
586     harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
587
588     low_gx  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
589     low_gy  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
590     high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
591     high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
592     harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
593
594     /* Row2 */
595     tmp_gx.val[0] = vld1q_s16(gx_ptr_0 + input_stride);
596     tmp_gy.val[0] = vld1q_s16(gy_ptr_0 + input_stride);
597     tmp_gx.val[1] = vld1q_s16(gx_ptr_1 + input_stride);
598     tmp_gy.val[1] = vld1q_s16(gy_ptr_1 + input_stride);
599
600     low_gx  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
601     low_gy  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
602     high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
603     high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
604     harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
605
606     low_gx  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
607     low_gy  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
608     high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
609     high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
610     harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
611
612     /* Calculate harris score */
613     const float32x4x2_t mc =
614     {
615         {
616             harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
617             harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
618         }
619     };
620
621     /* Store score */
622     vst1q_f32(output + 0, mc.val[0]);
623     vst1q_f32(output + 4, mc.val[1]);
624 }
625
626 inline void harris_score3x3_S32_S32_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
627                                           float in_norm_factor, float in_sensitivity, float in_strength_thresh)
628 {
629     auto           gx_ptr_0        = static_cast<const int32_t *__restrict>(input1_ptr) - 1;
630     auto           gy_ptr_0        = static_cast<const int32_t *__restrict>(input2_ptr) - 1;
631     const int32_t *gx_ptr_1        = gx_ptr_0 + 4;
632     const int32_t *gy_ptr_1        = gy_ptr_0 + 4;
633     const int32_t *gx_ptr_2        = gx_ptr_0 + 8;
634     const int32_t *gy_ptr_2        = gy_ptr_0 + 8;
635     const auto     output          = static_cast<float *__restrict>(output_ptr);
636     float32x4_t    sensitivity     = vdupq_n_f32(in_sensitivity);
637     float32x4_t    norm_factor     = vdupq_n_f32(in_norm_factor);
638     float32x4_t    strength_thresh = vdupq_n_f32(in_strength_thresh);
639
640     /* Gx^2, Gy^2 and Gx*Gy */
641     float32x4x2_t gx2 =
642     {
643         {
644             vdupq_n_f32(0.0f),
645             vdupq_n_f32(0.0f)
646         }
647     };
648     float32x4x2_t gy2 =
649     {
650         {
651             vdupq_n_f32(0.0f),
652             vdupq_n_f32(0.0f)
653         }
654     };
655     float32x4x2_t gxgy =
656     {
657         {
658             vdupq_n_f32(0.0f),
659             vdupq_n_f32(0.0f)
660         }
661     };
662
663     /* Row0 */
664     float32x4_t low_gx  = vcvtq_f32_s32(vld1q_s32(gx_ptr_0 - input_stride));
665     float32x4_t low_gy  = vcvtq_f32_s32(vld1q_s32(gy_ptr_0 - input_stride));
666     float32x4_t high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 - input_stride));
667     float32x4_t high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 - input_stride));
668     harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
669
670     low_gx  = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 - input_stride));
671     low_gy  = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 - input_stride));
672     high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_2 - input_stride));
673     high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_2 - input_stride));
674     harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
675
676     /* Row1 */
677     low_gx  = vcvtq_f32_s32(vld1q_s32(gx_ptr_0));
678     low_gy  = vcvtq_f32_s32(vld1q_s32(gy_ptr_0));
679     high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
680     high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
681     harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
682
683     low_gx  = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
684     low_gy  = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
685     high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_2));
686     high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_2));
687     harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
688
689     /* Row2 */
690     low_gx  = vcvtq_f32_s32(vld1q_s32(gx_ptr_0 + input_stride));
691     low_gy  = vcvtq_f32_s32(vld1q_s32(gy_ptr_0 + input_stride));
692     high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 + input_stride));
693     high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 + input_stride));
694     harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
695
696     low_gx  = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 + input_stride));
697     low_gy  = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 + input_stride));
698     high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_2 + input_stride));
699     high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_2 + input_stride));
700     harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
701
702     /* Calculate harris score */
703     const float32x4x2_t mc =
704     {
705         {
706             harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
707             harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
708         }
709     };
710
711     /* Store score */
712     vst1q_f32(output + 0, mc.val[0]);
713     vst1q_f32(output + 4, mc.val[1]);
714 }
715
/** Compute 8 consecutive Harris scores from S16 gradients using a 5x5 window.
 *
 * For each of the 5 window rows, 12 consecutive values of Gx and of Gy are read (two overlapping
 * 8-lane loads at element offsets 0 and 4), widened to F32 and handed to
 * harris_score1x5_FLOAT_FLOAT_FLOAT. Values 0-7 accumulate into the sums behind outputs 0-3,
 * values 4-11 into the sums behind outputs 4-7.
 *
 * @param[in]  input1_ptr         Pointer into the Gx image (S16). The window starts 2 columns to the left of and 2 rows above this element.
 * @param[in]  input2_ptr         Pointer into the Gy image (S16), offset the same way as @p input1_ptr.
 * @param[out] output_ptr         Destination of the 8 F32 scores.
 * @param[in]  input_stride       Distance between two rows of the gradient images, in elements. The same stride is applied to both images.
 * @param[in]  in_norm_factor     Normalization factor, broadcast and forwarded to harris_score1x5_FLOAT_FLOAT_FLOAT.
 * @param[in]  in_sensitivity     Sensitivity, broadcast and forwarded to harris_score.
 * @param[in]  in_strength_thresh Strength threshold, broadcast and forwarded to harris_score.
 */
inline void harris_score5x5_S16_S16_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
                                          float in_norm_factor, float in_sensitivity, float in_strength_thresh)
{
    /* Top-left corner of the 5x5 window */
    auto       gx_row = static_cast<const int16_t *__restrict>(input1_ptr) - 2 - 2 * input_stride;
    auto       gy_row = static_cast<const int16_t *__restrict>(input2_ptr) - 2 - 2 * input_stride;
    const auto scores = static_cast<float *__restrict>(output_ptr);

    /* Broadcast the scalar parameters to all lanes */
    float32x4_t k         = vdupq_n_f32(in_sensitivity);
    float32x4_t norm      = vdupq_n_f32(in_norm_factor);
    float32x4_t threshold = vdupq_n_f32(in_strength_thresh);

    /* Accumulators for Gx^2, Gy^2 and Gx*Gy: suffix 0 serves outputs 0-3, suffix 1 serves outputs 4-7 */
    float32x4_t sum_gx2_0  = vdupq_n_f32(0.0f);
    float32x4_t sum_gx2_1  = vdupq_n_f32(0.0f);
    float32x4_t sum_gy2_0  = vdupq_n_f32(0.0f);
    float32x4_t sum_gy2_1  = vdupq_n_f32(0.0f);
    float32x4_t sum_gxgy_0 = vdupq_n_f32(0.0f);
    float32x4_t sum_gxgy_1 = vdupq_n_f32(0.0f);

    for(int row = 0; row < 5; ++row)
    {
        /* 12 values per gradient: elements [0..7] and, overlapping them, elements [4..11] */
        const int16x8_t gx_0_7  = vld1q_s16(gx_row);
        const int16x8_t gx_4_11 = vld1q_s16(gx_row + 4);
        const int16x8_t gy_0_7  = vld1q_s16(gy_row);
        const int16x8_t gy_4_11 = vld1q_s16(gy_row + 4);

        /* Outputs 0-3: widen S16 -> S32 -> F32 and accumulate */
        harris_score1x5_FLOAT_FLOAT_FLOAT(vcvtq_f32_s32(vmovl_s16(vget_low_s16(gx_0_7))),
                                          vcvtq_f32_s32(vmovl_s16(vget_low_s16(gy_0_7))),
                                          vcvtq_f32_s32(vmovl_s16(vget_high_s16(gx_0_7))),
                                          vcvtq_f32_s32(vmovl_s16(vget_high_s16(gy_0_7))),
                                          sum_gx2_0, sum_gy2_0, sum_gxgy_0, norm);

        /* Outputs 4-7 */
        harris_score1x5_FLOAT_FLOAT_FLOAT(vcvtq_f32_s32(vmovl_s16(vget_low_s16(gx_4_11))),
                                          vcvtq_f32_s32(vmovl_s16(vget_low_s16(gy_4_11))),
                                          vcvtq_f32_s32(vmovl_s16(vget_high_s16(gx_4_11))),
                                          vcvtq_f32_s32(vmovl_s16(vget_high_s16(gy_4_11))),
                                          sum_gx2_1, sum_gy2_1, sum_gxgy_1, norm);

        /* Next window row */
        gx_row += input_stride;
        gy_row += input_stride;
    }

    /* Turn the accumulated sums into scores */
    const float32x4_t score_0_3 = harris_score(sum_gx2_0, sum_gy2_0, sum_gxgy_0, k, threshold);
    const float32x4_t score_4_7 = harris_score(sum_gx2_1, sum_gy2_1, sum_gxgy_1, k, threshold);

    /* Write the 8 scores */
    vst1q_f32(scores + 0, score_0_3);
    vst1q_f32(scores + 4, score_4_7);
}
800
/** Compute 8 consecutive Harris scores from S32 gradients using a 5x5 window.
 *
 * For each of the 5 window rows, 12 consecutive values of Gx and of Gy are read as three groups of
 * 4 elements and converted to F32. Groups 0 and 1 accumulate into the sums behind outputs 0-3,
 * groups 1 and 2 into the sums behind outputs 4-7. The shared middle group is loaded and converted
 * only once per row.
 *
 * @param[in]  input1_ptr         Pointer into the Gx image (S32). The window starts 2 columns to the left of and 2 rows above this element.
 * @param[in]  input2_ptr         Pointer into the Gy image (S32), offset the same way as @p input1_ptr.
 * @param[out] output_ptr         Destination of the 8 F32 scores.
 * @param[in]  input_stride       Distance between two rows of the gradient images, in elements. The same stride is applied to both images.
 * @param[in]  in_norm_factor     Normalization factor, broadcast and forwarded to harris_score1x5_FLOAT_FLOAT_FLOAT.
 * @param[in]  in_sensitivity     Sensitivity, broadcast and forwarded to harris_score.
 * @param[in]  in_strength_thresh Strength threshold, broadcast and forwarded to harris_score.
 */
inline void harris_score5x5_S32_S32_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
                                          float in_norm_factor, float in_sensitivity, float in_strength_thresh)
{
    /* Top-left corner of the 5x5 window, followed by the next two groups of 4 columns */
    auto           gx_ptr_0 = static_cast<const int32_t *__restrict>(input1_ptr) - 2 - 2 * input_stride;
    auto           gy_ptr_0 = static_cast<const int32_t *__restrict>(input2_ptr) - 2 - 2 * input_stride;
    const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
    const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
    const int32_t *gx_ptr_2 = gx_ptr_0 + 8;
    const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
    const auto     output   = static_cast<float *__restrict>(output_ptr);

    /* Gx^2, Gy^2 and Gx*Gy: val[0] serves outputs 0-3, val[1] serves outputs 4-7 */
    float32x4x2_t gx2 =
    {
        {
            vdupq_n_f32(0.0f),
            vdupq_n_f32(0.0f)
        }
    };
    float32x4x2_t gy2 =
    {
        {
            vdupq_n_f32(0.0f),
            vdupq_n_f32(0.0f)
        }
    };
    float32x4x2_t gxgy =
    {
        {
            vdupq_n_f32(0.0f),
            vdupq_n_f32(0.0f)
        }
    };
    float32x4_t sensitivity     = vdupq_n_f32(in_sensitivity);
    float32x4_t norm_factor     = vdupq_n_f32(in_norm_factor);
    float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);

    for(int i = 0; i < 5; ++i)
    {
        /* Load and convert each group of 4 columns exactly once */
        const float32x4_t gx_0 = vcvtq_f32_s32(vld1q_s32(gx_ptr_0));
        const float32x4_t gy_0 = vcvtq_f32_s32(vld1q_s32(gy_ptr_0));
        const float32x4_t gx_1 = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
        const float32x4_t gy_1 = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
        const float32x4_t gx_2 = vcvtq_f32_s32(vld1q_s32(gx_ptr_2));
        const float32x4_t gy_2 = vcvtq_f32_s32(vld1q_s32(gy_ptr_2));

        /* Outputs 0-3 use columns 0-7; outputs 4-7 use columns 4-11 (the middle group is shared) */
        harris_score1x5_FLOAT_FLOAT_FLOAT(gx_0, gy_0, gx_1, gy_1, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
        harris_score1x5_FLOAT_FLOAT_FLOAT(gx_1, gy_1, gx_2, gy_2, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);

        /* Update gx and gy pointer */
        gx_ptr_0 += input_stride;
        gy_ptr_0 += input_stride;
        gx_ptr_1 += input_stride;
        gy_ptr_1 += input_stride;
        gx_ptr_2 += input_stride;
        gy_ptr_2 += input_stride;
    }

    /* Calculate harris score */
    const float32x4x2_t mc =
    {
        {
            harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
            harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
        }
    };

    /* Store score */
    vst1q_f32(output + 0, mc.val[0]);
    vst1q_f32(output + 4, mc.val[1]);
}
875
/** Compute 4 consecutive Harris scores from S16 gradients using a 7x7 window.
 *
 * For each of the 7 window rows, 12 consecutive values of Gx and of Gy are read (one 8-lane load
 * followed by one 4-lane load at element offset 8), widened to F32 and handed to
 * harris_score1x7_FLOAT_FLOAT_FLOAT, which updates the three accumulators.
 *
 * @param[in]  input1_ptr         Pointer into the Gx image (S16). The window starts 3 columns to the left of and 3 rows above this element.
 * @param[in]  input2_ptr         Pointer into the Gy image (S16), offset the same way as @p input1_ptr.
 * @param[out] output_ptr         Destination of the 4 F32 scores.
 * @param[in]  input_stride       Distance between two rows of the gradient images, in elements. The same stride is applied to both images.
 * @param[in]  in_norm_factor     Normalization factor, broadcast and forwarded to harris_score1x7_FLOAT_FLOAT_FLOAT.
 * @param[in]  in_sensitivity     Sensitivity, broadcast and forwarded to harris_score.
 * @param[in]  in_strength_thresh Strength threshold, broadcast and forwarded to harris_score.
 */
inline void harris_score7x7_S16_S16_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
                                          float in_norm_factor, float in_sensitivity, float in_strength_thresh)
{
    /* Top-left corner of the 7x7 window */
    auto       gx_row = static_cast<const int16_t *__restrict>(input1_ptr) - 3 - 3 * input_stride;
    auto       gy_row = static_cast<const int16_t *__restrict>(input2_ptr) - 3 - 3 * input_stride;
    const auto scores = static_cast<float *__restrict>(output_ptr);

    /* Broadcast the scalar parameters to all lanes */
    float32x4_t k         = vdupq_n_f32(in_sensitivity);
    float32x4_t norm      = vdupq_n_f32(in_norm_factor);
    float32x4_t threshold = vdupq_n_f32(in_strength_thresh);

    /* Accumulators for Gx^2, Gy^2 and Gx*Gy */
    float32x4_t sum_gx2  = vdupq_n_f32(0.0f);
    float32x4_t sum_gy2  = vdupq_n_f32(0.0f);
    float32x4_t sum_gxgy = vdupq_n_f32(0.0f);

    for(int row = 0; row < 7; ++row)
    {
        /* 12 values per gradient: elements [0..7] plus elements [8..11] */
        const int16x8_t gx_0_7  = vld1q_s16(gx_row);
        const int16x8_t gy_0_7  = vld1q_s16(gy_row);
        const int16x4_t gx_8_11 = vld1_s16(gx_row + 8);
        const int16x4_t gy_8_11 = vld1_s16(gy_row + 8);

        /* Widen S16 -> S32 -> F32, four columns at a time */
        const float32x4_t gx_cols_0 = vcvtq_f32_s32(vmovl_s16(vget_low_s16(gx_0_7)));
        const float32x4_t gy_cols_0 = vcvtq_f32_s32(vmovl_s16(vget_low_s16(gy_0_7)));
        const float32x4_t gx_cols_4 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(gx_0_7)));
        const float32x4_t gy_cols_4 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(gy_0_7)));
        const float32x4_t gx_cols_8 = vcvtq_f32_s32(vmovl_s16(gx_8_11));
        const float32x4_t gy_cols_8 = vcvtq_f32_s32(vmovl_s16(gy_8_11));

        harris_score1x7_FLOAT_FLOAT_FLOAT(gx_cols_0, gy_cols_0, gx_cols_4, gy_cols_4, gx_cols_8, gy_cols_8, sum_gx2, sum_gy2, sum_gxgy, norm);

        /* Next window row */
        gx_row += input_stride;
        gy_row += input_stride;
    }

    /* Turn the accumulated sums into scores and write them out */
    const float32x4_t score = harris_score(sum_gx2, sum_gy2, sum_gxgy, k, threshold);
    vst1q_f32(scores, score);
}
921
/** Compute 4 consecutive Harris scores from S32 gradients using a 7x7 window.
 *
 * For each of the 7 window rows, 12 consecutive values of Gx and of Gy are read as three groups of
 * 4 elements (element offsets 0, 4 and 8), converted to F32 and handed to
 * harris_score1x7_FLOAT_FLOAT_FLOAT, which updates the three accumulators.
 *
 * @param[in]  input1_ptr         Pointer into the Gx image (S32). The window starts 3 columns to the left of and 3 rows above this element.
 * @param[in]  input2_ptr         Pointer into the Gy image (S32), offset the same way as @p input1_ptr.
 * @param[out] output_ptr         Destination of the 4 F32 scores.
 * @param[in]  input_stride       Distance between two rows of the gradient images, in elements. The same stride is applied to both images.
 * @param[in]  in_norm_factor     Normalization factor, broadcast and forwarded to harris_score1x7_FLOAT_FLOAT_FLOAT.
 * @param[in]  in_sensitivity     Sensitivity, broadcast and forwarded to harris_score.
 * @param[in]  in_strength_thresh Strength threshold, broadcast and forwarded to harris_score.
 */
inline void harris_score7x7_S32_S32_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
                                          float in_norm_factor, float in_sensitivity, float in_strength_thresh)
{
    /* Top-left corner of the 7x7 window */
    auto       gx_row = static_cast<const int32_t *__restrict>(input1_ptr) - 3 - 3 * input_stride;
    auto       gy_row = static_cast<const int32_t *__restrict>(input2_ptr) - 3 - 3 * input_stride;
    const auto scores = static_cast<float *__restrict>(output_ptr);

    /* Broadcast the scalar parameters to all lanes */
    float32x4_t k         = vdupq_n_f32(in_sensitivity);
    float32x4_t norm      = vdupq_n_f32(in_norm_factor);
    float32x4_t threshold = vdupq_n_f32(in_strength_thresh);

    /* Accumulators for Gx^2, Gy^2 and Gx*Gy */
    float32x4_t sum_gx2  = vdupq_n_f32(0.0f);
    float32x4_t sum_gy2  = vdupq_n_f32(0.0f);
    float32x4_t sum_gxgy = vdupq_n_f32(0.0f);

    for(int row = 0; row < 7; ++row)
    {
        /* Three groups of 4 columns per gradient, converted S32 -> F32 */
        const float32x4_t gx_cols_0 = vcvtq_f32_s32(vld1q_s32(gx_row));
        const float32x4_t gy_cols_0 = vcvtq_f32_s32(vld1q_s32(gy_row));
        const float32x4_t gx_cols_4 = vcvtq_f32_s32(vld1q_s32(gx_row + 4));
        const float32x4_t gy_cols_4 = vcvtq_f32_s32(vld1q_s32(gy_row + 4));
        const float32x4_t gx_cols_8 = vcvtq_f32_s32(vld1q_s32(gx_row + 8));
        const float32x4_t gy_cols_8 = vcvtq_f32_s32(vld1q_s32(gy_row + 8));

        harris_score1x7_FLOAT_FLOAT_FLOAT(gx_cols_0, gy_cols_0, gx_cols_4, gy_cols_4, gx_cols_8, gy_cols_8, sum_gx2, sum_gy2, sum_gxgy, norm);

        /* Next window row */
        gx_row += input_stride;
        gy_row += input_stride;
    }

    /* Turn the accumulated sums into scores and write them out */
    const float32x4_t score = harris_score(sum_gx2, sum_gy2, sum_gxgy, k, threshold);
    vst1q_f32(scores, score);
}
966
967 } // namespace
968
969 INEHarrisScoreKernel::INEHarrisScoreKernel()
970     : _input1(nullptr), _input2(nullptr), _output(nullptr), _sensitivity(0.0f), _strength_thresh(0.0f), _norm_factor(0.0f)
971 {
972 }
973
974 template <int32_t block_size>
975 NEHarrisScoreKernel<block_size>::NEHarrisScoreKernel()
976     : INEHarrisScoreKernel(), _func(nullptr)
977 {
978 }
979
980 template <int32_t block_size>
981 void NEHarrisScoreKernel<block_size>::run(const Window &window)
982 {
983     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
984     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
985     ARM_COMPUTE_ERROR_ON(_func == nullptr);
986
987     Iterator input1(_input1, window);
988     Iterator input2(_input2, window);
989     Iterator output(_output, window);
990
991     const size_t input_stride = _input1->info()->strides_in_bytes()[1] / element_size_from_data_type(_input1->info()->data_type());
992
993     execute_window_loop(window, [&](const Coordinates & id)
994     {
995         (*_func)(input1.ptr(), input2.ptr(), output.ptr(), input_stride, _norm_factor, _sensitivity, _strength_thresh);
996     },
997     input1, input2, output);
998 }
999
1000 template <int32_t block_size>
1001 BorderSize        NEHarrisScoreKernel<block_size>::border_size() const
1002 {
1003     return BorderSize(block_size / 2);
1004 }
1005
1006 template <int32_t block_size>
1007 void NEHarrisScoreKernel<block_size>::configure(const IImage *input1, const IImage *input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity,
1008                                                 bool border_undefined)
1009 {
1010     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input1);
1011     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input2);
1012     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
1013     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::S16, DataType::S32);
1014     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S16, DataType::S32);
1015     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
1016     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
1017     ARM_COMPUTE_ERROR_ON(0.0f == norm_factor);
1018
1019     _input1          = input1;
1020     _input2          = input2;
1021     _output          = output;
1022     _sensitivity     = sensitivity;
1023     _strength_thresh = strength_thresh;
1024     _norm_factor     = norm_factor;
1025
1026     if(input1->info()->data_type() == DataType::S16)
1027     {
1028         switch(block_size)
1029         {
1030             case 3:
1031                 _func = &harris_score3x3_S16_S16_FLOAT;
1032                 break;
1033             case 5:
1034                 _func = &harris_score5x5_S16_S16_FLOAT;
1035                 break;
1036             case 7:
1037                 _func = &harris_score7x7_S16_S16_FLOAT;
1038                 break;
1039             default:
1040                 ARM_COMPUTE_ERROR("Invalid block size");
1041                 break;
1042         }
1043     }
1044     else
1045     {
1046         switch(block_size)
1047         {
1048             case 3:
1049                 _func = &harris_score3x3_S32_S32_FLOAT;
1050                 break;
1051             case 5:
1052                 _func = &harris_score5x5_S32_S32_FLOAT;
1053                 break;
1054             case 7:
1055                 _func = &harris_score7x7_S32_S32_FLOAT;
1056                 break;
1057             default:
1058                 ARM_COMPUTE_ERROR("Invalid block size");
1059                 break;
1060         }
1061     }
1062
1063     ARM_COMPUTE_ERROR_ON(nullptr == _func);
1064
1065     unsigned int processed_elements = 0;
1066
1067     if(block_size != 7)
1068     {
1069         processed_elements = 8;
1070     }
1071     else
1072     {
1073         processed_elements = 4;
1074     }
1075
1076     // Configure kernel window
1077     Window                  win = calculate_max_window(*input1->info(), Steps(processed_elements), border_undefined, border_size());
1078     AccessWindowAutoPadding output_access(output->info());
1079
1080     update_window_and_padding(win,
1081                               AccessWindowAutoPadding(input1->info()),
1082                               AccessWindowAutoPadding(input2->info()),
1083                               output_access);
1084
1085     output_access.set_valid_region();
1086
1087     INEKernel::configure(win);
1088 }