src/core/NEON/kernels/NEHarrisCornersKernel.cpp

   1 /*
   2  * Copyright (c) 2016, 2017 ARM Limited.
   3  *
   4  * SPDX-License-Identifier: MIT
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to
   8  * deal in the Software without restriction, including without limitation the
   9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  10  * sell copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in all
  14  * copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24 #include "arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h"
  25
  26 #include "arm_compute/core/AccessWindowAutoPadding.h"
  27 #include "arm_compute/core/Coordinates.h"
  28 #include "arm_compute/core/Error.h"
  29 #include "arm_compute/core/Helpers.h"
  30 #include "arm_compute/core/TensorInfo.h"
  31 #include "arm_compute/core/Types.h"
  32 #include "arm_compute/core/Utils.h"
  33 #include "arm_compute/core/Validate.h"
  34 #include "arm_compute/core/Window.h"
  35
  36 #include <algorithm>
  37 #include <arm_neon.h>
  38 #include <cmath>
  39 #include <cstddef>
  40
  41 using namespace arm_compute;
  42
  43 #ifdef ARM_COMPUTE_ENABLE_FP16
  44
  45 template class arm_compute::NEHarrisScoreFP16Kernel<3>;
  46 template class arm_compute::NEHarrisScoreFP16Kernel<5>;
  47 template class arm_compute::NEHarrisScoreFP16Kernel<7>;
  48
  49 namespace fp16
  50 {
  51 inline float16x8_t harris_score(float16x8_t gx2, float16x8_t gy2, float16x8_t gxgy, float sensitivity, float strength_thresh)
  52 {
  53     static const float16x8_t zero = vdupq_n_f16(0.f);
  54
  55     /* Trace^2 */
  56     float16x8_t trace2 = vaddq_f16(gx2, gy2);
  57     trace2             = vmulq_f16(trace2, trace2);
  58
  59     /* Det(A) */
  60     float16x8_t det = vmulq_f16(gx2, gy2);
  61     det             = vfmsq_f16(det, gxgy, gxgy);
  62
  63     /* Det(A) - sensitivity * trace^2 */
  64     const float16x8_t mc = vfmsq_f16(det, vdupq_n_f16(sensitivity), trace2);
  65
  66     /* mc > strength_thresh */
  67     const uint16x8_t mask = vcgtq_f16(mc, vdupq_n_f16(strength_thresh));
  68
  69     return vbslq_f16(mask, mc, zero);
  70 }
  71
  72 template <size_t block_size>
  73 inline void harris_score_FLOAT_FLOAT_FLOAT(float16x8_t low_gx, float16x8_t low_gy, float16x8_t high_gx, float16x8_t high_gy, float16x8_t &gx2, float16x8_t &gy2, float16x8_t &gxgy,
  74                                            float norm_factor)
  75 {
  76     const float16x8_t norm_factor_fp16 = vdupq_n_f16(norm_factor);
  77
  78     /* Normalize */
  79     low_gx  = vmulq_f16(low_gx, norm_factor_fp16);
  80     low_gy  = vmulq_f16(low_gy, norm_factor_fp16);
  81     high_gx = vmulq_f16(high_gx, norm_factor_fp16);
  82     high_gy = vmulq_f16(high_gy, norm_factor_fp16);
  83
  84     for(size_t i = 0; i < block_size; ++i)
  85     {
  86         const float16x8_t gx = vextq_f16(low_gx, high_gx, i);
  87         const float16x8_t gy = vextq_f16(low_gy, high_gy, i);
  88
  89         gx2  = vfmaq_f16(gx2, gx, gx);
  90         gy2  = vfmaq_f16(gy2, gy, gy);
  91         gxgy = vfmaq_f16(gxgy, gx, gy);
  92     }
  93 }
  94
  95 template <size_t block_size>
  96 inline void harris_score_S16_S16_FLOAT(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out_ptr, int32_t in_stride, float norm_factor, float sensitivity,
  97                                        float strength_thresh)
  98 {
  99     auto           gx_ptr_0 = static_cast<const int16_t *__restrict>(in1_ptr) - (block_size / 2) * (in_stride + 1);
 100     auto           gy_ptr_0 = static_cast<const int16_t *__restrict>(in2_ptr) - (block_size / 2) * (in_stride + 1);
 101     const int16_t *gx_ptr_1 = gx_ptr_0 + 8;
 102     const int16_t *gy_ptr_1 = gy_ptr_0 + 8;
 103     const auto     output   = static_cast<float *__restrict>(out_ptr);
 104
 105     /* Gx^2, Gy^2 and Gx*Gy */
 106     float16x8_t gx2  = vdupq_n_f16(0.0f);
 107     float16x8_t gy2  = vdupq_n_f16(0.0f);
 108     float16x8_t gxgy = vdupq_n_f16(0.0f);
 109
 110     for(size_t i = 0; i < block_size; ++i)
 111     {
 112         const float16x8_t low_gx  = vcvtq_f16_s16(vld1q_s16(gx_ptr_0));
 113         const float16x8_t high_gx = vcvtq_f16_s16(vld1q_s16(gx_ptr_1));
 114         const float16x8_t low_gy  = vcvtq_f16_s16(vld1q_s16(gy_ptr_0));
 115         const float16x8_t high_gy = vcvtq_f16_s16(vld1q_s16(gy_ptr_1));
 116         harris_score_FLOAT_FLOAT_FLOAT<block_size>(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor);
 117
 118         /* Update gx and gy pointer */
 119         gx_ptr_0 += in_stride;
 120         gy_ptr_0 += in_stride;
 121         gx_ptr_1 += in_stride;
 122         gy_ptr_1 += in_stride;
 123     }
 124
 125     /* Calculate harris score */
 126     const float16x8_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
 127
 128     /* Store score */
 129     vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(mc)));
 130     vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(mc)));
 131 }
 132
 133 template <size_t block_size>
 134 inline void harris_score_S32_S32_FLOAT(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out_ptr, int32_t in_stride, float norm_factor, float sensitivity,
 135                                        float strength_thresh)
 136 {
 137     static const float16x8_t zero = vdupq_n_f16(0.0f);
 138
 139     auto           gx_ptr_0 = static_cast<const int32_t *__restrict>(in1_ptr) - (block_size / 2) * (in_stride + 1);
 140     auto           gy_ptr_0 = static_cast<const int32_t *__restrict>(in2_ptr) - (block_size / 2) * (in_stride + 1);
 141     const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
 142     const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
 143     const int32_t *gx_ptr_2 = gx_ptr_0 + 8;
 144     const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
 145     const auto     output   = static_cast<float *__restrict>(out_ptr);
 146
 147     /* Gx^2, Gy^2 and Gx*Gy */
 148     float16x8_t gx2  = zero;
 149     float16x8_t gy2  = zero;
 150     float16x8_t gxgy = zero;
 151
 152     for(size_t i = 0; i < block_size; ++i)
 153     {
 154         const float16x8_t low_gx = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_0))),
 155                                                 vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_1))));
 156         const float16x8_t high_gx = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_2))),
 157                                                  vget_low_f16(zero));
 158         const float16x8_t low_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_0))),
 159                                                 vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_1))));
 160         const float16x8_t high_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_2))),
 161                                                  vget_low_f16(zero));
 162         harris_score_FLOAT_FLOAT_FLOAT<block_size>(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor);
 163
 164         /* Update gx and gy pointer */
 165         gx_ptr_0 += in_stride;
 166         gy_ptr_0 += in_stride;
 167         gx_ptr_1 += in_stride;
 168         gy_ptr_1 += in_stride;
 169         gx_ptr_2 += in_stride;
 170         gy_ptr_2 += in_stride;
 171     }
 172
 173     /* Calculate harris score */
 174     const float16x8_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
 175
 176     /* Store score */
 177     vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(mc)));
 178     vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(mc)));
 179 }
 180
 181 template <>
 182 inline void harris_score_S32_S32_FLOAT<7>(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out_ptr, int32_t in_stride, float norm_factor, float sensitivity,
 183                                           float strength_thresh)
 184 {
 185     static const float16x8_t zero = vdupq_n_f16(0.0f);
 186
 187     auto           gx_ptr_0 = static_cast<const int32_t *__restrict>(in1_ptr) - 3 * (in_stride + 1);
 188     auto           gy_ptr_0 = static_cast<const int32_t *__restrict>(in2_ptr) - 3 * (in_stride + 1);
 189     const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
 190     const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
 191     const int32_t *gx_ptr_2 = gx_ptr_0 + 8;
 192     const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
 193     const int32_t *gx_ptr_3 = gx_ptr_0 + 12;
 194     const int32_t *gy_ptr_3 = gy_ptr_0 + 12;
 195     const auto     output   = static_cast<float *__restrict>(out_ptr);
 196
 197     /* Gx^2, Gy^2 and Gx*Gy */
 198     float16x8_t gx2  = zero;
 199     float16x8_t gy2  = zero;
 200     float16x8_t gxgy = zero;
 201
 202     for(size_t i = 0; i < 7; ++i)
 203     {
 204         const float16x8_t low_gx = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_0))),
 205                                                 vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_1))));
 206         const float16x8_t high_gx = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_2))),
 207                                                  vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_3))));
 208         const float16x8_t low_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_0))),
 209                                                 vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_1))));
 210         const float16x8_t high_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_2))),
 211                                                  vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_3))));
 212         harris_score_FLOAT_FLOAT_FLOAT<7>(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor);
 213
 214         /* Update gx and gy pointer */
 215         gx_ptr_0 += in_stride;
 216         gy_ptr_0 += in_stride;
 217         gx_ptr_1 += in_stride;
 218         gy_ptr_1 += in_stride;
 219         gx_ptr_2 += in_stride;
 220         gy_ptr_2 += in_stride;
 221     }
 222
 223     /* Calculate harris score */
 224     const float16x8_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
 225
 226     /* Store score */
 227     vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(mc)));
 228     vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(mc)));
 229 }
 230
 231 } // namespace fp16
 232
 233 template <int32_t block_size>
 234 BorderSize        NEHarrisScoreFP16Kernel<block_size>::border_size() const
 235 {
 236     return BorderSize(block_size / 2);
 237 }
 238
 239 template <int32_t block_size>
 240 NEHarrisScoreFP16Kernel<block_size>::NEHarrisScoreFP16Kernel()
 241     : INEHarrisScoreKernel(), _func(nullptr)
 242 {
 243 }
 244
 245 template <int32_t block_size>
 246 void NEHarrisScoreFP16Kernel<block_size>::run(const Window &window)
 247 {
 248     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
 249     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 250     ARM_COMPUTE_ERROR_ON(_func == nullptr);
 251
 252     Iterator input1(_input1, window);
 253     Iterator input2(_input2, window);
 254     Iterator output(_output, window);
 255
 256     const size_t input_stride = _input1->info()->strides_in_bytes()[1] / element_size_from_data_type(_input1->info()->data_type());
 257
 258     execute_window_loop(window, [&](const Coordinates & id)
 259     {
 260         (*_func)(input1.ptr(), input2.ptr(), output.ptr(), input_stride, _norm_factor, _sensitivity, _strength_thresh);
 261     },
 262     input1, input2, output);
 263 }
 264
 265 template <int32_t block_size>
 266 void NEHarrisScoreFP16Kernel<block_size>::configure(const IImage *input1, const IImage *input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity,
 267                                                     bool border_undefined)
 268 {
 269     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input1);
 270     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input2);
 271     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
 272     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::S16, DataType::S32);
 273     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S16, DataType::S32);
 274     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
 275     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
 276     ARM_COMPUTE_ERROR_ON(0.0f == norm_factor);
 277
 278     _input1          = input1;
 279     _input2          = input2;
 280     _output          = output;
 281     _sensitivity     = sensitivity;
 282     _strength_thresh = strength_thresh;
 283     _norm_factor     = norm_factor;
 284
 285     if(input1->info()->data_type() == DataType::S16)
 286     {
 287         _func = &fp16::harris_score_S16_S16_FLOAT<block_size>;
 288     }
 289     else
 290     {
 291         _func = &fp16::harris_score_S32_S32_FLOAT<block_size>;
 292     }
 293
 294     ARM_COMPUTE_ERROR_ON(nullptr == _func);
 295
 296     const unsigned int processed_elements = 8;
 297
 298     // Configure kernel window
 299     Window                  win = calculate_max_window(*input1->info(), Steps(processed_elements), border_undefined, border_size());
 300     AccessWindowAutoPadding output_access(output->info());
 301
 302     update_window_and_padding(win,
 303                               AccessWindowAutoPadding(input1->info()),
 304                               AccessWindowAutoPadding(input2->info()),
 305                               output_access);
 306
 307     output_access.set_valid_region();
 308
 309     INEKernel::configure(win);
 310 }
 311
 312 #endif
 313
 314 template class arm_compute::NEHarrisScoreKernel<3>;
 315 template class arm_compute::NEHarrisScoreKernel<5>;
 316 template class arm_compute::NEHarrisScoreKernel<7>;
 317 template arm_compute::NEHarrisScoreKernel<3>::NEHarrisScoreKernel();
 318 template arm_compute::NEHarrisScoreKernel<5>::NEHarrisScoreKernel();
 319 template arm_compute::NEHarrisScoreKernel<7>::NEHarrisScoreKernel();
 320
 321 namespace
 322 {
 323 inline float32x4_t harris_score(float32x4_t gx2, float32x4_t gy2, float32x4_t gxgy, float32x4_t sensitivity, float32x4_t strength_thresh)
 324 {
 325     /* Trace^2 */
 326     float32x4_t trace2 = vaddq_f32(gx2, gy2);
 327     trace2             = vmulq_f32(trace2, trace2);
 328
 329     /* Det(A) */
 330     float32x4_t det = vmulq_f32(gx2, gy2);
 331     det             = vmlsq_f32(det, gxgy, gxgy);
 332
 333     /* Det(A) - sensitivity * trace^2 */
 334     const float32x4_t mc = vmlsq_f32(det, sensitivity, trace2);
 335
 336     /* mc > strength_thresh */
 337     const uint32x4_t mask = vcgtq_f32(mc, strength_thresh);
 338
 339     return vbslq_f32(mask, mc, vdupq_n_f32(0.0f));
 340 }
 341
 342 inline void harris_score1x3_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t &gx2, float32x4_t &gy2, float32x4_t &gxgy,
 343                                               float32x4_t norm_factor)
 344 {
 345     /* Normalize */
 346     low_gx  = vmulq_f32(low_gx, norm_factor);
 347     low_gy  = vmulq_f32(low_gy, norm_factor);
 348     high_gx = vmulq_f32(high_gx, norm_factor);
 349     high_gy = vmulq_f32(high_gy, norm_factor);
 350
 351     const float32x4_t l_gx = low_gx;
 352     const float32x4_t l_gy = low_gy;
 353     const float32x4_t m_gx = vextq_f32(low_gx, high_gx, 1);
 354     const float32x4_t m_gy = vextq_f32(low_gy, high_gy, 1);
 355     const float32x4_t r_gx = vextq_f32(low_gx, high_gx, 2);
 356     const float32x4_t r_gy = vextq_f32(low_gy, high_gy, 2);
 357
 358     /* Gx*Gx*/
 359     gx2 = vmlaq_f32(gx2, l_gx, l_gx);
 360     gx2 = vmlaq_f32(gx2, m_gx, m_gx);
 361     gx2 = vmlaq_f32(gx2, r_gx, r_gx);
 362
 363     /* Gy*Gy*/
 364     gy2 = vmlaq_f32(gy2, l_gy, l_gy);
 365     gy2 = vmlaq_f32(gy2, m_gy, m_gy);
 366     gy2 = vmlaq_f32(gy2, r_gy, r_gy);
 367
 368     /* Gx*Gy */
 369     gxgy = vmlaq_f32(gxgy, l_gx, l_gy);
 370     gxgy = vmlaq_f32(gxgy, m_gx, m_gy);
 371     gxgy = vmlaq_f32(gxgy, r_gx, r_gy);
 372 }
 373
 374 inline void harris_score1x5_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t &gx2, float32x4_t &gy2, float32x4_t &gxgy,
 375                                               float32x4_t norm_factor)
 376 {
 377     /* Normalize */
 378     low_gx  = vmulq_f32(low_gx, norm_factor);
 379     low_gy  = vmulq_f32(low_gy, norm_factor);
 380     high_gx = vmulq_f32(high_gx, norm_factor);
 381     high_gy = vmulq_f32(high_gy, norm_factor);
 382
 383     /* L2 values  */
 384     float32x4_t gx = low_gx;
 385     float32x4_t gy = low_gy;
 386
 387     /* Accumulate */
 388     gx2  = vmlaq_f32(gx2, gx, gx);
 389     gy2  = vmlaq_f32(gy2, gy, gy);
 390     gxgy = vmlaq_f32(gxgy, gx, gy);
 391
 392     /* L1 values  */
 393     gx = vextq_f32(low_gx, high_gx, 1);
 394     gy = vextq_f32(low_gy, high_gy, 1);
 395
 396     /* Accumulate */
 397     gx2  = vmlaq_f32(gx2, gx, gx);
 398     gy2  = vmlaq_f32(gy2, gy, gy);
 399     gxgy = vmlaq_f32(gxgy, gx, gy);
 400
 401     /* M values  */
 402     gx = vextq_f32(low_gx, high_gx, 2);
 403     gy = vextq_f32(low_gy, high_gy, 2);
 404
 405     /* Accumulate */
 406     gx2  = vmlaq_f32(gx2, gx, gx);
 407     gy2  = vmlaq_f32(gy2, gy, gy);
 408     gxgy = vmlaq_f32(gxgy, gx, gy);
 409
 410     /* R1 values  */
 411     gx = vextq_f32(low_gx, high_gx, 3);
 412     gy = vextq_f32(low_gy, high_gy, 3);
 413
 414     /* Accumulate */
 415     gx2  = vmlaq_f32(gx2, gx, gx);
 416     gy2  = vmlaq_f32(gy2, gy, gy);
 417     gxgy = vmlaq_f32(gxgy, gx, gy);
 418
 419     /* R2 values  */
 420     gx = high_gx;
 421     gy = high_gy;
 422
 423     /* Accumulate */
 424     gx2  = vmlaq_f32(gx2, gx, gx);
 425     gy2  = vmlaq_f32(gy2, gy, gy);
 426     gxgy = vmlaq_f32(gxgy, gx, gy);
 427 }
 428
 429 inline void harris_score1x7_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t high_gx1, float32x4_t high_gy1, float32x4_t &gx2,
 430                                               float32x4_t &gy2, float32x4_t &gxgy, float32x4_t norm_factor)
 431 {
 432     /* Normalize */
 433     low_gx  = vmulq_f32(low_gx, norm_factor);
 434     low_gy  = vmulq_f32(low_gy, norm_factor);
 435     high_gx = vmulq_f32(high_gx, norm_factor);
 436     high_gy = vmulq_f32(high_gy, norm_factor);
 437
 438     /* L3 values  */
 439     float32x4_t gx = low_gx;
 440     float32x4_t gy = low_gy;
 441
 442     /* Accumulate */
 443     gx2  = vmlaq_f32(gx2, gx, gx);
 444     gy2  = vmlaq_f32(gy2, gy, gy);
 445     gxgy = vmlaq_f32(gxgy, gx, gy);
 446
 447     /* L2 values  */
 448     gx = vextq_f32(low_gx, high_gx, 1);
 449     gy = vextq_f32(low_gy, high_gy, 1);
 450
 451     /* Accumulate */
 452     gx2  = vmlaq_f32(gx2, gx, gx);
 453     gy2  = vmlaq_f32(gy2, gy, gy);
 454     gxgy = vmlaq_f32(gxgy, gx, gy);
 455
 456     /* L1 values  */
 457     gx = vextq_f32(low_gx, high_gx, 2);
 458     gy = vextq_f32(low_gy, high_gy, 2);
 459
 460     /* Accumulate */
 461     gx2  = vmlaq_f32(gx2, gx, gx);
 462     gy2  = vmlaq_f32(gy2, gy, gy);
 463     gxgy = vmlaq_f32(gxgy, gx, gy);
 464
 465     /* M values  */
 466     gx = vextq_f32(low_gx, high_gx, 3);
 467     gy = vextq_f32(low_gy, high_gy, 3);
 468
 469     /* Accumulate */
 470     gx2  = vmlaq_f32(gx2, gx, gx);
 471     gy2  = vmlaq_f32(gy2, gy, gy);
 472     gxgy = vmlaq_f32(gxgy, gx, gy);
 473
 474     /* R1 values  */
 475     gx = high_gx;
 476     gy = high_gy;
 477
 478     /* Accumulate */
 479     gx2  = vmlaq_f32(gx2, gx, gx);
 480     gy2  = vmlaq_f32(gy2, gy, gy);
 481     gxgy = vmlaq_f32(gxgy, gx, gy);
 482
 483     /* Change tmp_low and tmp_high for calculating R2 and R3 values */
 484     low_gx  = high_gx;
 485     low_gy  = high_gy;
 486     high_gx = high_gx1;
 487     high_gy = high_gy1;
 488
 489     /* Normalize */
 490     high_gx = vmulq_f32(high_gx, norm_factor);
 491     high_gy = vmulq_f32(high_gy, norm_factor);
 492
 493     /* R2 values  */
 494     gx = vextq_f32(low_gx, high_gx, 1);
 495     gy = vextq_f32(low_gy, high_gy, 1);
 496
 497     /* Accumulate */
 498     gx2  = vmlaq_f32(gx2, gx, gx);
 499     gy2  = vmlaq_f32(gy2, gy, gy);
 500     gxgy = vmlaq_f32(gxgy, gx, gy);
 501
 502     /* R3 values  */
 503     gx = vextq_f32(low_gx, high_gx, 2);
 504     gy = vextq_f32(low_gy, high_gy, 2);
 505
 506     /* Accumulate */
 507     gx2  = vmlaq_f32(gx2, gx, gx);
 508     gy2  = vmlaq_f32(gy2, gy, gy);
 509     gxgy = vmlaq_f32(gxgy, gx, gy);
 510 }
 511
 512 inline void harris_score3x3_S16_S16_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
 513                                           float in_norm_factor, float in_sensitivity, float in_strength_thresh)
 514
 515 {
 516     const auto     gx_ptr_0 = static_cast<const int16_t *__restrict>(input1_ptr) - 1;
 517     const auto     gy_ptr_0 = static_cast<const int16_t *__restrict>(input2_ptr) - 1;
 518     const int16_t *gx_ptr_1 = gx_ptr_0 + 4;
 519     const int16_t *gy_ptr_1 = gy_ptr_0 + 4;
 520     const auto     output   = static_cast<float *__restrict>(output_ptr);
 521
 522     /* Gx^2, Gy^2 and Gx*Gy */
 523     float32x4x2_t gx2 =
 524     {
 525         {
 526             vdupq_n_f32(0.0f),
 527             vdupq_n_f32(0.0f)
 528         }
 529     };
 530     float32x4x2_t gy2 =
 531     {
 532         {
 533             vdupq_n_f32(0.0f),
 534             vdupq_n_f32(0.0f)
 535         }
 536     };
 537     float32x4x2_t gxgy =
 538     {
 539         {
 540             vdupq_n_f32(0.0f),
 541             vdupq_n_f32(0.0f)
 542         }
 543     };
 544
 545     /* Row0 */
 546     int16x8x2_t tmp_gx =
 547     {
 548         {
 549             vld1q_s16(gx_ptr_0 - input_stride),
 550             vld1q_s16(gx_ptr_1 - input_stride)
 551         }
 552     };
 553     int16x8x2_t tmp_gy =
 554     {
 555         {
 556             vld1q_s16(gy_ptr_0 - input_stride),
 557             vld1q_s16(gy_ptr_1 - input_stride)
 558         }
 559     };
 560     float32x4_t sensitivity     = vdupq_n_f32(in_sensitivity);
 561     float32x4_t norm_factor     = vdupq_n_f32(in_norm_factor);
 562     float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
 563
 564     float32x4_t low_gx  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
 565     float32x4_t low_gy  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
 566     float32x4_t high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
 567     float32x4_t high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
 568     harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
 569
 570     low_gx  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
 571     low_gy  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
 572     high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
 573     high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
 574     harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
 575
 576     /* Row1 */
 577     tmp_gx.val[0] = vld1q_s16(gx_ptr_0);
 578     tmp_gy.val[0] = vld1q_s16(gy_ptr_0);
 579     tmp_gx.val[1] = vld1q_s16(gx_ptr_1);
 580     tmp_gy.val[1] = vld1q_s16(gy_ptr_1);
 581
 582     low_gx  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
 583     low_gy  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
 584     high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
 585     high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
 586     harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
 587
 588     low_gx  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
 589     low_gy  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
 590     high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
 591     high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
 592     harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
 593
 594     /* Row2 */
 595     tmp_gx.val[0] = vld1q_s16(gx_ptr_0 + input_stride);
 596     tmp_gy.val[0] = vld1q_s16(gy_ptr_0 + input_stride);
 597     tmp_gx.val[1] = vld1q_s16(gx_ptr_1 + input_stride);
 598     tmp_gy.val[1] = vld1q_s16(gy_ptr_1 + input_stride);
 599
 600     low_gx  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
 601     low_gy  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
 602     high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
 603     high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
 604     harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
 605
 606     low_gx  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
 607     low_gy  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
 608     high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
 609     high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
 610     harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
 611
 612     /* Calculate harris score */
 613     const float32x4x2_t mc =
 614     {
 615         {
 616             harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
 617             harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
 618         }
 619     };
 620
 621     /* Store score */
 622     vst1q_f32(output + 0, mc.val[0]);
 623     vst1q_f32(output + 4, mc.val[1]);
 624 }
 625
 626 inline void harris_score3x3_S32_S32_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
 627                                           float in_norm_factor, float in_sensitivity, float in_strength_thresh)
 628 {
 629     auto           gx_ptr_0        = static_cast<const int32_t *__restrict>(input1_ptr) - 1;
 630     auto           gy_ptr_0        = static_cast<const int32_t *__restrict>(input2_ptr) - 1;
 631     const int32_t *gx_ptr_1        = gx_ptr_0 + 4;
 632     const int32_t *gy_ptr_1        = gy_ptr_0 + 4;
 633     const int32_t *gx_ptr_2        = gx_ptr_0 + 8;
 634     const int32_t *gy_ptr_2        = gy_ptr_0 + 8;
 635     const auto     output          = static_cast<float *__restrict>(output_ptr);
 636     float32x4_t    sensitivity     = vdupq_n_f32(in_sensitivity);
 637     float32x4_t    norm_factor     = vdupq_n_f32(in_norm_factor);
 638     float32x4_t    strength_thresh = vdupq_n_f32(in_strength_thresh);
 639
 640     /* Gx^2, Gy^2 and Gx*Gy */
 641     float32x4x2_t gx2 =
 642     {
 643         {
 644             vdupq_n_f32(0.0f),
 645             vdupq_n_f32(0.0f)
 646         }
 647     };
 648     float32x4x2_t gy2 =
 649     {
 650         {
 651             vdupq_n_f32(0.0f),
 652             vdupq_n_f32(0.0f)
 653         }
 654     };
 655     float32x4x2_t gxgy =
 656     {
 657         {
 658             vdupq_n_f32(0.0f),
 659             vdupq_n_f32(0.0f)
 660         }
 661     };
 662
 663     /* Row0 */
 664     float32x4_t low_gx  = vcvtq_f32_s32(vld1q_s32(gx_ptr_0 - input_stride));
 665     float32x4_t low_gy  = vcvtq_f32_s32(vld1q_s32(gy_ptr_0 - input_stride));
 666     float32x4_t high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 - input_stride));
 667     float32x4_t high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 - input_stride));
 668     harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
 669
 670     low_gx  = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 - input_stride));
 671     low_gy  = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 - input_stride));
 672     high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_2 - input_stride));
 673     high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_2 - input_stride));
 674     harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
 675
 676     /* Row1 */
 677     low_gx  = vcvtq_f32_s32(vld1q_s32(gx_ptr_0));
 678     low_gy  = vcvtq_f32_s32(vld1q_s32(gy_ptr_0));
 679     high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
 680     high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
 681     harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
 682
 683     low_gx  = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
 684     low_gy  = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
 685     high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_2));
 686     high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_2));
 687     harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
 688
 689     /* Row2 */
 690     low_gx  = vcvtq_f32_s32(vld1q_s32(gx_ptr_0 + input_stride));
 691     low_gy  = vcvtq_f32_s32(vld1q_s32(gy_ptr_0 + input_stride));
 692     high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 + input_stride));
 693     high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 + input_stride));
 694     harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
 695
 696     low_gx  = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 + input_stride));
 697     low_gy  = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 + input_stride));
 698     high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_2 + input_stride));
 699     high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_2 + input_stride));
 700     harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
 701
 702     /* Calculate harris score */
 703     const float32x4x2_t mc =
 704     {
 705         {
 706             harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
 707             harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
 708         }
 709     };
 710
 711     /* Store score */
 712     vst1q_f32(output + 0, mc.val[0]);
 713     vst1q_f32(output + 4, mc.val[1]);
 714 }
 715
 716 inline void harris_score5x5_S16_S16_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
 717                                           float in_norm_factor, float in_sensitivity, float in_strength_thresh)
 718 {
 719     auto           gx_ptr_0 = static_cast<const int16_t *__restrict>(input1_ptr) - 2 - 2 * input_stride;
 720     auto           gy_ptr_0 = static_cast<const int16_t *__restrict>(input2_ptr) - 2 - 2 * input_stride;
 721     const int16_t *gx_ptr_1 = gx_ptr_0 + 4;
 722     const int16_t *gy_ptr_1 = gy_ptr_0 + 4;
 723     const auto     output   = static_cast<float *__restrict>(output_ptr);
 724
 725     /* Gx^2, Gy^2 and Gx*Gy */
 726     float32x4x2_t gx2 =
 727     {
 728         {
 729             vdupq_n_f32(0.0f),
 730             vdupq_n_f32(0.0f)
 731         }
 732     };
 733     float32x4x2_t gy2 =
 734     {
 735         {
 736             vdupq_n_f32(0.0f),
 737             vdupq_n_f32(0.0f)
 738         }
 739     };
 740     float32x4x2_t gxgy =
 741     {
 742         {
 743             vdupq_n_f32(0.0f),
 744             vdupq_n_f32(0.0f)
 745         }
 746     };
 747     float32x4_t sensitivity     = vdupq_n_f32(in_sensitivity);
 748     float32x4_t norm_factor     = vdupq_n_f32(in_norm_factor);
 749     float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
 750
 751     for(int i = 0; i < 5; ++i)
 752     {
 753         const int16x8x2_t tmp_gx =
 754         {
 755             {
 756                 vld1q_s16(gx_ptr_0),
 757                 vld1q_s16(gx_ptr_1)
 758             }
 759         };
 760         const int16x8x2_t tmp_gy =
 761         {
 762             {
 763                 vld1q_s16(gy_ptr_0),
 764                 vld1q_s16(gy_ptr_1)
 765             }
 766         };
 767
 768         float32x4_t low_gx  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
 769         float32x4_t low_gy  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
 770         float32x4_t high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
 771         float32x4_t high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
 772         harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
 773
 774         low_gx  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
 775         low_gy  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
 776         high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
 777         high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
 778         harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
 779
 780         /* Update gx and gy pointer */
 781         gx_ptr_0 += input_stride;
 782         gy_ptr_0 += input_stride;
 783         gx_ptr_1 += input_stride;
 784         gy_ptr_1 += input_stride;
 785     }
 786
 787     /* Calculate harris score */
 788     const float32x4x2_t mc =
 789     {
 790         {
 791             harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
 792             harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
 793         }
 794     };
 795
 796     /* Store score */
 797     vst1q_f32(output + 0, mc.val[0]);
 798     vst1q_f32(output + 4, mc.val[1]);
 799 }
 800
 801 inline void harris_score5x5_S32_S32_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
 802                                           float in_norm_factor, float in_sensitivity, float in_strength_thresh)
 803
 804 {
 805     auto           gx_ptr_0 = static_cast<const int32_t *__restrict>(input1_ptr) - 2 - 2 * input_stride;
 806     auto           gy_ptr_0 = static_cast<const int32_t *__restrict>(input2_ptr) - 2 - 2 * input_stride;
 807     const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
 808     const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
 809     const int32_t *gx_ptr_2 = gx_ptr_0 + 8;
 810     const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
 811     const auto     output   = static_cast<float *__restrict>(output_ptr);
 812
 813     /* Gx^2, Gy^2 and Gx*Gy */
 814     float32x4x2_t gx2 =
 815     {
 816         {
 817             vdupq_n_f32(0.0f),
 818             vdupq_n_f32(0.0f)
 819         }
 820     };
 821     float32x4x2_t gy2 =
 822     {
 823         {
 824             vdupq_n_f32(0.0f),
 825             vdupq_n_f32(0.0f)
 826         }
 827     };
 828     float32x4x2_t gxgy =
 829     {
 830         {
 831             vdupq_n_f32(0.0f),
 832             vdupq_n_f32(0.0f)
 833         }
 834     };
 835     float32x4_t sensitivity     = vdupq_n_f32(in_sensitivity);
 836     float32x4_t norm_factor     = vdupq_n_f32(in_norm_factor);
 837     float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
 838
 839     for(int i = 0; i < 5; ++i)
 840     {
 841         const float32x4_t low_gx_0  = vcvtq_f32_s32(vld1q_s32(gx_ptr_0));
 842         const float32x4_t low_gy_0  = vcvtq_f32_s32(vld1q_s32(gy_ptr_0));
 843         const float32x4_t high_gx_0 = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
 844         const float32x4_t high_gy_0 = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
 845         harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx_0, low_gy_0, high_gx_0, high_gy_0, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
 846
 847         const float32x4_t low_gx_1  = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
 848         const float32x4_t low_gy_1  = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
 849         const float32x4_t high_gx_1 = vcvtq_f32_s32(vld1q_s32(gx_ptr_2));
 850         const float32x4_t high_gy_1 = vcvtq_f32_s32(vld1q_s32(gy_ptr_2));
 851         harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx_1, low_gy_1, high_gx_1, high_gy_1, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
 852
 853         /* Update gx and gy pointer */
 854         gx_ptr_0 += input_stride;
 855         gy_ptr_0 += input_stride;
 856         gx_ptr_1 += input_stride;
 857         gy_ptr_1 += input_stride;
 858         gx_ptr_2 += input_stride;
 859         gy_ptr_2 += input_stride;
 860     }
 861
 862     /* Calculate harris score */
 863     const float32x4x2_t mc =
 864     {
 865         {
 866             harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
 867             harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
 868         }
 869     };
 870
 871     /* Store score */
 872     vst1q_f32(output + 0, mc.val[0]);
 873     vst1q_f32(output + 4, mc.val[1]);
 874 }
 875
 876 inline void harris_score7x7_S16_S16_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
 877                                           float in_norm_factor, float in_sensitivity, float in_strength_thresh)
 878 {
 879     auto           gx_ptr_0 = static_cast<const int16_t *__restrict>(input1_ptr) - 3 - 3 * input_stride;
 880     auto           gy_ptr_0 = static_cast<const int16_t *__restrict>(input2_ptr) - 3 - 3 * input_stride;
 881     const int16_t *gx_ptr_1 = gx_ptr_0 + 8;
 882     const int16_t *gy_ptr_1 = gy_ptr_0 + 8;
 883     const auto     output   = static_cast<float *__restrict>(output_ptr);
 884
 885     /* Gx^2, Gy^2 and Gx*Gy */
 886     float32x4_t gx2             = vdupq_n_f32(0.0f);
 887     float32x4_t gy2             = vdupq_n_f32(0.0f);
 888     float32x4_t gxgy            = vdupq_n_f32(0.0f);
 889     float32x4_t sensitivity     = vdupq_n_f32(in_sensitivity);
 890     float32x4_t norm_factor     = vdupq_n_f32(in_norm_factor);
 891     float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
 892
 893     for(int i = 0; i < 7; ++i)
 894     {
 895         const int16x8_t tmp0_gx = vld1q_s16(gx_ptr_0);
 896         const int16x8_t tmp0_gy = vld1q_s16(gy_ptr_0);
 897         const int16x4_t tmp1_gx = vld1_s16(gx_ptr_1);
 898         const int16x4_t tmp1_gy = vld1_s16(gy_ptr_1);
 899
 900         float32x4_t low_gx   = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp0_gx)));
 901         float32x4_t low_gy   = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp0_gy)));
 902         float32x4_t high_gx  = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp0_gx)));
 903         float32x4_t high_gy  = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp0_gy)));
 904         float32x4_t high_gx1 = vcvtq_f32_s32(vmovl_s16(tmp1_gx));
 905         float32x4_t high_gy1 = vcvtq_f32_s32(vmovl_s16(tmp1_gy));
 906         harris_score1x7_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, high_gx1, high_gy1, gx2, gy2, gxgy, norm_factor);
 907
 908         /* Update gx and gy pointer */
 909         gx_ptr_0 += input_stride;
 910         gy_ptr_0 += input_stride;
 911         gx_ptr_1 += input_stride;
 912         gy_ptr_1 += input_stride;
 913     }
 914
 915     /* Calculate harris score */
 916     const float32x4_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
 917
 918     /* Store score */
 919     vst1q_f32(output, mc);
 920 }
 921
 922 inline void harris_score7x7_S32_S32_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
 923                                           float in_norm_factor, float in_sensitivity, float in_strength_thresh)
 924 {
 925     auto           gx_ptr_0 = static_cast<const int32_t *__restrict>(input1_ptr) - 3 - 3 * input_stride;
 926     auto           gy_ptr_0 = static_cast<const int32_t *__restrict>(input2_ptr) - 3 - 3 * input_stride;
 927     const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
 928     const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
 929     const int32_t *gx_ptr_2 = gx_ptr_1 + 4;
 930     const int32_t *gy_ptr_2 = gy_ptr_1 + 4;
 931     const auto     output   = static_cast<float *__restrict>(output_ptr);
 932
 933     /* Gx^2, Gy^2 and Gx*Gy */
 934     float32x4_t gx2             = vdupq_n_f32(0.0f);
 935     float32x4_t gy2             = vdupq_n_f32(0.0f);
 936     float32x4_t gxgy            = vdupq_n_f32(0.0f);
 937     float32x4_t sensitivity     = vdupq_n_f32(in_sensitivity);
 938     float32x4_t norm_factor     = vdupq_n_f32(in_norm_factor);
 939     float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
 940
 941     for(int i = 0; i < 7; ++i)
 942     {
 943         const float32x4_t low_gx   = vcvtq_f32_s32(vld1q_s32(gx_ptr_0));
 944         const float32x4_t low_gy   = vcvtq_f32_s32(vld1q_s32(gy_ptr_0));
 945         const float32x4_t high_gx  = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
 946         const float32x4_t high_gy  = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
 947         const float32x4_t high_gx1 = vcvtq_f32_s32(vld1q_s32(gx_ptr_2));
 948         const float32x4_t high_gy1 = vcvtq_f32_s32(vld1q_s32(gy_ptr_2));
 949         harris_score1x7_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, high_gx1, high_gy1, gx2, gy2, gxgy, norm_factor);
 950
 951         /* Update gx and gy pointer */
 952         gx_ptr_0 += input_stride;
 953         gy_ptr_0 += input_stride;
 954         gx_ptr_1 += input_stride;
 955         gy_ptr_1 += input_stride;
 956         gx_ptr_2 += input_stride;
 957         gy_ptr_2 += input_stride;
 958     }
 959
 960     /* Calculate harris score */
 961     const float32x4_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
 962
 963     /* Store score */
 964     vst1q_f32(output, mc);
 965 }
 966
 967 } // namespace
 968
 969 INEHarrisScoreKernel::INEHarrisScoreKernel()
 970     : _input1(nullptr), _input2(nullptr), _output(nullptr), _sensitivity(0.0f), _strength_thresh(0.0f), _norm_factor(0.0f)
 971 {
 972 }
 973
 974 template <int32_t block_size>
 975 NEHarrisScoreKernel<block_size>::NEHarrisScoreKernel()
 976     : INEHarrisScoreKernel(), _func(nullptr)
 977 {
 978 }
 979
 980 template <int32_t block_size>
 981 void NEHarrisScoreKernel<block_size>::run(const Window &window)
 982 {
 983     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
 984     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 985     ARM_COMPUTE_ERROR_ON(_func == nullptr);
 986
 987     Iterator input1(_input1, window);
 988     Iterator input2(_input2, window);
 989     Iterator output(_output, window);
 990
 991     const size_t input_stride = _input1->info()->strides_in_bytes()[1] / element_size_from_data_type(_input1->info()->data_type());
 992
 993     execute_window_loop(window, [&](const Coordinates & id)
 994     {
 995         (*_func)(input1.ptr(), input2.ptr(), output.ptr(), input_stride, _norm_factor, _sensitivity, _strength_thresh);
 996     },
 997     input1, input2, output);
 998 }
 999
1000 template <int32_t block_size>
1001 BorderSize        NEHarrisScoreKernel<block_size>::border_size() const
1002 {
1003     return BorderSize(block_size / 2);
1004 }
1005
1006 template <int32_t block_size>
1007 void NEHarrisScoreKernel<block_size>::configure(const IImage *input1, const IImage *input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity,
1008                                                 bool border_undefined)
1009 {
1010     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input1);
1011     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input2);
1012     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
1013     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::S16, DataType::S32);
1014     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S16, DataType::S32);
1015     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
1016     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
1017     ARM_COMPUTE_ERROR_ON(0.0f == norm_factor);
1018
1019     _input1          = input1;
1020     _input2          = input2;
1021     _output          = output;
1022     _sensitivity     = sensitivity;
1023     _strength_thresh = strength_thresh;
1024     _norm_factor     = norm_factor;
1025
1026     if(input1->info()->data_type() == DataType::S16)
1027     {
1028         switch(block_size)
1029         {
1030             case 3:
1031                 _func = &harris_score3x3_S16_S16_FLOAT;
1032                 break;
1033             case 5:
1034                 _func = &harris_score5x5_S16_S16_FLOAT;
1035                 break;
1036             case 7:
1037                 _func = &harris_score7x7_S16_S16_FLOAT;
1038                 break;
1039             default:
1040                 ARM_COMPUTE_ERROR("Invalid block size");
1041                 break;
1042         }
1043     }
1044     else
1045     {
1046         switch(block_size)
1047         {
1048             case 3:
1049                 _func = &harris_score3x3_S32_S32_FLOAT;
1050                 break;
1051             case 5:
1052                 _func = &harris_score5x5_S32_S32_FLOAT;
1053                 break;
1054             case 7:
1055                 _func = &harris_score7x7_S32_S32_FLOAT;
1056                 break;
1057             default:
1058                 ARM_COMPUTE_ERROR("Invalid block size");
1059                 break;
1060         }
1061     }
1062
1063     ARM_COMPUTE_ERROR_ON(nullptr == _func);
1064
1065     unsigned int processed_elements = 0;
1066
1067     if(block_size != 7)
1068     {
1069         processed_elements = 8;
1070     }
1071     else
1072     {
1073         processed_elements = 4;
1074     }
1075
1076     // Configure kernel window
1077     Window                  win = calculate_max_window(*input1->info(), Steps(processed_elements), border_undefined, border_size());
1078     AccessWindowAutoPadding output_access(output->info());
1079
1080     update_window_and_padding(win,
1081                               AccessWindowAutoPadding(input1->info()),
1082                               AccessWindowAutoPadding(input2->info()),
1083                               output_access);
1084
1085     output_access.set_valid_region();
1086
1087     INEKernel::configure(win);
1088 }