src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp

   1 /*
   2  * Copyright (c) 2016, 2017 ARM Limited.
   3  *
   4  * SPDX-License-Identifier: MIT
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to
   8  * deal in the Software without restriction, including without limitation the
   9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  10  * sell copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in all
  14  * copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24 #include "arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h"
  25
  26 #include "arm_compute/core/Error.h"
  27 #include "arm_compute/core/Helpers.h"
  28 #include "arm_compute/core/ITensor.h"
  29 #include "arm_compute/core/TensorInfo.h"
  30 #include "arm_compute/core/Types.h"
  31 #include "arm_compute/core/Utils.h"
  32 #include "arm_compute/core/Validate.h"
  33
  34 #include <arm_neon.h>
  35 #include <cstddef>
  36
  37 using namespace arm_compute;
  38
  39 namespace arm_compute
  40 {
  41 class Coordinates;
  42 } // namespace arm_compute
  43
  44 #ifdef ARM_COMPUTE_ENABLE_FP16
  45 namespace fp16
  46 {
  47 inline void mask_top(const float16x8_t &vc, const float16x8_t &in0, const float16x8_t &in1, uint16x8_t &mask)
  48 {
  49     // vc > nc.val[0], vc > nc.val[1], vc > nc.val[2]
  50     mask = vandq_u16(mask, vcgeq_f16(vc, in0));
  51     mask = vandq_u16(mask, vcgeq_f16(vc, vextq_f16(in0, in1, 1)));
  52     mask = vandq_u16(mask, vcgeq_f16(vc, vextq_f16(in0, in1, 2)));
  53 }
  54
  55 inline void mask_middle(const float16x8_t &vc, const float16x8_t &in0, const float16x8_t &in1, uint16x8_t &mask)
  56 {
  57     // vc >= nc.val[0], vc > nc.val[2]
  58     mask = vandq_u16(mask, vcgeq_f16(vc, in0));
  59     mask = vandq_u16(mask, vcgtq_f16(vc, vextq_f16(in0, in1, 2)));
  60 }
  61
  62 inline void mask_bottom(const float16x8_t &vc, const float16x8_t &in0, const float16x8_t &in1, uint16x8_t &mask)
  63 {
  64     // vc > nc.val[0], vc > nc.val[1], vc > nc.val[2]
  65     mask = vandq_u16(mask, vcgtq_f16(vc, in0));
  66     mask = vandq_u16(mask, vcgtq_f16(vc, vextq_f16(in0, in1, 1)));
  67     mask = vandq_u16(mask, vcgtq_f16(vc, vextq_f16(in0, in1, 2)));
  68 }
  69
  70 inline void non_maxima_suppression3x3_F32_F32(const void *__restrict in_ptr, void *__restrict out_ptr, const uint32_t in_stride)
  71 {
  72     auto       in  = static_cast<const float *__restrict>(in_ptr) - 1;
  73     const auto out = static_cast<float *__restrict>(out_ptr);
  74
  75     // Get centre scores
  76     const float16x8x2_t vc =
  77     {
  78         vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 1)), vcvt_f16_f32(vld1q_f32(in + 5))),
  79         vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 9)), vcvt_f16_f32(vld1q_f32(in + 13)))
  80     };
  81
  82     // Neighboring pixels
  83     in -= in_stride;
  84
  85     static const float16x4_t  zero_f16x4 = vdup_n_f16(0);
  86     static const uint16x8_t   zero_u16   = vdupq_n_u16(0);
  87     static const uint16x8_t   true_mask  = vceqq_u16(zero_u16, zero_u16);
  88     static const uint16x8x2_t true_mask_x2 =
  89     {
  90         true_mask,
  91         true_mask
  92     };
  93
  94     uint16x8x2_t mask = true_mask_x2;
  95
  96     // Top row
  97     const float16x8_t tmp_top0 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in)), vcvt_f16_f32(vld1q_f32(in + 4)));
  98     const float16x8_t tmp_top1 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 8)), vcvt_f16_f32(vld1q_f32(in + 12)));
  99     const float16x8_t tmp_top2 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 16)), zero_f16x4);
 100
 101     // vc >= nc.val[0], vc >= nc.val[1], vc >= nc.val[2]
 102     mask_top(vc.val[0], tmp_top0, tmp_top1, mask.val[0]);
 103     mask_top(vc.val[1], tmp_top1, tmp_top2, mask.val[1]);
 104
 105     in += in_stride;
 106
 107     // Middle row
 108     const float16x8_t tmp_mid0 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in)), vcvt_f16_f32(vld1q_f32(in + 4)));
 109     const float16x8_t tmp_mid1 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 8)), vcvt_f16_f32(vld1q_f32(in + 12)));
 110     const float16x8_t tmp_mid2 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 16)), zero_f16x4);
 111
 112     // vc >= nc.val[0], vc > nc.val[2]
 113     mask_middle(vc.val[0], tmp_mid0, tmp_mid1, mask.val[0]);
 114     mask_middle(vc.val[1], tmp_mid1, tmp_mid2, mask.val[1]);
 115
 116     in += in_stride;
 117
 118     // Bottom row
 119     const float16x8_t tmp_bot0 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in)), vcvt_f16_f32(vld1q_f32(in + 4)));
 120     const float16x8_t tmp_bot1 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 8)), vcvt_f16_f32(vld1q_f32(in + 12)));
 121     const float16x8_t tmp_bot2 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 16)), zero_f16x4);
 122
 123     // vc > nc.val[0], vc > nc.val[1], vc > nc.val[2]
 124     mask_bottom(vc.val[0], tmp_bot0, tmp_bot1, mask.val[0]);
 125     mask_bottom(vc.val[1], tmp_bot1, tmp_bot2, mask.val[1]);
 126
 127     // Store
 128     static const float16x8_t zero_f16x8 = vdupq_n_f16(0);
 129
 130     const float16x8_t suppressed0 = vbslq_f16(mask.val[0], vc.val[0], zero_f16x8);
 131     vst1q_f32(out + 0, vcvt_f32_f16(vget_low_f16(suppressed0)));
 132     vst1q_f32(out + 4, vcvt_f32_f16(vget_high_f16(suppressed0)));
 133
 134     const float16x8_t suppressed1 = vbslq_f16(mask.val[1], vc.val[1], zero_f16x8);
 135     vst1q_f32(out + 8, vcvt_f32_f16(vget_low_f16(suppressed1)));
 136     vst1q_f32(out + 12, vcvt_f32_f16(vget_high_f16(suppressed1)));
 137 }
 138
 139 inline void non_maxima_suppression3x3_U8_U8(const void *__restrict in_ptr, void *__restrict out_ptr, const uint32_t in_stride)
 140 {
 141     auto       in  = static_cast<const uint8_t *__restrict>(in_ptr) - 1;
 142     const auto out = static_cast<uint8_t *__restrict>(out_ptr);
 143
 144     // Get centre scores
 145     const uint8x16_t vc = vld1q_u8(in + 1);
 146
 147     // Neighboring pixels
 148     in -= in_stride;
 149
 150     // Top row
 151     const uint8x16_t l_nc_0 = vld1q_u8(in);
 152     const uint8x16_t m_nc_0 = vld1q_u8(in + 1);
 153     const uint8x16_t r_nc_0 = vld1q_u8(in + 2);
 154
 155     // Keep center scores if ...
 156     // vc >= l_nc_0, vc >= m_nc_0, vc >= r_nc_0
 157     uint8x16_t mask = vcgeq_u8(vc, l_nc_0);
 158     mask            = vandq_u8(mask, vcgeq_u8(vc, m_nc_0));
 159     mask            = vandq_u8(mask, vcgeq_u8(vc, r_nc_0));
 160
 161     in += in_stride;
 162
 163     // Middle row
 164     const uint8x16_t l_nc_1 = vld1q_u8(in);
 165     const uint8x16_t r_nc_1 = vld1q_u8(in + 2);
 166
 167     // ... and ...
 168     // vc >= l_nc_1, vc > r_nc_1
 169     mask = vandq_u8(mask, vcgeq_u8(vc, l_nc_1));
 170     mask = vandq_u8(mask, vcgtq_u8(vc, r_nc_1));
 171
 172     in += in_stride;
 173
 174     // Bottom row
 175     const uint8x16_t l_nc_2 = vld1q_u8(in);
 176     const uint8x16_t m_nc_2 = vld1q_u8(in + 1);
 177     const uint8x16_t r_nc_2 = vld1q_u8(in + 2);
 178
 179     // ... and ...
 180     // vc > l_nc_2, vc > m_nc_2, vc > r_nc_2
 181     mask = vandq_u8(mask, vcgtq_u8(vc, l_nc_2));
 182     mask = vandq_u8(mask, vcgtq_u8(vc, m_nc_2));
 183     mask = vandq_u8(mask, vcgtq_u8(vc, r_nc_2));
 184
 185     // Store
 186     static const uint8x16_t zero = vdupq_n_u8(0);
 187     vst1q_u8(out, vbslq_u8(mask, vc, zero));
 188 }
 189 } // namespace fp16
 190
 191 void NENonMaximaSuppression3x3FP16Kernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
 192 {
 193     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::F32);
 194     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::F32);
 195     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
 196
 197     _input  = input;
 198     _output = output;
 199
 200     switch(input->info()->data_type())
 201     {
 202         case DataType::U8:
 203             _func = &fp16::non_maxima_suppression3x3_U8_U8;
 204             break;
 205         default:
 206             _func = &fp16::non_maxima_suppression3x3_F32_F32;
 207             break;
 208     }
 209
 210     constexpr unsigned int num_elems_processed_per_iteration = 16;
 211     const unsigned int     num_elems_read_per_iteration      = 16 + 2 * border_size().left + (input->info()->data_type() == DataType::U8 ? 0 : 3);
 212     constexpr unsigned int num_elems_written_per_iteration   = 16;
 213     constexpr unsigned int num_rows_read_per_iteration       = 3;
 214
 215     // Configure kernel window
 216     Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
 217     AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
 218
 219     update_window_and_padding(win,
 220                               AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
 221                               output_access);
 222
 223     output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
 224
 225     INEKernel::configure(win);
 226 }
 227 #endif
 228
 229 namespace
 230 {
 231 inline void non_maxima_suppression3x3_FLOAT_FLOAT(const void *__restrict input_ptr, void *__restrict output_ptr, const uint32_t input_stride)
 232 {
 233     auto       input  = static_cast<const float *__restrict>(input_ptr) - 1;
 234     const auto output = static_cast<float *__restrict>(output_ptr);
 235
 236     // Get centre scores
 237     const float32x4x4_t vc =
 238     {
 239         {
 240             vld1q_f32(input + 1),
 241             vld1q_f32(input + 5),
 242             vld1q_f32(input + 9),
 243             vld1q_f32(input + 13)
 244         }
 245     };
 246
 247     // Neighboring pixels
 248     float32x4x4_t l_nc{ {} };
 249     float32x4x4_t m_nc{ {} };
 250     float32x4x4_t r_nc{ {} };
 251
 252     input -= input_stride;
 253
 254     // Row0 - Low part
 255     float32x4_t tmp_low   = vld1q_f32(input);
 256     float32x4_t tmp_high  = vld1q_f32(input + 4);
 257     float32x4_t tmp_high1 = vld1q_f32(input + 8);
 258
 259     l_nc.val[0] = tmp_low;
 260     m_nc.val[0] = vextq_f32(tmp_low, tmp_high, 1);
 261     r_nc.val[0] = vextq_f32(tmp_low, tmp_high, 2);
 262
 263     tmp_low  = tmp_high;
 264     tmp_high = tmp_high1;
 265
 266     l_nc.val[1] = tmp_low;
 267     m_nc.val[1] = vextq_f32(tmp_low, tmp_high, 1);
 268     r_nc.val[1] = vextq_f32(tmp_low, tmp_high, 2);
 269
 270     // Row0 - High part
 271     tmp_low   = tmp_high1;
 272     tmp_high  = vld1q_f32(input + 12);
 273     tmp_high1 = vld1q_f32(input + 16);
 274
 275     l_nc.val[2] = tmp_low;
 276     m_nc.val[2] = vextq_f32(tmp_low, tmp_high, 1);
 277     r_nc.val[2] = vextq_f32(tmp_low, tmp_high, 2);
 278
 279     tmp_low  = tmp_high;
 280     tmp_high = tmp_high1;
 281
 282     l_nc.val[3] = tmp_low;
 283     m_nc.val[3] = vextq_f32(tmp_low, tmp_high, 1);
 284     r_nc.val[3] = vextq_f32(tmp_low, tmp_high, 2);
 285
 286     // mc >= nc.val[0], mc >= nc.val[1], mc >= nc.val[2]
 287     uint32x4x4_t mask{ {} };
 288     mask.val[0] = vcgeq_f32(vc.val[0], l_nc.val[0]);
 289     mask.val[0] = vandq_u32(mask.val[0], vcgeq_f32(vc.val[0], m_nc.val[0]));
 290     mask.val[0] = vandq_u32(mask.val[0], vcgeq_f32(vc.val[0], r_nc.val[0]));
 291     mask.val[1] = vcgeq_f32(vc.val[1], l_nc.val[1]);
 292     mask.val[1] = vandq_u32(mask.val[1], vcgeq_f32(vc.val[1], m_nc.val[1]));
 293     mask.val[1] = vandq_u32(mask.val[1], vcgeq_f32(vc.val[1], r_nc.val[1]));
 294     mask.val[2] = vcgeq_f32(vc.val[2], l_nc.val[2]);
 295     mask.val[2] = vandq_u32(mask.val[2], vcgeq_f32(vc.val[2], m_nc.val[2]));
 296     mask.val[2] = vandq_u32(mask.val[2], vcgeq_f32(vc.val[2], r_nc.val[2]));
 297     mask.val[3] = vcgeq_f32(vc.val[3], l_nc.val[3]);
 298     mask.val[3] = vandq_u32(mask.val[3], vcgeq_f32(vc.val[3], m_nc.val[3]));
 299     mask.val[3] = vandq_u32(mask.val[3], vcgeq_f32(vc.val[3], r_nc.val[3]));
 300
 301     input += input_stride;
 302
 303     // Row1 - Low part
 304     tmp_low   = vld1q_f32(input);
 305     tmp_high  = vld1q_f32(input + 4);
 306     tmp_high1 = vld1q_f32(input + 8);
 307
 308     l_nc.val[0] = tmp_low;
 309     r_nc.val[0] = vextq_f32(tmp_low, tmp_high, 2);
 310
 311     tmp_low  = tmp_high;
 312     tmp_high = tmp_high1;
 313
 314     l_nc.val[1] = tmp_low;
 315     r_nc.val[1] = vextq_f32(tmp_low, tmp_high, 2);
 316
 317     // Row1 - High part
 318     tmp_low   = tmp_high1;
 319     tmp_high  = vld1q_f32(input + 12);
 320     tmp_high1 = vld1q_f32(input + 16);
 321
 322     l_nc.val[2] = tmp_low;
 323     r_nc.val[2] = vextq_f32(tmp_low, tmp_high, 2);
 324
 325     tmp_low  = tmp_high;
 326     tmp_high = tmp_high1;
 327
 328     l_nc.val[3] = tmp_low;
 329     r_nc.val[3] = vextq_f32(tmp_low, tmp_high, 2);
 330
 331     // mc >= nc.val[0], mc > nc.val[2]
 332     mask.val[0] = vandq_u32(mask.val[0], vcgeq_f32(vc.val[0], l_nc.val[0]));
 333     mask.val[0] = vandq_u32(mask.val[0], vcgtq_f32(vc.val[0], r_nc.val[0]));
 334     mask.val[1] = vandq_u32(mask.val[1], vcgeq_f32(vc.val[1], l_nc.val[1]));
 335     mask.val[1] = vandq_u32(mask.val[1], vcgtq_f32(vc.val[1], r_nc.val[1]));
 336     mask.val[2] = vandq_u32(mask.val[2], vcgeq_f32(vc.val[2], l_nc.val[2]));
 337     mask.val[2] = vandq_u32(mask.val[2], vcgtq_f32(vc.val[2], r_nc.val[2]));
 338     mask.val[3] = vandq_u32(mask.val[3], vcgeq_f32(vc.val[3], l_nc.val[3]));
 339     mask.val[3] = vandq_u32(mask.val[3], vcgtq_f32(vc.val[3], r_nc.val[3]));
 340
 341     input += input_stride;
 342
 343     // Row2 - Low part
 344     tmp_low   = vld1q_f32(input);
 345     tmp_high  = vld1q_f32(input + 4);
 346     tmp_high1 = vld1q_f32(input + 8);
 347
 348     l_nc.val[0] = tmp_low;
 349     m_nc.val[0] = vextq_f32(tmp_low, tmp_high, 1);
 350     r_nc.val[0] = vextq_f32(tmp_low, tmp_high, 2);
 351
 352     tmp_low  = tmp_high;
 353     tmp_high = tmp_high1;
 354
 355     l_nc.val[1] = tmp_low;
 356     m_nc.val[1] = vextq_f32(tmp_low, tmp_high, 1);
 357     r_nc.val[1] = vextq_f32(tmp_low, tmp_high, 2);
 358
 359     // Row2 - High part
 360     tmp_low   = tmp_high1;
 361     tmp_high  = vld1q_f32(input + 12);
 362     tmp_high1 = vld1q_f32(input + 16);
 363
 364     l_nc.val[2] = tmp_low;
 365     m_nc.val[2] = vextq_f32(tmp_low, tmp_high, 1);
 366     r_nc.val[2] = vextq_f32(tmp_low, tmp_high, 2);
 367
 368     tmp_low  = tmp_high;
 369     tmp_high = tmp_high1;
 370
 371     l_nc.val[3] = tmp_low;
 372     m_nc.val[3] = vextq_f32(tmp_low, tmp_high, 1);
 373     r_nc.val[3] = vextq_f32(tmp_low, tmp_high, 2);
 374
 375     // mc > nc.val[0], mc > nc.val[1], mc > nc.val[2]
 376     mask.val[0] = vandq_u32(mask.val[0], vcgtq_f32(vc.val[0], l_nc.val[0]));
 377     mask.val[0] = vandq_u32(mask.val[0], vcgtq_f32(vc.val[0], m_nc.val[0]));
 378     mask.val[0] = vandq_u32(mask.val[0], vcgtq_f32(vc.val[0], r_nc.val[0]));
 379     mask.val[1] = vandq_u32(mask.val[1], vcgtq_f32(vc.val[1], l_nc.val[1]));
 380     mask.val[1] = vandq_u32(mask.val[1], vcgtq_f32(vc.val[1], m_nc.val[1]));
 381     mask.val[1] = vandq_u32(mask.val[1], vcgtq_f32(vc.val[1], r_nc.val[1]));
 382     mask.val[2] = vandq_u32(mask.val[2], vcgtq_f32(vc.val[2], l_nc.val[2]));
 383     mask.val[2] = vandq_u32(mask.val[2], vcgtq_f32(vc.val[2], m_nc.val[2]));
 384     mask.val[2] = vandq_u32(mask.val[2], vcgtq_f32(vc.val[2], r_nc.val[2]));
 385     mask.val[3] = vandq_u32(mask.val[3], vcgtq_f32(vc.val[3], l_nc.val[3]));
 386     mask.val[3] = vandq_u32(mask.val[3], vcgtq_f32(vc.val[3], m_nc.val[3]));
 387     mask.val[3] = vandq_u32(mask.val[3], vcgtq_f32(vc.val[3], r_nc.val[3]));
 388
 389     static const float32x4_t zero = vdupq_n_f32(0.f);
 390
 391     // Store
 392     vst1q_f32(output + 0, vbslq_f32(mask.val[0], vc.val[0], zero));
 393     vst1q_f32(output + 4, vbslq_f32(mask.val[1], vc.val[1], zero));
 394     vst1q_f32(output + 8, vbslq_f32(mask.val[2], vc.val[2], zero));
 395     vst1q_f32(output + 12, vbslq_f32(mask.val[3], vc.val[3], zero));
 396 }
 397
 398 inline void non_maxima_suppression3x3_U8_U8(const void *__restrict input_ptr, void *__restrict output_ptr, const uint32_t input_stride)
 399 {
 400     auto       input  = static_cast<const uint8_t *__restrict>(input_ptr) - 1;
 401     const auto output = static_cast<uint8_t *__restrict>(output_ptr);
 402
 403     // Get centre scores
 404     const uint8x16_t vc = vld1q_u8(input + 1);
 405
 406     // Neighboring pixels
 407     uint8x16_t l_nc{};
 408     uint8x16_t m_nc{};
 409     uint8x16_t r_nc{};
 410
 411     input -= input_stride;
 412
 413     // Row0
 414     l_nc = vld1q_u8(input);
 415     m_nc = vld1q_u8(input + 1);
 416     r_nc = vld1q_u8(input + 2);
 417
 418     // mc >= l_nc, mc >= m_nc, mc >= r_nc
 419     uint8x16_t mask = vcgeq_u8(vc, l_nc);
 420     mask            = vandq_u8(mask, vcgeq_u8(vc, m_nc));
 421     mask            = vandq_u8(mask, vcgeq_u8(vc, r_nc));
 422
 423     input += input_stride;
 424
 425     // Row1
 426     l_nc = vld1q_u8(input);
 427     r_nc = vld1q_u8(input + 2);
 428
 429     // mc >= l_nc, mc > r_nc
 430     mask = vandq_u8(mask, vcgeq_u8(vc, l_nc));
 431     mask = vandq_u8(mask, vcgtq_u8(vc, r_nc));
 432
 433     input += input_stride;
 434
 435     // Row2
 436     l_nc = vld1q_u8(input);
 437     m_nc = vld1q_u8(input + 1);
 438     r_nc = vld1q_u8(input + 2);
 439
 440     // mc > l_nc, mc > m_nc, mc > r_nc
 441     mask = vandq_u8(mask, vcgtq_u8(vc, l_nc));
 442     mask = vandq_u8(mask, vcgtq_u8(vc, m_nc));
 443     mask = vandq_u8(mask, vcgtq_u8(vc, r_nc));
 444
 445     static const uint8x16_t zero = vdupq_n_u8(0);
 446
 447     // Store
 448     vst1q_u8(output, vbslq_u8(mask, vc, zero));
 449 }
 450 } // namespace
 451
 452 NENonMaximaSuppression3x3Kernel::NENonMaximaSuppression3x3Kernel()
 453     : _func(nullptr), _input(nullptr), _output(nullptr)
 454 {
 455 }
 456
 457 BorderSize NENonMaximaSuppression3x3Kernel::border_size() const
 458 {
 459     return BorderSize(1);
 460 }
 461
 462 void NENonMaximaSuppression3x3Kernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
 463 {
 464     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::F32);
 465     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::F32);
 466     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
 467
 468     _input  = input;
 469     _output = output;
 470
 471     if(input->info()->data_type() == DataType::U8)
 472     {
 473         _func = &non_maxima_suppression3x3_U8_U8;
 474     }
 475     else
 476     {
 477         _func = &non_maxima_suppression3x3_FLOAT_FLOAT;
 478     }
 479
 480     constexpr unsigned int num_elems_processed_per_iteration = 16;
 481     const unsigned int     num_elems_read_per_iteration      = 16 + 2 * border_size().left + (input->info()->data_type() == DataType::U8 ? 0 : 3);
 482     constexpr unsigned int num_elems_written_per_iteration   = 16;
 483     constexpr unsigned int num_rows_read_per_iteration       = 3;
 484
 485     // Configure kernel window
 486     Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
 487     AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
 488
 489     update_window_and_padding(win,
 490                               AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
 491                               output_access);
 492
 493     output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
 494
 495     INEKernel::configure(win);
 496 }
 497
 498 void NENonMaximaSuppression3x3Kernel::run(const Window &window)
 499 {
 500     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
 501     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 502     ARM_COMPUTE_ERROR_ON(_func == nullptr);
 503     Iterator input(_input, window);
 504     Iterator output(_output, window);
 505
 506     const size_t input_stride = _input->info()->strides_in_bytes()[1] / element_size_from_data_type(_input->info()->data_type());
 507
 508     execute_window_loop(window, [&](const Coordinates & id)
 509     {
 510         _func(input.ptr(), output.ptr(), input_stride);
 511     },
 512     input, output);
 513 }