2 * Copyright (c) 2016, 2017 ARM Limited.
4 * SPDX-License-Identifier: MIT
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 #include "arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h"
26 #include "arm_compute/core/Coordinates.h"
27 #include "arm_compute/core/Error.h"
28 #include "arm_compute/core/Helpers.h"
29 #include "arm_compute/core/ITensor.h"
30 #include "arm_compute/core/TensorInfo.h"
31 #include "arm_compute/core/Validate.h"
39 using namespace arm_compute;
43 const uint8x16_t zero_u8 = vdupq_n_u8(0);
45 template <size_t columns>
46 inline uint8x8_t min_row(uint8x16_t row_data)
48 uint8x8_t min = vget_low_u8(row_data);
50 for(size_t c = 1; c < columns; ++c)
52 row_data = vextq_u8(row_data, zero_u8, 1);
53 min = vmin_u8(min, vget_low_u8(row_data));
59 template <size_t columns>
60 inline uint8x8_t max_row(uint8x16_t row_data)
62 uint8x8_t max = vget_low_u8(row_data);
64 for(size_t c = 1; c < columns; ++c)
66 row_data = vextq_u8(row_data, zero_u8, 1);
67 max = vmax_u8(max, vget_low_u8(row_data));
73 inline void sort(uint8x8_t &a, uint8x8_t &b)
75 const uint8x8_t min = vmin_u8(a, b);
76 const uint8x8_t max = vmax_u8(a, b);
81 // Sorting networks below were generated using http://pages.ripco.net/~jgamble/nw.html
82 // Calculations that do not affect the median were removed.
83 inline void sort5(uint8x8_t &p0, uint8x8_t &p1, uint8x8_t &p2, uint8x8_t &p3, uint8x8_t &p4)
95 inline void sort9(uint8x8_t &p0, uint8x8_t &p1, uint8x8_t &p2,
96 uint8x8_t &p3, uint8x8_t &p4, uint8x8_t &p5,
97 uint8x8_t &p6, uint8x8_t &p7, uint8x8_t &p8)
120 inline void sort21(uint8x8_t p[21])
225 inline void sort25(uint8x8_t p[25])
343 NENonLinearFilterKernel::NENonLinearFilterKernel()
344 : _border_width(0), _input(nullptr), _output(nullptr), _mask(nullptr), _pattern(MatrixPattern::BOX), _function(NonLinearFilterFunction::MIN), _func_idx(0), _border_size()
348 BorderSize NENonLinearFilterKernel::border_size() const
353 void NENonLinearFilterKernel::configure(const ITensor *input, ITensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
354 bool border_undefined)
356 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
357 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
358 ARM_COMPUTE_ERROR_ON(3 != mask_size && 5 != mask_size);
359 ARM_COMPUTE_ERROR_ON(MatrixPattern::OTHER == pattern && nullptr == mask);
361 // Set class variables
362 _border_size = BorderSize(mask_size / 2);
367 _function = function;
369 // Configure kernel window
370 const unsigned int num_elems_processed_per_iteration = (MatrixPattern::OTHER == pattern) ? 1 : 8;
371 constexpr unsigned int num_elems_read_per_iteration = 16;
373 Window win = calculate_max_window(*input->info(), num_elems_processed_per_iteration, border_undefined, border_size());
374 AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
375 update_window_and_padding(win,
376 AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, mask_size),
378 output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
380 INEKernel::configure(win);
382 // Define function index
383 _func_idx = (3 == mask_size) ? 0 : 1;
385 if(MatrixPattern::OTHER != pattern)
387 _func_idx = (_func_idx) * 3 + static_cast<unsigned int>(function);
391 void NENonLinearFilterKernel::fill_mask(uint8_t *mask, int cols, int rows, MatrixPattern pattern)
395 for(int r = 0; r < rows; ++r)
397 for(int c = 0; c < cols; ++c, ++v)
403 case MatrixPattern::BOX:
406 case MatrixPattern::CROSS:
407 val = ((r == (rows / 2)) || (c == (cols / 2))) ? 255 : 0;
409 case MatrixPattern::DISK:
410 val = (((r - rows / 2.0f + 0.5f) * (r - rows / 2.0f + 0.5f)) / ((rows / 2.0f) * (rows / 2.0f)) + ((c - cols / 2.0f + 0.5f) * (c - cols / 2.0f + 0.5f)) / ((cols / 2.0f) *
411 (cols / 2.0f))) <= 1.0f ? 255 : 0;
422 #ifndef DOXYGEN_SKIP_THIS /* Doxygen gets confused by the templates and can't match the implementation to the declaration */
423 namespace arm_compute
426 void NENonLinearFilterKernel::median_filter_box<3, 3>(const Window &win)
428 Iterator input(_input, win);
429 Iterator output(_output, win);
431 const auto input_top_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, -1)));
432 const auto input_mid_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, 0)));
433 const auto input_bot_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, 1)));
435 execute_window_loop(win, [&](const Coordinates & id)
437 const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
438 const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
439 const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
441 uint8x8_t p0 = vget_low_u8(top_data);
442 uint8x8_t p1 = vext_u8(vget_low_u8(top_data), vget_high_u8(top_data), 1);
443 uint8x8_t p2 = vext_u8(vget_low_u8(top_data), vget_high_u8(top_data), 2);
444 uint8x8_t p3 = vget_low_u8(mid_data);
445 uint8x8_t p4 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 1);
446 uint8x8_t p5 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 2);
447 uint8x8_t p6 = vget_low_u8(bot_data);
448 uint8x8_t p7 = vext_u8(vget_low_u8(bot_data), vget_high_u8(bot_data), 1);
449 uint8x8_t p8 = vext_u8(vget_low_u8(bot_data), vget_high_u8(bot_data), 2);
451 sort9(p0, p1, p2, p3, p4, p5, p6, p7, p8);
453 vst1_u8(output.ptr(), p4);
458 void NENonLinearFilterKernel::median_filter_box<5, 5>(const Window &win)
460 Iterator input(_input, win);
461 Iterator output(_output, win);
463 const auto input_top2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -2)));
464 const auto input_top_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -1)));
465 const auto input_mid_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 0)));
466 const auto input_bot_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 1)));
467 const auto input_bot2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 2)));
469 execute_window_loop(win, [&](const Coordinates & id)
471 const uint8x16_t top2_data = vld1q_u8(input_top2_ptr + input.offset());
472 const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
473 const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
474 const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
475 const uint8x16_t bot2_data = vld1q_u8(input_bot2_ptr + input.offset());
477 const uint8x8_t d[] =
479 vget_low_u8(top2_data),
480 vget_high_u8(top2_data),
481 vget_low_u8(top_data),
482 vget_high_u8(top_data),
483 vget_low_u8(mid_data),
484 vget_high_u8(mid_data),
485 vget_low_u8(bot_data),
486 vget_high_u8(bot_data),
487 vget_low_u8(bot2_data),
488 vget_high_u8(bot2_data)
492 for(unsigned int i = 0; i < 5; ++i)
494 const unsigned int idx_d = i * 2;
495 const unsigned int idx_p = i * 5;
498 p[idx_p + 1] = vext_u8(d[idx_d], d[idx_d + 1], 1);
499 p[idx_p + 2] = vext_u8(d[idx_d], d[idx_d + 1], 2);
500 p[idx_p + 3] = vext_u8(d[idx_d], d[idx_d + 1], 3);
501 p[idx_p + 4] = vext_u8(d[idx_d], d[idx_d + 1], 4);
506 vst1_u8(output.ptr(), p[12]);
510 } // namespace arm_compute
513 template <int mask_w, int mask_h>
514 void NENonLinearFilterKernel::min_filter_box(const Window &win)
516 static_assert(mask_w > 0, "Mask size must not be 0");
517 static_assert(mask_h > 0, "Mask size must not be 0");
519 Iterator input(_input, win);
520 Iterator output(_output, win);
522 const int k_row_half = mask_h / 2;
523 const int k_col_half = mask_w / 2;
526 std::array<const unsigned char *, mask_h> input_ptrs{ {} };
527 for(int i = -k_row_half; i <= k_row_half; ++i)
529 input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, i));
532 execute_window_loop(win, [&](const Coordinates & id)
535 uint8x16_t rows_min = vld1q_u8(input_ptrs[0] + input.offset());
537 for(unsigned int r = 1; r < mask_h; ++r)
539 const uint8x16_t data = vld1q_u8(input_ptrs[r] + input.offset());
540 rows_min = vminq_u8(rows_min, data);
543 const uint8x8_t out = min_row<mask_w>(rows_min);
545 // Store result as U8
546 vst1_u8(output.ptr(), out);
551 template <int mask_w, int mask_h>
552 void NENonLinearFilterKernel::max_filter_box(const Window &win)
554 static_assert(mask_w > 0, "Mask size must not be 0");
555 static_assert(mask_h > 0, "Mask size must not be 0");
556 ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
558 Iterator input(_input, win);
559 Iterator output(_output, win);
561 const int k_row_half = mask_h / 2;
562 const int k_col_half = mask_w / 2;
565 std::array<const unsigned char *, mask_h> input_ptrs{ {} };
566 for(int i = -k_row_half; i <= k_row_half; ++i)
568 input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, i));
571 execute_window_loop(win, [&](const Coordinates & id)
573 uint8x16_t rows_max = vld1q_u8(input_ptrs[0] + input.offset());
576 for(unsigned int r = 1; r < mask_h; ++r)
578 const uint8x16_t data = vld1q_u8(input_ptrs[r] + input.offset());
579 rows_max = vmaxq_u8(rows_max, data);
582 // Get max of columns
583 const uint8x8_t out = max_row<mask_w>(rows_max);
585 // Store result as U8
586 vst1_u8(output.ptr(), out);
591 #ifndef DOXYGEN_SKIP_THIS /* Doxygen gets confused by the templates and can't match the implementation to the declaration */
592 namespace arm_compute
595 void NENonLinearFilterKernel::median_filter_cross<3, 3>(const Window &win)
597 Iterator input(_input, win);
598 Iterator output(_output, win);
600 const auto input_top_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(0, -1)));
601 const auto input_mid_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, 0)));
602 const auto input_bot_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(0, 1)));
604 execute_window_loop(win, [&](const Coordinates & id)
606 const uint8x8_t top_data = vld1_u8(input_top_ptr + input.offset());
607 const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
608 const uint8x8_t bot_data = vld1_u8(input_bot_ptr + input.offset());
610 uint8x8_t p0 = top_data;
611 uint8x8_t p1 = vget_low_u8(mid_data);
612 uint8x8_t p2 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 1);
613 uint8x8_t p3 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 2);
614 uint8x8_t p4 = bot_data;
616 sort5(p0, p1, p2, p3, p4);
618 vst1_u8(output.ptr(), p2);
624 void NENonLinearFilterKernel::median_filter_cross<5, 5>(const Window &win)
626 Iterator input(_input, win);
627 Iterator output(_output, win);
629 const auto input_top2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(0, -2)));
630 const auto input_top_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(0, -1)));
631 const auto input_mid_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 0)));
632 const auto input_bot_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(0, 1)));
633 const auto input_bot2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(0, 2)));
635 execute_window_loop(win, [&](const Coordinates & id)
637 const uint8x8_t top2_data = vld1_u8(input_top2_ptr + input.offset());
638 const uint8x8_t top_data = vld1_u8(input_top_ptr + input.offset());
639 const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
640 const uint8x8_t bot_data = vld1_u8(input_bot_ptr + input.offset());
641 const uint8x8_t bot2_data = vld1_u8(input_bot2_ptr + input.offset());
643 uint8x8_t p0 = top2_data;
644 uint8x8_t p1 = top_data;
645 uint8x8_t p2 = vget_low_u8(mid_data);
646 uint8x8_t p3 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 1);
647 uint8x8_t p4 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 2);
648 uint8x8_t p5 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 3);
649 uint8x8_t p6 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 4);
650 uint8x8_t p7 = bot_data;
651 uint8x8_t p8 = bot2_data;
653 sort9(p0, p1, p2, p3, p4, p5, p6, p7, p8);
655 vst1_u8(output.ptr(), p4);
659 } // namespace arm_compute
662 template <int mask_w, int mask_h>
663 void NENonLinearFilterKernel::min_filter_cross(const Window &win)
665 static_assert(mask_w > 0, "Mask size must not be 0");
666 static_assert(mask_h > 0, "Mask size must not be 0");
667 ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
669 Iterator input(_input, win);
670 Iterator output(_output, win);
672 const int k_row_half = mask_h / 2;
673 const int k_col_half = mask_w / 2;
675 const unsigned char *mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, 0));
678 std::array<const unsigned char *, mask_h> input_ptrs{ {} };
679 for(int i = -k_row_half; i <= k_row_half; ++i)
681 input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(0, i));
684 execute_window_loop(win, [&](const Coordinates & id)
686 uint8x8_t rows_min = vld1_u8(input_ptrs[0] + input.offset());
689 for(unsigned int r = 1; r < mask_h; ++r)
691 const uint8x8_t data = vld1_u8(input_ptrs[r] + input.offset());
692 rows_min = vmin_u8(rows_min, data);
695 // Get min of middle row
696 const uint8x16_t data = vld1q_u8(mid_ptr + input.offset());
697 uint8x8_t out = min_row<mask_w>(data);
700 out = vmin_u8(out, rows_min);
702 // Store result as U8
703 vst1_u8(output.ptr(), out);
708 template <int mask_w, int mask_h>
709 void NENonLinearFilterKernel::max_filter_cross(const Window &win)
711 static_assert(mask_w > 0, "Mask size must not be 0");
712 static_assert(mask_h > 0, "Mask size must not be 0");
713 ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
715 Iterator input(_input, win);
716 Iterator output(_output, win);
718 const int k_row_half = mask_h / 2;
719 const int k_col_half = mask_w / 2;
721 const unsigned char *mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, 0));
724 std::array<unsigned char *, mask_h> input_ptrs{ {} };
725 for(int i = -k_row_half; i <= k_row_half; ++i)
727 input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(0, i));
730 execute_window_loop(win, [&](const Coordinates & id)
732 uint8x8_t rows_max = vld1_u8(input_ptrs[0] + input.offset());
735 for(unsigned int r = 1; r < mask_h; ++r)
737 const uint8x8_t data = vld1_u8(input_ptrs[r] + input.offset());
738 rows_max = vmax_u8(rows_max, data);
741 // Get max of middle row
742 const uint8x16_t data = vld1q_u8(mid_ptr + input.offset());
743 uint8x8_t out = max_row<mask_w>(data);
746 out = vmax_u8(out, rows_max);
748 // Store result as U8
749 vst1_u8(output.ptr(), out);
754 #ifndef DOXYGEN_SKIP_THIS /* Doxygen gets confused by the templates and can't match the implementation to the declaration */
755 namespace arm_compute
758 void NENonLinearFilterKernel::median_filter_disk<5, 5>(const Window &win)
760 Iterator input(_input, win);
761 Iterator output(_output, win);
763 const auto input_top2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, -2)));
764 const auto input_top_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -1)));
765 const auto input_mid_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 0)));
766 const auto input_bot_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 1)));
767 const auto input_bot2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, 2)));
769 execute_window_loop(win, [&](const Coordinates & id)
771 const uint8x16_t top2_data = vld1q_u8(input_top2_ptr + input.offset());
772 const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
773 const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
774 const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
775 const uint8x16_t bot2_data = vld1q_u8(input_bot2_ptr + input.offset());
779 vget_low_u8(top2_data),
780 vget_high_u8(top2_data),
781 vget_low_u8(top_data),
782 vget_high_u8(top_data),
783 vget_low_u8(mid_data),
784 vget_high_u8(mid_data),
785 vget_low_u8(bot_data),
786 vget_high_u8(bot_data),
787 vget_low_u8(bot2_data),
788 vget_high_u8(bot2_data)
793 p[1] = vext_u8(d[0], d[1], 1);
794 p[2] = vext_u8(d[0], d[1], 2);
796 p[19] = vext_u8(d[8], d[9], 1);
797 p[20] = vext_u8(d[8], d[9], 2);
799 for(unsigned int i = 0; i < 3; ++i)
801 const unsigned int idx_d = 2 + i * 2;
802 const unsigned int idx_p = 3 + i * 5;
805 p[idx_p + 1] = vext_u8(d[idx_d], d[idx_d + 1], 1);
806 p[idx_p + 2] = vext_u8(d[idx_d], d[idx_d + 1], 2);
807 p[idx_p + 3] = vext_u8(d[idx_d], d[idx_d + 1], 3);
808 p[idx_p + 4] = vext_u8(d[idx_d], d[idx_d + 1], 4);
813 vst1_u8(output.ptr(), p[10]);
819 void NENonLinearFilterKernel::min_filter_disk<5, 5>(const Window &win)
821 Iterator input(_input, win);
822 Iterator output(_output, win);
824 const auto input_top2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, -2)));
825 const auto input_top_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -1)));
826 const auto input_mid_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 0)));
827 const auto input_bot_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 1)));
828 const auto input_bot2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, 2)));
830 execute_window_loop(win, [&](const Coordinates & id)
832 const uint8x16_t top2_data = vld1q_u8(input_top2_ptr + input.offset());
833 const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
834 const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
835 const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
836 const uint8x16_t bot2_data = vld1q_u8(input_bot2_ptr + input.offset());
838 const uint8x16_t rows_min_3 = vminq_u8(top2_data, bot2_data);
839 uint8x16_t rows_min_5 = vminq_u8(top_data, bot_data);
840 rows_min_5 = vminq_u8(rows_min_5, mid_data);
842 const uint8x8_t out_3 = min_row<3>(rows_min_3);
843 const uint8x8_t out_5 = min_row<5>(rows_min_5);
845 vst1_u8(output.ptr(), vmin_u8(out_3, out_5));
851 void NENonLinearFilterKernel::max_filter_disk<5, 5>(const Window &win)
853 Iterator input(_input, win);
854 Iterator output(_output, win);
856 const auto input_top2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, -2)));
857 const auto input_top_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -1)));
858 const auto input_mid_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 0)));
859 const auto input_bot_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 1)));
860 const auto input_bot2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, 2)));
862 execute_window_loop(win, [&](const Coordinates & id)
864 const uint8x16_t top2_data = vld1q_u8(input_top2_ptr + input.offset());
865 const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
866 const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
867 const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
868 const uint8x16_t bot2_data = vld1q_u8(input_bot2_ptr + input.offset());
870 const uint8x16_t rows_max_3 = vmaxq_u8(top2_data, bot2_data);
871 uint8x16_t rows_max_5 = vmaxq_u8(top_data, bot_data);
872 rows_max_5 = vmaxq_u8(rows_max_5, mid_data);
874 const uint8x8_t out_3 = max_row<3>(rows_max_3);
875 const uint8x8_t out_5 = max_row<5>(rows_max_5);
877 vst1_u8(output.ptr(), vmax_u8(out_3, out_5));
881 } // namespace arm_compute
884 template <int mask_w, int mask_h>
885 void NENonLinearFilterKernel::non_linear_filter_generic(const Window &win)
887 Iterator input(_input, win);
888 Iterator output(_output, win);
889 ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
891 const int k_row_half = mask_h / 2;
892 const int k_col_half = mask_w / 2;
893 constexpr int mask_size = mask_w * mask_h;
896 std::array<unsigned char *, mask_h> input_ptrs{ {} };
897 for(int i = -k_row_half; i <= k_row_half; ++i)
899 input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, i));
902 execute_window_loop(win, [&](const Coordinates & id)
904 std::array<uint8_t, mask_size> vals{ {} };
909 for(unsigned int r = 0; r < mask_h; ++r)
911 const auto in_ptr = static_cast<const uint8_t *>(input_ptrs[r] + input.offset());
913 for(unsigned int c = 0; c < mask_w; ++c, ++m)
923 // Only do something if there is at least one non-zero element in the
927 std::sort(vals.begin(), vals.begin() + v);
931 case NonLinearFilterFunction::MIN:
932 *output.ptr() = vals[0];
934 case NonLinearFilterFunction::MAX:
935 *output.ptr() = vals[v - 1];
937 case NonLinearFilterFunction::MEDIAN:
938 *output.ptr() = vals[v / 2];
948 void NENonLinearFilterKernel::run(const Window &window)
950 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
951 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
953 using NonLinearFilterFunction = void (NENonLinearFilterKernel::*)(const Window & window);
955 // Function table for BOX pattern
956 static const std::array<NonLinearFilterFunction, 6> func_table_box =
959 &NENonLinearFilterKernel::median_filter_box<3, 3>,
960 &NENonLinearFilterKernel::min_filter_box<3, 3>,
961 &NENonLinearFilterKernel::max_filter_box<3, 3>,
962 &NENonLinearFilterKernel::median_filter_box<5, 5>,
963 &NENonLinearFilterKernel::min_filter_box<5, 5>,
964 &NENonLinearFilterKernel::max_filter_box<5, 5>,
968 // Function table for CROSS pattern
969 static const std::array<NonLinearFilterFunction, 6> func_table_cross =
972 &NENonLinearFilterKernel::median_filter_cross<3, 3>,
973 &NENonLinearFilterKernel::min_filter_cross<3, 3>,
974 &NENonLinearFilterKernel::max_filter_cross<3, 3>,
975 &NENonLinearFilterKernel::median_filter_cross<5, 5>,
976 &NENonLinearFilterKernel::min_filter_cross<5, 5>,
977 &NENonLinearFilterKernel::max_filter_cross<5, 5>,
981 // Function table for DISK pattern
982 static const std::array<NonLinearFilterFunction, 6> func_table_disk =
985 &NENonLinearFilterKernel::median_filter_box<3, 3>,
986 &NENonLinearFilterKernel::min_filter_box<3, 3>,
987 &NENonLinearFilterKernel::max_filter_box<3, 3>,
988 &NENonLinearFilterKernel::median_filter_disk<5, 5>,
989 &NENonLinearFilterKernel::min_filter_disk<5, 5>,
990 &NENonLinearFilterKernel::max_filter_disk<5, 5>,
994 // Function table for OTHER pattern
995 static const std::array<NonLinearFilterFunction, 2> func_table_generic =
998 &NENonLinearFilterKernel::non_linear_filter_generic<3, 3>,
999 &NENonLinearFilterKernel::non_linear_filter_generic<5, 5>,
1005 case MatrixPattern::BOX:
1006 ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_box.size());
1007 (this->*func_table_box[_func_idx])(window);
1009 case MatrixPattern::CROSS:
1010 ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_cross.size());
1011 (this->*func_table_cross[_func_idx])(window);
1013 case MatrixPattern::DISK:
1014 ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_disk.size());
1015 (this->*func_table_disk[_func_idx])(window);
1017 case MatrixPattern::OTHER:
1019 ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_generic.size());
1020 (this->*func_table_generic[_func_idx])(window);