arm_compute v17.03.1
[platform/upstream/armcl.git] / src / core / NEON / kernels / NEGaussianPyramidKernel.cpp
1 /*
2  * Copyright (c) 2016, 2017 ARM Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #include "arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h"
25
26 #include "arm_compute/core/AccessWindowAutoPadding.h"
27 #include "arm_compute/core/Coordinates.h"
28 #include "arm_compute/core/Error.h"
29 #include "arm_compute/core/Helpers.h"
30 #include "arm_compute/core/ITensor.h"
31 #include "arm_compute/core/TensorInfo.h"
32 #include "arm_compute/core/Types.h"
33 #include "arm_compute/core/Validate.h"
34 #include "arm_compute/core/Window.h"
35
36 #include <arm_neon.h>
37 #include <cstddef>
38 #include <cstdint>
39 #include <tuple>
40
41 using namespace arm_compute;
42
43 NEGaussianPyramidHorKernel::NEGaussianPyramidHorKernel()
44     : _input(nullptr), _output(nullptr)
45 {
46 }
47
48 NEGaussianPyramidVertKernel::NEGaussianPyramidVertKernel()
49     : _input(nullptr), _output(nullptr)
50 {
51 }
52
53 void NEGaussianPyramidHorKernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
54 {
55     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
56     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16);
57     ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != 2 * output->info()->dimension(0));
58     ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != output->info()->dimension(1));
59
60     for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
61     {
62         ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i));
63     }
64
65     _input  = input;
66     _output = output;
67
68     const unsigned int processed_elements = 8;
69
70     // Configure kernel window
71     Window                  win = calculate_max_window_horizontal(*input->info(), Steps(processed_elements), border_undefined, border_size());
72     AccessWindowAutoPadding output_access(output->info());
73
74     update_window_and_padding(win,
75                               AccessWindowAutoPadding(input->info()),
76                               output_access);
77
78     output_access.set_valid_region();
79
80     INEKernel::configure(win);
81 }
82
83 BorderSize NEGaussianPyramidHorKernel::border_size() const
84 {
85     return BorderSize(2);
86 }
87
88 void NEGaussianPyramidHorKernel::run(const Window &window)
89 {
90     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
91     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
92     ARM_COMPUTE_ERROR_ON(window.x().step() % 2);
93
94     const int16x8_t six  = vdupq_n_s16(6);
95     const int16x8_t four = vdupq_n_s16(4);
96
97     //The output is half the width of the input:
98     Window win_out(window);
99     win_out.set(Window::DimX, Window::Dimension(window.x().start() / 2, window.x().end() / 2, window.x().step() / 2));
100
101     Iterator out(_output, win_out);
102
103     const int even_width = 1 - (_input->info()->dimension(0) % 2);
104     Window    win_in(window);
105     win_in.shift(Window::DimX, -2 + even_width);
106
107     Iterator in(_input, win_in);
108
109     execute_window_loop(window, [&](const Coordinates & id)
110     {
111         const uint8x16x2_t data_2q   = vld2q_u8(in.ptr());
112         const uint8x16_t &data_even = data_2q.val[0];
113         const uint8x16_t &data_odd  = data_2q.val[1];
114
115         const int16x8_t data_l2 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data_even)));
116         const int16x8_t data_l1 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data_odd)));
117         const int16x8_t data_m  = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vextq_u8(data_even, data_even, 1))));
118         const int16x8_t data_r1 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vextq_u8(data_odd, data_odd, 1))));
119         const int16x8_t data_r2 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vextq_u8(data_even, data_even, 2))));
120
121         int16x8_t out_val = vaddq_s16(data_l2, data_r2);
122         out_val           = vmlaq_s16(out_val, data_l1, four);
123         out_val           = vmlaq_s16(out_val, data_m, six);
124         out_val           = vmlaq_s16(out_val, data_r1, four);
125
126         vst1q_s16(reinterpret_cast<int16_t *>(out.ptr()), out_val);
127     },
128     in, out);
129 }
130
131 void NEGaussianPyramidVertKernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
132 {
133     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16);
134     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
135
136     ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != output->info()->dimension(0));
137     ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != 2 * output->info()->dimension(1));
138
139     for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
140     {
141         ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i));
142     }
143
144     _input  = input;
145     _output = output;
146
147     const int          even_height        = 1 - (_input->info()->dimension(1) % 2);
148     const unsigned int processed_elements = 16;
149
150     // Configure kernel window
151     Window win = calculate_max_window(*input->info(), Steps(processed_elements), border_undefined, border_size());
152     // Use all elements in X direction
153     win.set(Window::DimY, Window::Dimension(win.y().start() + even_height, win.y().end() + even_height, 2));
154
155     AccessWindowAutoPadding output_access(output->info());
156
157     update_window_and_padding(win,
158                               AccessWindowAutoPadding(input->info()),
159                               output_access);
160
161     output_access.set_valid_region();
162
163     INEKernel::configure(win);
164 }
165
166 BorderSize NEGaussianPyramidVertKernel::border_size() const
167 {
168     return BorderSize(2, 0);
169 }
170
171 void NEGaussianPyramidVertKernel::run(const Window &window)
172 {
173     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
174     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
175     ARM_COMPUTE_ERROR_ON(window.x().step() != 16);
176     ARM_COMPUTE_ERROR_ON(window.y().step() % 2);
177     ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
178
179     const uint16x8_t six  = vdupq_n_u16(6);
180     const uint16x8_t four = vdupq_n_u16(4);
181
182     Window win_in(window);
183     win_in.set_dimension_step(Window::DimX, 8);
184
185     Iterator in(_input, win_in);
186
187     Window win_out(window);
188     win_out.set(Window::DimY, Window::Dimension(window.y().start() / 2, window.y().end() / 2, 1));
189
190     Iterator out(_output, win_out);
191
192     const uint8_t *input_top2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(win_in.x().start(), 2));
193     const uint8_t *input_top_ptr  = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(win_in.x().start(), 1));
194     const uint8_t *input_mid_ptr  = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(win_in.x().start(), 0));
195     const uint8_t *input_low_ptr  = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(win_in.x().start(), -1));
196     const uint8_t *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(win_in.x().start(), -2));
197
198     execute_window_loop(window, [&](const Coordinates & id)
199     {
200         // Low data
201         const uint16x8_t data_low_t2 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_top2_ptr + in.offset())));
202         const uint16x8_t data_low_t1 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_top_ptr + in.offset())));
203         const uint16x8_t data_low_m  = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_mid_ptr + in.offset())));
204         const uint16x8_t data_low_b1 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_low_ptr + in.offset())));
205         const uint16x8_t data_low_b2 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_low2_ptr + in.offset())));
206
207         uint16x8_t out_low = vaddq_u16(data_low_t2, data_low_b2);
208         out_low            = vmlaq_u16(out_low, data_low_t1, four);
209         out_low            = vmlaq_u16(out_low, data_low_m, six);
210         out_low            = vmlaq_u16(out_low, data_low_b1, four);
211
212         in.increment(Window::DimX);
213
214         // High data
215         const uint16x8_t data_high_t2 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_top2_ptr + in.offset())));
216         const uint16x8_t data_high_t1 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_top_ptr + in.offset())));
217         const uint16x8_t data_high_m  = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_mid_ptr + in.offset())));
218         const uint16x8_t data_high_b1 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_low_ptr + in.offset())));
219         const uint16x8_t data_high_b2 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_low2_ptr + in.offset())));
220
221         uint16x8_t out_high = vaddq_u16(data_high_t2, data_high_b2);
222         out_high            = vmlaq_u16(out_high, data_high_t1, four);
223         out_high            = vmlaq_u16(out_high, data_high_m, six);
224         out_high            = vmlaq_u16(out_high, data_high_b1, four);
225
226         vst1q_u8(out.ptr(), vcombine_u8(vqshrn_n_u16(out_low, 8), vqshrn_n_u16(out_high, 8)));
227     },
228     in, out);
229 }