// dldt: inference-engine/src/inference_engine/cpu_x86_sse42/ie_preprocess_data_sse42.cpp
// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "ie_preprocess_data.hpp"
#include "ie_preprocess_data_sse42.hpp"

#include <nmmintrin.h>  // SSE 4.2

#include <stdint.h>
#include <string.h>     // memset
#include <algorithm>    // std::max

namespace InferenceEngine {
namespace Resize {

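// Branchless ceil()/floor() built on SSE2 scalar-double conversions:
// _mm_cvtsd_si32 rounds to nearest under the default MXCSR rounding mode, and
// the sign bit of an ordered compare (via _mm_movemask_pd) nudges the result
// by one when the rounded value landed on the wrong side of the input.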
static inline int ceil(double value) {
    __m128d t = _mm_set_sd(value);
    int i = _mm_cvtsd_si32(t);
    return i + _mm_movemask_pd(_mm_cmplt_sd(_mm_cvtsi32_sd(t, i), t));
}

static inline int floor(double value) {
    __m128d t = _mm_set_sd(value);
    int i = _mm_cvtsd_si32(t);
    return i - _mm_movemask_pd(_mm_cmplt_sd(t, _mm_cvtsi32_sd(t, i)));
}

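// Scalar reference for _mm_mulhrs_epi16: Q15 multiply with rounding,
// (a * b + 2^14) >> 15.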
static inline int16_t mulq15(int16_t a, int16_t b) {
    return static_cast<int16_t>(((1 << 14) + (int32_t)a * (int32_t)b) >> 15);
}

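// Scalar reference for _mm_mulhi_epu16: unsigned Q16 multiply, truncating.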
static inline uint16_t mulq16(uint16_t a, uint16_t b) {
    return static_cast<uint16_t>(((uint32_t)a * (uint32_t)b) >> 16);
}

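// Separable bilinear resize for U8 planar (NCHW) blobs. Interpolation weights
// are kept in Q15 fixed point so the inner loops can use _mm_mulhrs_epi16; the
// caller supplies a scratch buffer that is carved below into coefficient
// tables plus an intermediate row buffer.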
void resize_bilinear_u8(const Blob::Ptr inBlob, Blob::Ptr outBlob, uint8_t* buffer) {
    Border border = {BORDER_REPLICATE, 0};

    auto dstDims = outBlob->getTensorDesc().getDims();
    auto srcDims = inBlob->getTensorDesc().getDims();

    auto dwidth = static_cast<const int>(dstDims[3]);
    auto dheight = static_cast<const int>(dstDims[2]);
    auto swidth = static_cast<const int>(srcDims[3]);
    auto channels = static_cast<const int>(srcDims[1]);

    auto src_strides = inBlob->getTensorDesc().getBlockingDesc().getStrides();
    auto dst_strides = outBlob->getTensorDesc().getBlockingDesc().getStrides();
    auto origSrcW = src_strides[2];
    auto origSrcH = src_strides[1] / src_strides[2];
    auto origDstW = dst_strides[2];
    auto origDstH = dst_strides[1] / dst_strides[2];

    const int src_go_x = 0;
    const int src_go_y = 0;
    const int dst_go_x = 0;
    const int dst_go_y = 0;
    auto src_full_width = static_cast<const int>(srcDims[3]);
    auto src_full_height = static_cast<const int>(srcDims[2]);
    auto dst_full_width = static_cast<const int>(dstDims[3]);
    auto dst_full_height = static_cast<const int>(dstDims[2]);

    const uint8_t *sptr = static_cast<uint8_t *>(inBlob->buffer()) +
                          inBlob->getTensorDesc().getBlockingDesc().getOffsetPadding();
    uint8_t *dptr = static_cast<uint8_t *>(outBlob->buffer()) +
                    outBlob->getTensorDesc().getBlockingDesc().getOffsetPadding();

    auto sstep = static_cast<const int>(inBlob->getTensorDesc().getBlockingDesc().getStrides()[2]);
    auto dstep = static_cast<const int>(outBlob->getTensorDesc().getBlockingDesc().getStrides()[2]);
    auto scale_x = static_cast<float>(src_full_width) / dst_full_width;
    auto scale_y = static_cast<float>(src_full_height) / dst_full_height;

    const int BITS = 15;
    const int SCALE = (1 << BITS);
    const int alpha_clones_num = 4;
    const int cols_block_size = 8;
    const int kRowsBlockSize = 4;

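    // Carve the scratch buffer: per-output-column source offsets (pre-scaled
    // by kRowsBlockSize to index the interleaved row buffer), Q15 x-weights
    // replicated alpha_clones_num times (so one 16-byte load yields weights
    // for two output pixels across four rows), per-output-row source offsets
    // (pre-scaled by sstep), Q15 y-weights, and the row buffer tptr itself.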
    auto *pxofs1 = reinterpret_cast<int32_t *>(buffer);
    auto *alpha = reinterpret_cast<int16_t *>(pxofs1 + dwidth);
    auto *yofs = reinterpret_cast<int32_t *>(alpha + dwidth * alpha_clones_num);
    auto *beta = reinterpret_cast<int16_t *>(yofs + dheight);
    auto *tptr = reinterpret_cast<uint8_t *>(beta + dheight);

    auto tptr_ = tptr;

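    // Seed the guard bytes of the row buffer with the border value: four
    // bytes in front, plus four past the end of both the one-byte-per-column
    // (scalar) and four-bytes-per-column (interleaved) layouts, so edge
    // offsets read a defined value instead of stale data.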
    tptr_[0] = (uint8_t) border.value;
    tptr_[1] = (uint8_t) border.value;
    tptr_[2] = (uint8_t) border.value;
    tptr_[3] = (uint8_t) border.value;
    tptr_[swidth + 0 + 4] = (uint8_t) border.value;
    tptr_[swidth + 1 + 4] = (uint8_t) border.value;
    tptr_[swidth + 2 + 4] = (uint8_t) border.value;
    tptr_[swidth + 3 + 4] = (uint8_t) border.value;
    tptr_[swidth * kRowsBlockSize + 0 + 4] = (uint8_t) border.value;
    tptr_[swidth * kRowsBlockSize + 1 + 4] = (uint8_t) border.value;
    tptr_[swidth * kRowsBlockSize + 2 + 4] = (uint8_t) border.value;
    tptr_[swidth * kRowsBlockSize + 3 + 4] = (uint8_t) border.value;

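    // Horizontal coefficients: for each output column compute the left source
    // neighbour and its Q15 blend weight using half-pixel centre alignment;
    // offsets and weights are clamped at the edges for BORDER_REPLICATE.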
    for (int dx = dst_go_x; dx < dst_go_x + dwidth; dx++) {
        auto fx = static_cast<float>((dx + 0.5) * scale_x - 0.5);
        int32_t sx = floor(fx);
        fx -= sx;

        int32_t sx0 = sx;
        if (sx < 0 && border.type == BORDER_REPLICATE) {
            fx = 0;
            sx0 = 0;
        }

        fx = fx * SCALE;

        if (sx >= src_full_width - 1 && border.type == BORDER_REPLICATE) {
            fx = 1.f * SCALE - 1;
            sx0 = (std::max)(src_full_width - 2, 0);
        }

        pxofs1[dx - dst_go_x] = kRowsBlockSize * (sx0 - src_go_x);
        for (int i = 0; i < alpha_clones_num; i++) {
            alpha[(dx - dst_go_x) * alpha_clones_num + i] = (int16_t) fx;
        }
    }

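    // Vertical coefficients: same scheme per output row; yofs holds the top
    // source row offset in elements, beta the Q15 blend weight.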
    for (int dy = dst_go_y; dy < dst_go_y + dheight; dy++) {
        float fy = static_cast<float>((dy + 0.5) * scale_y - 0.5);
        int32_t sy = floor(fy);
        fy -= sy;

        int32_t sy0 = sy;
        if (sy < 0 && border.type == BORDER_REPLICATE) {
            fy = 0;
            sy0 = 0;
        }

        fy = fy * SCALE;

        if (sy >= src_full_height - 1 && border.type == BORDER_REPLICATE) {
            fy = 1.f * SCALE - 1;
            sy0 = (std::max)(src_full_height - 2, 0);
        }

        yofs[dy - dst_go_y] = (sy0 - src_go_y) * sstep;
        beta[dy - dst_go_y] = (int16_t) fy;
    }

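    // Images too small for the 8-column / 4-row vector blocks take a plain
    // scalar path: vertical Q15 lerp into tptr, then horizontal lerp into the
    // destination row.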
    if (swidth < cols_block_size || dwidth < cols_block_size || dheight < kRowsBlockSize) {
        auto full_pass = [&](int c, int y) {
            auto sptr_ = sptr + c * origSrcW * origSrcH;
            auto dptr_ = dptr + c * origDstW * origDstH;
            auto tptr_ = tptr;

            for (int x = 0; x < swidth; x++) {
                int val0 = (yofs[y] < 0) ? border.value : sptr_[yofs[y] + x + 0];
                int val1 = (yofs[y] / sstep + 1 >= src_full_height - src_go_y) ? border.value
                                                                               : sptr_[yofs[y] + x + sstep];

                int res = val0 + mulq15(beta[y], (int16_t) (val1 - val0));
                tptr_[x + 4] = (uint8_t) res;
            }

            for (int x = 0; x < dwidth; x++) {
                int val0 = tptr_[pxofs1[x] / kRowsBlockSize + 0 + 4];
                int val1 = tptr_[pxofs1[x] / kRowsBlockSize + 1 + 4];

                int res = val0 + mulq15(alpha[x * alpha_clones_num], (int16_t) (val1 - val0));
                dptr_[y * dstep + x] = (uint8_t) res;
            }
        };

        for (int c = 0; c < channels; c++) {
            for (int y = 0; y < dheight; y++) {
                full_pass(c, y);
            }
        }

        return;
    }

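    // Vectorized pass over kRowsBlockSize destination rows at a time. The
    // vertical lerp writes a column-interleaved row buffer (four bytes per
    // source column, one per row), so the horizontal pass can fetch both
    // neighbours for all four rows with a single 8-byte load. Rows whose
    // lower neighbour would fall outside the image are redirected to offset 0
    // to keep loads in bounds; BORDER_CONSTANT results are patched afterwards.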
    auto full_pass_vec = [&](const uint8_t* sptr_, uint8_t* dptr_, uint8_t* tptr_, int y) {
        int32_t filtered_rows_id[4];
        for (int i = 0; i < 4; i++) {
            filtered_rows_id[i] = (yofs[y + i] < 0) ? 0 :
                                  (yofs[y + i] / sstep >= src_full_height - src_go_y - 1) ? 0 : yofs[y + i];
        }

        __m128i b0 = _mm_set1_epi16(beta[y + 0]);
        __m128i b1 = _mm_set1_epi16(beta[y + 1]);
        __m128i b2 = _mm_set1_epi16(beta[y + 2]);
        __m128i b3 = _mm_set1_epi16(beta[y + 3]);

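        // Vertical pass, 8 source columns per iteration: widen u8 to u16,
        // Q15-lerp the four row pairs with _mm_mulhrs_epi16, pack back to u8
        // and transpose into the column-interleaved layout of tptr_.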
        int x = 0;
        vertical_pass:
        for (; x <= swidth - cols_block_size; x += cols_block_size) {
            __m128i val0lo = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(sptr_ + x + filtered_rows_id[0])),
                                              *(reinterpret_cast<const int64_t *>(sptr_ + x + filtered_rows_id[1])), 1);
            __m128i val0hi = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(sptr_ + x + filtered_rows_id[2])),
                                              *(reinterpret_cast<const int64_t *>(sptr_ + x + filtered_rows_id[3])), 1);
            __m128i val1lo = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(sptr_ + x + filtered_rows_id[0] + sstep)),
                                              *(reinterpret_cast<const int64_t *>(sptr_ + x + filtered_rows_id[1] + sstep)), 1);
            __m128i val1hi = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(sptr_ + x + filtered_rows_id[2] + sstep)),
                                              *(reinterpret_cast<const int64_t *>(sptr_ + x + filtered_rows_id[3] + sstep)), 1);

            __m128i val0_0 = _mm_unpacklo_epi8(val0lo, _mm_setzero_si128());
            __m128i val0_1 = _mm_unpackhi_epi8(val0lo, _mm_setzero_si128());
            __m128i val0_2 = _mm_unpacklo_epi8(val0hi, _mm_setzero_si128());
            __m128i val0_3 = _mm_unpackhi_epi8(val0hi, _mm_setzero_si128());

            __m128i val1_0 = _mm_unpacklo_epi8(val1lo, _mm_setzero_si128());
            __m128i val1_1 = _mm_unpackhi_epi8(val1lo, _mm_setzero_si128());
            __m128i val1_2 = _mm_unpacklo_epi8(val1hi, _mm_setzero_si128());
            __m128i val1_3 = _mm_unpackhi_epi8(val1hi, _mm_setzero_si128());

            __m128i s0_0 = _mm_sub_epi16(val1_0, val0_0);
            __m128i s0_1 = _mm_sub_epi16(val1_1, val0_1);
            __m128i s0_2 = _mm_sub_epi16(val1_2, val0_2);
            __m128i s0_3 = _mm_sub_epi16(val1_3, val0_3);

            __m128i t0 = _mm_mulhrs_epi16(s0_0, b0);
            __m128i t1 = _mm_mulhrs_epi16(s0_1, b1);
            __m128i t2 = _mm_mulhrs_epi16(s0_2, b2);
            __m128i t3 = _mm_mulhrs_epi16(s0_3, b3);

            __m128i r0 = _mm_add_epi16(val0_0, t0);
            __m128i r1 = _mm_add_epi16(val0_1, t1);
            __m128i r2 = _mm_add_epi16(val0_2, t2);
            __m128i r3 = _mm_add_epi16(val0_3, t3);

            __m128i q0 = _mm_packus_epi16(r0, r1);
            __m128i q1 = _mm_packus_epi16(r2, r3);

            __m128i q2 = _mm_blend_epi16(q0, _mm_slli_si128(q1, 4), 0xCC /*0b11001100*/);
            __m128i q3 = _mm_blend_epi16(_mm_srli_si128(q0, 4), q1, 0xCC /*0b11001100*/);

            __m128i q4 = _mm_shuffle_epi8(q2, _mm_setr_epi8(0, 8, 4, 12, 1, 9, 5, 13, 2, 10, 6, 14, 3, 11, 7, 15));
            __m128i q5 = _mm_shuffle_epi8(q3, _mm_setr_epi8(0, 8, 4, 12, 1, 9, 5, 13, 2, 10, 6, 14, 3, 11, 7, 15));

            _mm_storeu_si128(reinterpret_cast<__m128i *>(tptr_ + (x + 0) * kRowsBlockSize + 4), q4);
            _mm_storeu_si128(reinterpret_cast<__m128i *>(tptr_ + (x + 4) * kRowsBlockSize + 4), q5);
        }

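        // Ragged tail: instead of a scalar epilogue, rewind to the last full
        // 8-column block and run it once more; the overlapping stores rewrite
        // values that are already correct. The guard at the top of the
        // function ensures swidth >= cols_block_size here. The horizontal
        // pass below uses the same trick.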
        if (x < swidth) {
            x = swidth - cols_block_size;
            goto vertical_pass;
        }

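        // For BORDER_CONSTANT, redo in scalar code the rows whose vertical
        // neighbour was out of bounds, blending against the border value.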
        if (border.type == BORDER_CONSTANT) {
            for (int i = 0; i < kRowsBlockSize; i++) {
                if (yofs[y + i] < 0) {
                    for (x = 0; x < swidth; x++) {
                        int val0 = border.value;
                        int val1 = sptr_[yofs[y + i] + x + sstep];

                        int res = val0 + mulq15(beta[y + i], (int16_t) (val1 - val0));
                        tptr_[x * 4 + i + 4] = (uint8_t) res;
                    }
                }

                if (yofs[y + i] / sstep >= src_full_height - src_go_y - 1) {
                    for (x = 0; x < swidth; x++) {
                        int val0 = sptr_[yofs[y + i] + x];
                        int val1 = border.value;

                        int res = val0 + mulq15(beta[y + i], (int16_t) (val1 - val0));
                        tptr_[x * 4 + i + 4] = (uint8_t) res;
                    }
                }
            }
        }

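        // Horizontal pass, 8 output columns per iteration: each 8-byte load
        // from tptr_ carries both source neighbours for all four rows; split
        // them apart, Q15-lerp with the replicated alpha weights, then
        // transpose back to row-major and store 8 pixels into each of the
        // four destination rows.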
        x = 0;
        horizontal_pass:
        for (; x <= dwidth - cols_block_size; x += cols_block_size) {
            __m128i a10 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(alpha + (x + 0) * alpha_clones_num));
            __m128i a32 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(alpha + (x + 2) * alpha_clones_num));
            __m128i a54 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(alpha + (x + 4) * alpha_clones_num));
            __m128i a76 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(alpha + (x + 6) * alpha_clones_num));

            __m128i val_0 = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(tptr_ + pxofs1[x + 0] + 4)),
                                             *(reinterpret_cast<const int64_t *>(tptr_ + pxofs1[x + 1] + 4)), 1);
            __m128i val_1 = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(tptr_ + pxofs1[x + 2] + 4)),
                                             *(reinterpret_cast<const int64_t *>(tptr_ + pxofs1[x + 3] + 4)), 1);
            __m128i val_2 = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(tptr_ + pxofs1[x + 4] + 4)),
                                             *(reinterpret_cast<const int64_t *>(tptr_ + pxofs1[x + 5] + 4)), 1);
            __m128i val_3 = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(tptr_ + pxofs1[x + 6] + 4)),
                                             *(reinterpret_cast<const int64_t *>(tptr_ + pxofs1[x + 7] + 4)), 1);

            val_0 = _mm_shuffle_epi32(val_0, _MM_SHUFFLE(3, 1, 2, 0));
            val_1 = _mm_shuffle_epi32(val_1, _MM_SHUFFLE(3, 1, 2, 0));
            val_2 = _mm_shuffle_epi32(val_2, _MM_SHUFFLE(3, 1, 2, 0));
            val_3 = _mm_shuffle_epi32(val_3, _MM_SHUFFLE(3, 1, 2, 0));

            __m128i val0_0 = _mm_unpacklo_epi8(val_0, _mm_setzero_si128());
            __m128i val0_1 = _mm_unpacklo_epi8(val_1, _mm_setzero_si128());
            __m128i val0_2 = _mm_unpacklo_epi8(val_2, _mm_setzero_si128());
            __m128i val0_3 = _mm_unpacklo_epi8(val_3, _mm_setzero_si128());

            __m128i val1_0 = _mm_unpackhi_epi8(val_0, _mm_setzero_si128());
            __m128i val1_1 = _mm_unpackhi_epi8(val_1, _mm_setzero_si128());
            __m128i val1_2 = _mm_unpackhi_epi8(val_2, _mm_setzero_si128());
            __m128i val1_3 = _mm_unpackhi_epi8(val_3, _mm_setzero_si128());

            val1_0 = _mm_sub_epi16(val1_0, val0_0);
            val1_1 = _mm_sub_epi16(val1_1, val0_1);
            val1_2 = _mm_sub_epi16(val1_2, val0_2);
            val1_3 = _mm_sub_epi16(val1_3, val0_3);

            __m128i t0 = _mm_mulhrs_epi16(val1_0, a10);
            __m128i t1 = _mm_mulhrs_epi16(val1_1, a32);
            __m128i t2 = _mm_mulhrs_epi16(val1_2, a54);
            __m128i t3 = _mm_mulhrs_epi16(val1_3, a76);

            __m128i r0 = _mm_add_epi16(val0_0, t0);
            __m128i r1 = _mm_add_epi16(val0_1, t1);
            __m128i r2 = _mm_add_epi16(val0_2, t2);
            __m128i r3 = _mm_add_epi16(val0_3, t3);

            __m128i q0 = _mm_packus_epi16(r0, r1);
            __m128i q1 = _mm_packus_epi16(r2, r3);

            __m128i q2 = _mm_shuffle_epi8(q0, _mm_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15));
            __m128i q3 = _mm_shuffle_epi8(q1, _mm_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15));

            __m128i q4 = _mm_blend_epi16(q2, _mm_slli_si128(q3, 4), 0xCC /*0b11001100*/);
            __m128i q5 = _mm_blend_epi16(_mm_srli_si128(q2, 4), q3, 0xCC /*0b11001100*/);

            _mm_storel_epi64(reinterpret_cast<__m128i *>(dptr_ + (y + 0) * dstep + x), q4);
            _mm_storel_epi64(reinterpret_cast<__m128i *>(dptr_ + (y + 1) * dstep + x), _mm_srli_si128(q4, 8));
            _mm_storel_epi64(reinterpret_cast<__m128i *>(dptr_ + (y + 2) * dstep + x), q5);
            _mm_storel_epi64(reinterpret_cast<__m128i *>(dptr_ + (y + 3) * dstep + x), _mm_srli_si128(q5, 8));
        }

        if (x < dwidth) {
            x = dwidth - cols_block_size;
            goto horizontal_pass;
        }
    };

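    // Each channel is processed in blocks of kRowsBlockSize rows; a trailing
    // partial block is handled by re-running the pass on the last full block
    // (the overlap is harmless, and dheight >= kRowsBlockSize is guaranteed).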
    for (int c = 0; c < channels; c++) {
        for (int y = 0; y <= dheight - kRowsBlockSize; y += kRowsBlockSize) {
            auto sptr_ = sptr + c * origSrcW * origSrcH;
            auto dptr_ = dptr + c * origDstW * origDstH;
            auto tptr_ = tptr;

            full_pass_vec(sptr_, dptr_, tptr_, y);

            if (y + kRowsBlockSize > dheight - kRowsBlockSize)
                full_pass_vec(sptr_, dptr_, tptr_, dheight - kRowsBlockSize);
        }
    }
}

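// Area (pixel-averaging) downscale for U8 planar (NCHW) blobs. Per-axis tap
// index and Q16 weight tables come from getResizeAreaTabSize() /
// computeResizeAreaTab(); x_max_count / y_max_count bound the number of
// source taps per output pixel along each axis. Each output row is produced
// in two stages: a weighted vertical accumulation into vert_sum, then a
// weighted horizontal reduction of vert_sum.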
void resize_area_u8_downscale(const Blob::Ptr inBlob, Blob::Ptr outBlob, uint8_t* buffer) {
    auto dstDims = outBlob->getTensorDesc().getDims();
    auto srcDims = inBlob->getTensorDesc().getDims();

    auto dwidth = static_cast<const int>(dstDims[3]);
    auto dheight = static_cast<const int>(dstDims[2]);
    auto swidth = static_cast<const int>(srcDims[3]);
    auto sheight = static_cast<const int>(srcDims[2]);
    auto channels = static_cast<const int>(srcDims[1]);

    auto src_strides = inBlob->getTensorDesc().getBlockingDesc().getStrides();
    auto dst_strides = outBlob->getTensorDesc().getBlockingDesc().getStrides();
    auto origSrcW = src_strides[2];
    auto origSrcH = src_strides[1] / src_strides[2];
    auto origDstW = dst_strides[2];
    auto origDstH = dst_strides[1] / dst_strides[2];

    const int src_go_x = 0;
    const int src_go_y = 0;
    const int dst_go_x = 0;
    const int dst_go_y = 0;

    auto src_full_width = static_cast<const int>(srcDims[3]);
    auto src_full_height = static_cast<const int>(srcDims[2]);
    auto dst_full_width = static_cast<const int>(dstDims[3]);
    auto dst_full_height = static_cast<const int>(dstDims[2]);

    auto sptr = static_cast<uint8_t*>(inBlob->buffer()) + inBlob->getTensorDesc().getBlockingDesc().getOffsetPadding();
    auto dptr = static_cast<uint8_t*>(outBlob->buffer()) + outBlob->getTensorDesc().getBlockingDesc().getOffsetPadding();
    auto sstep = static_cast<const int>(inBlob->getTensorDesc().getBlockingDesc().getStrides()[2]);
    auto dstep = static_cast<const int>(outBlob->getTensorDesc().getBlockingDesc().getStrides()[2]);

    float scale_x = static_cast<float>(src_full_width) / dst_full_width;
    float scale_y = static_cast<float>(src_full_height) / dst_full_height;

    int x_max_count = getResizeAreaTabSize(dst_go_x, src_full_width,  dwidth,  scale_x);
    int y_max_count = getResizeAreaTabSize(dst_go_y, src_full_height, dheight, scale_y);

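    // Carve the scratch buffer: tap indices and Q16 weights for both axes,
    // the vertical accumulator row, then per-tap weight arrays
    // (alpha0..alpha3) and pshufb gather masks (sxid0..sxid3), filled by
    // generate_alpha_and_id_arrays() for the specialized 2/3/4-tap kernels.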
    auto* xsi = reinterpret_cast<uint16_t*>(buffer);
    auto* ysi = xsi + dwidth;
    auto* xalpha = ysi + dheight;
    auto* yalpha = xalpha + dwidth*x_max_count + 8*16;

    computeResizeAreaTab(src_go_x, dst_go_x, src_full_width,   dwidth, scale_x, xsi, xalpha, x_max_count);
    computeResizeAreaTab(src_go_y, dst_go_y, src_full_height, dheight, scale_y, ysi, yalpha, y_max_count);

    int vert_sum_size = 2*swidth;
    uint16_t* vert_sum = yalpha + dheight*y_max_count;
    uint16_t* alpha0 = vert_sum + vert_sum_size;
    uint16_t* alpha1 = alpha0 + dwidth;
    uint16_t* alpha2 = alpha1 + dwidth;
    uint16_t* alpha3 = alpha2 + dwidth;
    uint16_t* sxid0 = alpha3 + dwidth;
    uint16_t* sxid1 = sxid0 + 4*dwidth;
    uint16_t* sxid2 = sxid1 + 4*dwidth;
    uint16_t* sxid3 = sxid2 + 4*dwidth;

    uint16_t* alpha[] = {alpha0, alpha1, alpha2, alpha3};
    uint16_t* sxid[] = {sxid0, sxid1, sxid2, sxid3};
    generate_alpha_and_id_arrays(x_max_count, dwidth, xalpha, xsi, alpha, sxid);

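    // Produce one destination row: accumulate the weighted source rows into
    // vert_sum in Q8.8, then reduce horizontally with the x weights and shift
    // back down to u8 with rounding.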
    auto full_pass = [&](int c, int y) {
        uint8_t* pdst_row = dptr + (y * dstep) + c * origDstW * origDstH;
        uint16_t* vert_sum_ = vert_sum;

        int ysi_row = ysi[y];

        memset(vert_sum_, 0, swidth * sizeof(uint16_t));

        for (int dy = 0; dy < y_max_count; dy++) {
            if (ysi_row + dy >= sheight) break;

            uint16_t yalpha_dy = yalpha[y * y_max_count + dy];
            const uint8_t *sptr_dy = sptr + ((ysi_row + dy) * sstep) + c * origSrcW * origSrcH;

            int x = 0;

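            // Vertical accumulation, 16 pixels per iteration: promote u8 to
            // Q8.8 by placing each byte in the high half of a u16 lane, then
            // vert_sum += (yalpha * value) >> 16 via _mm_mulhi_epu16.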
            __m128i yalpha_dy_sse = _mm_set1_epi16(yalpha_dy);
            for (; x <= swidth - 16; x += 16) {
                __m128i sval = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sptr_dy + x));

                // sptr_dy[x] << 8
                __m128i sval_Q16_lo = _mm_unpacklo_epi8(_mm_setzero_si128(), sval);
                __m128i sval_Q16_hi = _mm_unpackhi_epi8(_mm_setzero_si128(), sval);

                __m128i vert_sum_lo = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + x + 0));
                __m128i vert_sum_hi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + x + 8));

                vert_sum_lo = _mm_add_epi16(vert_sum_lo, _mm_mulhi_epu16(yalpha_dy_sse, sval_Q16_lo));
                vert_sum_hi = _mm_add_epi16(vert_sum_hi, _mm_mulhi_epu16(yalpha_dy_sse, sval_Q16_hi));

                _mm_storeu_si128(reinterpret_cast<__m128i*>(vert_sum_ + x + 0), vert_sum_lo);
                _mm_storeu_si128(reinterpret_cast<__m128i*>(vert_sum_ + x + 8), vert_sum_hi);
            }

            for (; x < swidth; x++) {
                vert_sum_[x] += mulq16(yalpha_dy, static_cast<uint16_t>(sptr_dy[x] << 8));
            }
        }

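        // Horizontal reduction. The 2-, 3- and 4-tap cases gather neighbouring
        // vert_sum entries with the precomputed pshufb masks; up to 7 taps
        // uses scalar gathers into SSE registers; wider filters accumulate 8
        // taps per iteration and finish with an in-register horizontal sum.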
        if (x_max_count == 2) {
            int x = 0;
            for (; x <= dwidth - 8; x += 8) {
                __m128i res = _mm_set1_epi16(1 << (8 - 1));

                int id0 = xsi[x];

                __m128i chunk0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0));
                __m128i chunk1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 8));

                __m128i sx0_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 2));
                __m128i sx0_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 2 + 8));

                __m128i sx1_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 2));
                __m128i sx1_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 2 + 8));

                __m128i vert_sum0 = _mm_or_si128(_mm_shuffle_epi8(chunk0, sx0_id0),
                                                 _mm_shuffle_epi8(chunk1, sx0_id1));
                __m128i vert_sum1 = _mm_or_si128(_mm_shuffle_epi8(chunk0, sx1_id0),
                                                 _mm_shuffle_epi8(chunk1, sx1_id1));

                res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha0 + x)), vert_sum0));
                res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha1 + x)), vert_sum1));

                res = _mm_srli_epi16(res, 8);
                res = _mm_packus_epi16(res, res);
                _mm_storel_epi64(reinterpret_cast<__m128i*>(pdst_row + x), res);
            }

            for (; x < dwidth; x++) {
                uint16_t res = 1 << (8 - 1);
                int id = xsi[x];
                res += mulq16(alpha0[x], vert_sum_[id + 0]);
                res += mulq16(alpha1[x], vert_sum_[id + 1]);
                pdst_row[x] = saturateU32toU8(res >> 8);
            }
        } else if (x_max_count == 3) {
            int x = 0;
            for (; x <= dwidth - 8; x += 8) {
                __m128i res = _mm_set1_epi16(1 << (8 - 1));

                int id0 = xsi[x];

                __m128i chunk0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0));
                __m128i chunk1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 8));
                __m128i chunk2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 16));

                __m128i sx0_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 3));
                __m128i sx0_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 3 + 8));
                __m128i sx0_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 3 + 16));

                __m128i sx1_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 3));
                __m128i sx1_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 3 + 8));
                __m128i sx1_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 3 + 16));

                __m128i sx2_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 3));
                __m128i sx2_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 3 + 8));
                __m128i sx2_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 3 + 16));

                __m128i vert_sum0 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx0_id0),
                                                              _mm_shuffle_epi8(chunk1, sx0_id1)),
                                                 _mm_shuffle_epi8(chunk2, sx0_id2));
                __m128i vert_sum1 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx1_id0),
                                                              _mm_shuffle_epi8(chunk1, sx1_id1)),
                                                 _mm_shuffle_epi8(chunk2, sx1_id2));
                __m128i vert_sum2 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx2_id0),
                                                              _mm_shuffle_epi8(chunk1, sx2_id1)),
                                                 _mm_shuffle_epi8(chunk2, sx2_id2));

                res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha0 + x)), vert_sum0));
                res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha1 + x)), vert_sum1));
                res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha2 + x)), vert_sum2));

                res = _mm_srli_epi16(res, 8);
                res = _mm_packus_epi16(res, res);
                _mm_storel_epi64(reinterpret_cast<__m128i*>(pdst_row + x), res);
            }

            for (; x < dwidth; x++) {
                uint16_t res = 1 << (8 - 1);
                int id = xsi[x];
                res += mulq16(alpha0[x], vert_sum_[id + 0]);
                res += mulq16(alpha1[x], vert_sum_[id + 1]);
                res += mulq16(alpha2[x], vert_sum_[id + 2]);
                pdst_row[x] = saturateU32toU8(res >> 8);
            }
        } else if (x_max_count == 4) {
            int x = 0;
            for (; x <= dwidth - 8; x += 8) {
                __m128i res = _mm_set1_epi16(1 << (8 - 1));

                int id0 = xsi[x];

                __m128i chunk0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0));
                __m128i chunk1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 8));
                __m128i chunk2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 16));
                __m128i chunk3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 24));

                __m128i sx0_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 4));
                __m128i sx0_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 4 + 8));
                __m128i sx0_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 4 + 16));
                __m128i sx0_id3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 4 + 24));

                __m128i sx1_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 4));
                __m128i sx1_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 4 + 8));
                __m128i sx1_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 4 + 16));
                __m128i sx1_id3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 4 + 24));

                __m128i sx2_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 4));
                __m128i sx2_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 4 + 8));
                __m128i sx2_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 4 + 16));
                __m128i sx2_id3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 4 + 24));

                __m128i sx3_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid3 + x * 4));
                __m128i sx3_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid3 + x * 4 + 8));
                __m128i sx3_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid3 + x * 4 + 16));
                __m128i sx3_id3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid3 + x * 4 + 24));

                __m128i vert_sum0 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx0_id0),
                                                              _mm_shuffle_epi8(chunk1, sx0_id1)),
                                                 _mm_or_si128(_mm_shuffle_epi8(chunk2, sx0_id2),
                                                              _mm_shuffle_epi8(chunk3, sx0_id3)));
                __m128i vert_sum1 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx1_id0),
                                                              _mm_shuffle_epi8(chunk1, sx1_id1)),
                                                 _mm_or_si128(_mm_shuffle_epi8(chunk2, sx1_id2),
                                                              _mm_shuffle_epi8(chunk3, sx1_id3)));
                __m128i vert_sum2 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx2_id0),
                                                              _mm_shuffle_epi8(chunk1, sx2_id1)),
                                                 _mm_or_si128(_mm_shuffle_epi8(chunk2, sx2_id2),
                                                              _mm_shuffle_epi8(chunk3, sx2_id3)));
                __m128i vert_sum3 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx3_id0),
                                                              _mm_shuffle_epi8(chunk1, sx3_id1)),
                                                 _mm_or_si128(_mm_shuffle_epi8(chunk2, sx3_id2),
                                                              _mm_shuffle_epi8(chunk3, sx3_id3)));

                res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha0 + x)), vert_sum0));
                res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha1 + x)), vert_sum1));
                res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha2 + x)), vert_sum2));
                res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha3 + x)), vert_sum3));

                res = _mm_srli_epi16(res, 8);
                res = _mm_packus_epi16(res, res);
                _mm_storel_epi64(reinterpret_cast<__m128i*>(pdst_row + x), res);
            }

            for (; x < dwidth; x++) {
                uint16_t res = 1 << (8 - 1);
                int id = xsi[x];
                res += mulq16(alpha0[x], vert_sum_[id + 0]);
                res += mulq16(alpha1[x], vert_sum_[id + 1]);
                res += mulq16(alpha2[x], vert_sum_[id + 2]);
                res += mulq16(alpha3[x], vert_sum_[id + 3]);
                pdst_row[x] = saturateU32toU8(res >> 8);
            }
        } else if (x_max_count <= 7) {
            int x = 0;
            for (; x <= dwidth - 8; x += 8) {
                __m128i res = _mm_set1_epi16(1 << (16 - 8 - 1));
                for (int i = 0; i < x_max_count; i++) {
                    __m128i valpha = _mm_setr_epi16(xalpha[x * x_max_count + x_max_count * 0 + i],
                                                    xalpha[x * x_max_count + x_max_count * 1 + i],
                                                    xalpha[x * x_max_count + x_max_count * 2 + i],
                                                    xalpha[x * x_max_count + x_max_count * 3 + i],
                                                    xalpha[x * x_max_count + x_max_count * 4 + i],
                                                    xalpha[x * x_max_count + x_max_count * 5 + i],
                                                    xalpha[x * x_max_count + x_max_count * 6 + i],
                                                    xalpha[x * x_max_count + x_max_count * 7 + i]);
                    __m128i vvert_sum = _mm_setr_epi16(vert_sum_[xsi[x + 0] + i],
                                                       vert_sum_[xsi[x + 1] + i],
                                                       vert_sum_[xsi[x + 2] + i],
                                                       vert_sum_[xsi[x + 3] + i],
                                                       vert_sum_[xsi[x + 4] + i],
                                                       vert_sum_[xsi[x + 5] + i],
                                                       vert_sum_[xsi[x + 6] + i],
                                                       vert_sum_[xsi[x + 7] + i]);

                    res = _mm_add_epi16(res, _mm_mulhi_epu16(valpha, vvert_sum));
                }
                res = _mm_srli_epi16(res, 8);
                res = _mm_packus_epi16(res, res);
                _mm_storel_epi64(reinterpret_cast<__m128i*>(pdst_row + x), res);
            }

            for (; x < dwidth; x++) {
                uint16_t res = 1 << (8 - 1);
                for (int i = 0; i < x_max_count; i++) {
                    uint16_t a = xalpha[x * x_max_count + i];
                    int sx = xsi[x] + i;

                    res += mulq16(a, vert_sum_[sx]);
                }
                pdst_row[x] = saturateU32toU8(res >> 8);
            }
        } else {
            for (int x = 0; x < dwidth; x++) {
                uint16_t res = 1 << (8 - 1);
                __m128i vres = _mm_setzero_si128();
                int id = xsi[x];

                int i = 0;
                for (; i <= x_max_count - 8; i += 8) {
                    __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(xalpha + x * x_max_count + i));
                    __m128i s = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id + i));

                    vres = _mm_add_epi16(vres, _mm_mulhi_epu16(a, s));
                }
                vres = _mm_add_epi16(vres, _mm_slli_si128(vres, 2));
                vres = _mm_add_epi16(vres, _mm_slli_si128(vres, 4));
                vres = _mm_add_epi16(vres, _mm_slli_si128(vres, 8));
                res += static_cast<uint16_t>(_mm_extract_epi16(vres, 7));

                for (; i < x_max_count; i++) {
                    uint16_t a = xalpha[x * x_max_count + i];
                    uint16_t s = vert_sum_[id + i];

                    res += mulq16(a, s);
                }

                pdst_row[x] = saturateU32toU8(res >> 8);
            }
        }
    };

    for (int c = 0; c < channels; c++) {
        for (int y = 0; y < dheight; y++) {
            full_pass(c, y);
        }
    }
}

}  // namespace Resize
}  // namespace InferenceEngine