// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "ie_preprocess_data.hpp"
#include "ie_preprocess_data_sse42.hpp"

#include <nmmintrin.h>  // SSE 4.2

#include <stdint.h>
#include <algorithm>    // std::max
#include <cstring>      // memset

namespace InferenceEngine {
namespace Resize {
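
// Branchless SSE replacements for std::ceil()/std::floor() on doubles:
// _mm_cvtsd_si32() rounds to nearest, and the compare/movemask pair adds
// (or subtracts) the 1 needed when the rounded value landed on the wrong
// side of the input.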
static inline int ceil(double value) {
    __m128d t = _mm_set_sd(value);
    int i = _mm_cvtsd_si32(t);
    return i + _mm_movemask_pd(_mm_cmplt_sd(_mm_cvtsi32_sd(t, i), t));
}
static inline int floor(double value) {
    __m128d t = _mm_set_sd(value);
    int i = _mm_cvtsd_si32(t);
    return i - _mm_movemask_pd(_mm_cmplt_sd(t, _mm_cvtsi32_sd(t, i)));
}
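
// Scalar fixed-point multiplies mirroring the SIMD instructions used below:
// mulq15() is a rounded Q1.15 multiply, the per-lane behaviour of
// _mm_mulhrs_epi16(); mulq16() keeps the high 16 bits of an unsigned
// 16x16-bit product, the per-lane behaviour of _mm_mulhi_epu16().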
static inline int16_t mulq15(int16_t a, int16_t b) {
    return static_cast<int16_t>(((1 << 14) + (int32_t)a * (int32_t)b) >> 15);
}
static inline uint16_t mulq16(uint16_t a, uint16_t b) {
    return static_cast<uint16_t>(((uint32_t)a * (uint32_t)b) >> 16);
}
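
// Bilinear u8 resize over an NCHW blob. Interpolation weights are kept in
// Q1.15; `buffer` is caller-provided scratch that is carved into offset and
// weight tables plus a temporary row block (see the layout comment below).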
void resize_bilinear_u8(const Blob::Ptr inBlob, Blob::Ptr outBlob, uint8_t* buffer) {
    Border border = {BORDER_REPLICATE, 0};
    auto dstDims = outBlob->getTensorDesc().getDims();
    auto srcDims = inBlob->getTensorDesc().getDims();

    auto dwidth = static_cast<const int>(dstDims[3]);
    auto dheight = static_cast<const int>(dstDims[2]);
    auto swidth = static_cast<const int>(srcDims[3]);
    auto channels = static_cast<const int>(srcDims[1]);

    auto src_strides = inBlob->getTensorDesc().getBlockingDesc().getStrides();
    auto dst_strides = outBlob->getTensorDesc().getBlockingDesc().getStrides();
    auto origSrcW = src_strides[2];
    auto origSrcH = src_strides[1] / src_strides[2];
    auto origDstW = dst_strides[2];
    auto origDstH = dst_strides[1] / dst_strides[2];
    const int src_go_x = 0;
    const int src_go_y = 0;
    const int dst_go_x = 0;
    const int dst_go_y = 0;

    auto src_full_width = static_cast<const int>(srcDims[3]);
    auto src_full_height = static_cast<const int>(srcDims[2]);
    auto dst_full_width = static_cast<const int>(dstDims[3]);
    auto dst_full_height = static_cast<const int>(dstDims[2]);
    const uint8_t *sptr = static_cast<uint8_t *>(inBlob->buffer()) +
            inBlob->getTensorDesc().getBlockingDesc().getOffsetPadding();
    uint8_t *dptr = static_cast<uint8_t *>(outBlob->buffer()) +
            outBlob->getTensorDesc().getBlockingDesc().getOffsetPadding();

    auto sstep = static_cast<const int>(inBlob->getTensorDesc().getBlockingDesc().getStrides()[2]);
    auto dstep = static_cast<const int>(outBlob->getTensorDesc().getBlockingDesc().getStrides()[2]);
    auto scale_x = static_cast<float>(src_full_width) / dst_full_width;
    auto scale_y = static_cast<float>(src_full_height) / dst_full_height;

    const int BITS = 15;
    const int SCALE = (1 << BITS);
    const int alpha_clones_num = 4;
    const int cols_block_size = 8;
    const int kRowsBlockSize = 4;
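
    // Scratch-buffer layout (carved in order from `buffer`):
    //   pxofs1 : per-destination-x offset into the interleaved temp rows,
    //            already multiplied by kRowsBlockSize;
    //   alpha  : Q1.15 horizontal weight, cloned alpha_clones_num (4) times
    //            so one load lines up with the 4-row interleaving;
    //   yofs   : per-destination-y offset of the top source row in the plane;
    //   beta   : Q1.15 vertical weight per destination row;
    //   tptr   : temp block of 4 vertically interpolated rows, interleaved
    //            pixel-by-pixel, with guard pixels at the edges.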
    auto *pxofs1 = reinterpret_cast<int32_t *>(buffer);
    auto *alpha = reinterpret_cast<int16_t *>(pxofs1 + dwidth);
    auto *yofs = reinterpret_cast<int32_t *>(alpha + dwidth * alpha_clones_num);
    auto *beta = reinterpret_cast<int16_t *>(yofs + dheight);
    auto *tptr = reinterpret_cast<uint8_t *>(beta + dheight);

    auto tptr_ = tptr;
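
    // Pre-fill the 4 guard pixels on the left edge and on the right edge of
    // both temp-row layouts (scalar: swidth wide, interleaved:
    // swidth * kRowsBlockSize wide) so out-of-range horizontal taps read the
    // border value instead of garbage.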
    tptr_[0] = (uint8_t) border.value;
    tptr_[1] = (uint8_t) border.value;
    tptr_[2] = (uint8_t) border.value;
    tptr_[3] = (uint8_t) border.value;

    tptr_[swidth + 0 + 4] = (uint8_t) border.value;
    tptr_[swidth + 1 + 4] = (uint8_t) border.value;
    tptr_[swidth + 2 + 4] = (uint8_t) border.value;
    tptr_[swidth + 3 + 4] = (uint8_t) border.value;

    tptr_[swidth * kRowsBlockSize + 0 + 4] = (uint8_t) border.value;
    tptr_[swidth * kRowsBlockSize + 1 + 4] = (uint8_t) border.value;
    tptr_[swidth * kRowsBlockSize + 2 + 4] = (uint8_t) border.value;
    tptr_[swidth * kRowsBlockSize + 3 + 4] = (uint8_t) border.value;
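
    // For every destination column, precompute the clamped source offset and
    // the Q1.15 horizontal weight; the weight is cloned 4 times so the
    // horizontal pass can load weights already matched to the interleaved
    // 4-row temp data.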
    for (int dx = dst_go_x; dx < dst_go_x + dwidth; dx++) {
        auto fx = static_cast<float>((dx + 0.5) * scale_x - 0.5);
        int32_t sx = floor(fx);
        fx -= sx;

        int32_t sx0 = sx;
        if (sx < 0 && border.type == BORDER_REPLICATE) {
            fx = 0;
            sx0 = 0;
        }

        fx = fx * SCALE;

        if (sx >= src_full_width - 1 && border.type == BORDER_REPLICATE) {
            fx = 1.f * SCALE - 1;
            sx0 = (std::max)(src_full_width - 2, 0);
        }

        pxofs1[dx - dst_go_x] = kRowsBlockSize * (sx0 - src_go_x);
        for (int i = 0; i < alpha_clones_num; i++) {
            alpha[(dx - dst_go_x) * alpha_clones_num + i] = (int16_t) fx;
        }
    }
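
    // Same precomputation along y: yofs holds the offset of the upper source
    // row inside the plane, beta the Q1.15 vertical weight.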
    for (int dy = dst_go_y; dy < dst_go_y + dheight; dy++) {
        float fy = static_cast<float>((dy + 0.5) * scale_y - 0.5);
        int32_t sy = floor(fy);
        fy -= sy;

        int32_t sy0 = sy;
        if (sy < 0 && border.type == BORDER_REPLICATE) {
            fy = 0;
            sy0 = 0;
        }

        fy = fy * SCALE;

        if (sy >= src_full_height - 1 && border.type == BORDER_REPLICATE) {
            fy = 1.f * SCALE - 1;
            sy0 = (std::max)(src_full_height - 2, 0);
        }

        yofs[dy - dst_go_y] = (sy0 - src_go_y) * sstep;
        beta[dy - dst_go_y] = (int16_t) fy;
    }
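
    // Images too small for the 8-column/4-row SIMD blocks take a plain scalar
    // path: interpolate one row vertically into the temp buffer, then walk
    // the destination row horizontally.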
    if (swidth < cols_block_size || dwidth < cols_block_size || dheight < kRowsBlockSize) {
        auto full_pass = [&](int c, int y) {
            auto sptr_ = sptr + c * origSrcW * origSrcH;
            auto dptr_ = dptr + c * origDstW * origDstH;

            for (int x = 0; x < swidth; x++) {
                int val0 = (yofs[y] < 0) ? border.value : sptr_[yofs[y] + x + 0];
                int val1 = (yofs[y] / sstep + 1 >= src_full_height - src_go_y) ? border.value
                                                                               : sptr_[yofs[y] + x + sstep];

                int res = val0 + mulq15(beta[y], (int16_t) (val1 - val0));
                tptr_[x + 4] = (uint8_t) res;
            }

            for (int x = 0; x < dwidth; x++) {
                int val0 = tptr_[pxofs1[x] / kRowsBlockSize + 0 + 4];
                int val1 = tptr_[pxofs1[x] / kRowsBlockSize + 1 + 4];

                int res = val0 + mulq15(alpha[x * alpha_clones_num], (int16_t) (val1 - val0));
                dptr_[y * dstep + x] = (uint8_t) res;
            }
        };

        for (int c = 0; c < channels; c++) {
            for (int y = 0; y < dheight; y++) {
                full_pass(c, y);
            }
        }

        return;
    }
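
    // Vector path: each call produces 4 destination rows. A vertical pass
    // blends the 4 (row, row+1) pairs in Q1.15 and transposes the results
    // into the temp buffer so the 4 values for one x are adjacent; the
    // horizontal pass then gathers neighbour pairs per destination column,
    // blends them, and transposes back to 4 planar output rows.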
    auto full_pass_vec = [&](const uint8_t* sptr_, uint8_t* dptr_, uint8_t* tptr_, int y) {
        // Rows whose vertical neighbour would fall outside the image are
        // redirected to offset 0 so the SIMD loads stay in bounds; the
        // BORDER_CONSTANT fixup below rewrites those temp values.
        int32_t filtered_rows_id[4];
        for (int i = 0; i < 4; i++) {
            filtered_rows_id[i] = (yofs[y + i] < 0) ? 0 :
                (yofs[y + i] / sstep >= src_full_height - src_go_y - 1) ? 0 : yofs[y + i];
        }

        __m128i b0 = _mm_set1_epi16(beta[y + 0]);
        __m128i b1 = _mm_set1_epi16(beta[y + 1]);
        __m128i b2 = _mm_set1_epi16(beta[y + 2]);
        __m128i b3 = _mm_set1_epi16(beta[y + 3]);

        int x = 0;
vertical_pass:
        for (; x <= swidth - cols_block_size; x += cols_block_size) {
            // Load 8 consecutive pixels from each of the 4 top rows...
            __m128i val0lo = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(sptr_ + x + filtered_rows_id[0])),
                                              *(reinterpret_cast<const int64_t *>(sptr_ + x + filtered_rows_id[1])), 1);
            __m128i val0hi = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(sptr_ + x + filtered_rows_id[2])),
                                              *(reinterpret_cast<const int64_t *>(sptr_ + x + filtered_rows_id[3])), 1);
            // ...and from the rows directly below them.
            __m128i val1lo = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(sptr_ + x + filtered_rows_id[0] + sstep)),
                                              *(reinterpret_cast<const int64_t *>(sptr_ + x + filtered_rows_id[1] + sstep)), 1);
            __m128i val1hi = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(sptr_ + x + filtered_rows_id[2] + sstep)),
                                              *(reinterpret_cast<const int64_t *>(sptr_ + x + filtered_rows_id[3] + sstep)), 1);

            // Widen u8 -> i16.
            __m128i val0_0 = _mm_unpacklo_epi8(val0lo, _mm_setzero_si128());
            __m128i val0_1 = _mm_unpackhi_epi8(val0lo, _mm_setzero_si128());
            __m128i val0_2 = _mm_unpacklo_epi8(val0hi, _mm_setzero_si128());
            __m128i val0_3 = _mm_unpackhi_epi8(val0hi, _mm_setzero_si128());

            __m128i val1_0 = _mm_unpacklo_epi8(val1lo, _mm_setzero_si128());
            __m128i val1_1 = _mm_unpackhi_epi8(val1lo, _mm_setzero_si128());
            __m128i val1_2 = _mm_unpacklo_epi8(val1hi, _mm_setzero_si128());
            __m128i val1_3 = _mm_unpackhi_epi8(val1hi, _mm_setzero_si128());

            // res = val0 + beta * (val1 - val0), in Q1.15.
            __m128i s0_0 = _mm_sub_epi16(val1_0, val0_0);
            __m128i s0_1 = _mm_sub_epi16(val1_1, val0_1);
            __m128i s0_2 = _mm_sub_epi16(val1_2, val0_2);
            __m128i s0_3 = _mm_sub_epi16(val1_3, val0_3);

            __m128i t0 = _mm_mulhrs_epi16(s0_0, b0);
            __m128i t1 = _mm_mulhrs_epi16(s0_1, b1);
            __m128i t2 = _mm_mulhrs_epi16(s0_2, b2);
            __m128i t3 = _mm_mulhrs_epi16(s0_3, b3);

            __m128i r0 = _mm_add_epi16(val0_0, t0);
            __m128i r1 = _mm_add_epi16(val0_1, t1);
            __m128i r2 = _mm_add_epi16(val0_2, t2);
            __m128i r3 = _mm_add_epi16(val0_3, t3);

            // Pack back to u8 and transpose so the 4 row values for each x
            // become adjacent in the temp buffer.
            __m128i q0 = _mm_packus_epi16(r0, r1);
            __m128i q1 = _mm_packus_epi16(r2, r3);

            __m128i q2 = _mm_blend_epi16(q0, _mm_slli_si128(q1, 4), 0xCC /*0b11001100*/);
            __m128i q3 = _mm_blend_epi16(_mm_srli_si128(q0, 4), q1, 0xCC /*0b11001100*/);

            __m128i q4 = _mm_shuffle_epi8(q2, _mm_setr_epi8(0, 8, 4, 12, 1, 9, 5, 13, 2, 10, 6, 14, 3, 11, 7, 15));
            __m128i q5 = _mm_shuffle_epi8(q3, _mm_setr_epi8(0, 8, 4, 12, 1, 9, 5, 13, 2, 10, 6, 14, 3, 11, 7, 15));

            _mm_storeu_si128(reinterpret_cast<__m128i *>(tptr_ + (x + 0) * kRowsBlockSize + 4), q4);
            _mm_storeu_si128(reinterpret_cast<__m128i *>(tptr_ + (x + 4) * kRowsBlockSize + 4), q5);
        }

        // Ragged tail: re-run the last full block (overlapping stores are harmless).
        if (x < swidth) {
            x = swidth - cols_block_size;
            goto vertical_pass;
        }
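
        // With BORDER_CONSTANT the redirected rows above hold wrong data, so
        // redo them in scalar code using the constant border value on the
        // missing side. (Dead with the BORDER_REPLICATE setting hardcoded at
        // the top of this function, but kept for generality.)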
        if (border.type == BORDER_CONSTANT) {
            for (int i = 0; i < kRowsBlockSize; i++) {
                if (yofs[y + i] < 0) {
                    for (x = 0; x < swidth; x++) {
                        int val0 = border.value;
                        int val1 = sptr_[yofs[y + i] + x + sstep];

                        int res = val0 + mulq15(beta[y + i], (int16_t) (val1 - val0));
                        tptr_[x * 4 + i + 4] = (uint8_t) res;
                    }
                }

                if (yofs[y + i] / sstep >= src_full_height - src_go_y - 1) {
                    for (x = 0; x < swidth; x++) {
                        int val0 = sptr_[yofs[y + i] + x];
                        int val1 = border.value;

                        int res = val0 + mulq15(beta[y + i], (int16_t) (val1 - val0));
                        tptr_[x * 4 + i + 4] = (uint8_t) res;
                    }
                }
            }
        }
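
        // Horizontal pass: for 8 destination columns at a time, gather the
        // (pixel, right-neighbour) pairs of all 4 rows via pxofs1, blend them
        // with the cloned Q1.15 alpha weights, and transpose the result back
        // into 4 planar 8-pixel runs.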
        x = 0;
horizontal_pass:
        for (; x <= dwidth - cols_block_size; x += cols_block_size) {
            __m128i a10 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(alpha + (x + 0) * alpha_clones_num));
            __m128i a32 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(alpha + (x + 2) * alpha_clones_num));
            __m128i a54 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(alpha + (x + 4) * alpha_clones_num));
            __m128i a76 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(alpha + (x + 6) * alpha_clones_num));

            // Each 8-byte load picks up the (pixel, neighbour) pair of all 4 rows.
            __m128i val_0 = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(tptr_ + pxofs1[x + 0] + 4)),
                                             *(reinterpret_cast<const int64_t *>(tptr_ + pxofs1[x + 1] + 4)), 1);
            __m128i val_1 = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(tptr_ + pxofs1[x + 2] + 4)),
                                             *(reinterpret_cast<const int64_t *>(tptr_ + pxofs1[x + 3] + 4)), 1);
            __m128i val_2 = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(tptr_ + pxofs1[x + 4] + 4)),
                                             *(reinterpret_cast<const int64_t *>(tptr_ + pxofs1[x + 5] + 4)), 1);
            __m128i val_3 = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(tptr_ + pxofs1[x + 6] + 4)),
                                             *(reinterpret_cast<const int64_t *>(tptr_ + pxofs1[x + 7] + 4)), 1);

            val_0 = _mm_shuffle_epi32(val_0, _MM_SHUFFLE(3, 1, 2, 0));
            val_1 = _mm_shuffle_epi32(val_1, _MM_SHUFFLE(3, 1, 2, 0));
            val_2 = _mm_shuffle_epi32(val_2, _MM_SHUFFLE(3, 1, 2, 0));
            val_3 = _mm_shuffle_epi32(val_3, _MM_SHUFFLE(3, 1, 2, 0));

            // Split into left (val0_*) and right (val1_*) taps, widened to i16.
            __m128i val0_0 = _mm_unpacklo_epi8(val_0, _mm_setzero_si128());
            __m128i val0_1 = _mm_unpacklo_epi8(val_1, _mm_setzero_si128());
            __m128i val0_2 = _mm_unpacklo_epi8(val_2, _mm_setzero_si128());
            __m128i val0_3 = _mm_unpacklo_epi8(val_3, _mm_setzero_si128());

            __m128i val1_0 = _mm_unpackhi_epi8(val_0, _mm_setzero_si128());
            __m128i val1_1 = _mm_unpackhi_epi8(val_1, _mm_setzero_si128());
            __m128i val1_2 = _mm_unpackhi_epi8(val_2, _mm_setzero_si128());
            __m128i val1_3 = _mm_unpackhi_epi8(val_3, _mm_setzero_si128());

            val1_0 = _mm_sub_epi16(val1_0, val0_0);
            val1_1 = _mm_sub_epi16(val1_1, val0_1);
            val1_2 = _mm_sub_epi16(val1_2, val0_2);
            val1_3 = _mm_sub_epi16(val1_3, val0_3);

            __m128i t0 = _mm_mulhrs_epi16(val1_0, a10);
            __m128i t1 = _mm_mulhrs_epi16(val1_1, a32);
            __m128i t2 = _mm_mulhrs_epi16(val1_2, a54);
            __m128i t3 = _mm_mulhrs_epi16(val1_3, a76);

            __m128i r0 = _mm_add_epi16(val0_0, t0);
            __m128i r1 = _mm_add_epi16(val0_1, t1);
            __m128i r2 = _mm_add_epi16(val0_2, t2);
            __m128i r3 = _mm_add_epi16(val0_3, t3);

            // Transpose the interleaved results back to 4 planar rows.
            __m128i q0 = _mm_packus_epi16(r0, r1);
            __m128i q1 = _mm_packus_epi16(r2, r3);

            __m128i q2 = _mm_shuffle_epi8(q0, _mm_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15));
            __m128i q3 = _mm_shuffle_epi8(q1, _mm_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15));

            __m128i q4 = _mm_blend_epi16(q2, _mm_slli_si128(q3, 4), 0xCC /*0b11001100*/);
            __m128i q5 = _mm_blend_epi16(_mm_srli_si128(q2, 4), q3, 0xCC /*0b11001100*/);

            _mm_storel_epi64(reinterpret_cast<__m128i *>(dptr_ + (y + 0) * dstep + x), q4);
            _mm_storel_epi64(reinterpret_cast<__m128i *>(dptr_ + (y + 1) * dstep + x), _mm_srli_si128(q4, 8));
            _mm_storel_epi64(reinterpret_cast<__m128i *>(dptr_ + (y + 2) * dstep + x), q5);
            _mm_storel_epi64(reinterpret_cast<__m128i *>(dptr_ + (y + 3) * dstep + x), _mm_srli_si128(q5, 8));
        }

        if (x < dwidth) {
            x = dwidth - cols_block_size;
            goto horizontal_pass;
        }
    };
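
    // Walk each channel in blocks of 4 destination rows, reusing the temp
    // buffer between blocks.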
    for (int c = 0; c < channels; c++) {
        for (int y = 0; y <= dheight - kRowsBlockSize; y += kRowsBlockSize) {
            auto sptr_ = sptr + c * origSrcW * origSrcH;
            auto dptr_ = dptr + c * origDstW * origDstH;
            auto tptr_ = tptr;

            full_pass_vec(sptr_, dptr_, tptr_, y);

            // When dheight is not a multiple of 4, re-run the pass anchored
            // at the last full block; the overlapped rows are recomputed.
            if (y + kRowsBlockSize > dheight - kRowsBlockSize)
                full_pass_vec(sptr_, dptr_, tptr_, dheight - kRowsBlockSize);
        }
    }
}
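
// Area (box-filter) downscale for u8. The tables computed by
// computeResizeAreaTab() hold, per output row/column, the first contributing
// source index (ysi/xsi) and up to y_max_count/x_max_count Q0.16 weights
// (yalpha/xalpha). Each output row is produced by accumulating weighted
// source rows into a Q8.8 vert_sum line, then reducing that line
// horizontally with one of several kernels specialized by tap count.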
void resize_area_u8_downscale(const Blob::Ptr inBlob, Blob::Ptr outBlob, uint8_t* buffer) {
    auto dstDims = outBlob->getTensorDesc().getDims();
    auto srcDims = inBlob->getTensorDesc().getDims();

    auto dwidth = static_cast<const int>(dstDims[3]);
    auto dheight = static_cast<const int>(dstDims[2]);
    auto swidth = static_cast<const int>(srcDims[3]);
    auto sheight = static_cast<const int>(srcDims[2]);
    auto channels = static_cast<const int>(srcDims[1]);

    auto src_strides = inBlob->getTensorDesc().getBlockingDesc().getStrides();
    auto dst_strides = outBlob->getTensorDesc().getBlockingDesc().getStrides();
    auto origSrcW = src_strides[2];
    auto origSrcH = src_strides[1] / src_strides[2];
    auto origDstW = dst_strides[2];
    auto origDstH = dst_strides[1] / dst_strides[2];
    const int src_go_x = 0;
    const int src_go_y = 0;
    const int dst_go_x = 0;
    const int dst_go_y = 0;

    auto src_full_width = static_cast<const int>(srcDims[3]);
    auto src_full_height = static_cast<const int>(srcDims[2]);
    auto dst_full_width = static_cast<const int>(dstDims[3]);
    auto dst_full_height = static_cast<const int>(dstDims[2]);
    auto sptr = static_cast<uint8_t*>(inBlob->buffer()) + inBlob->getTensorDesc().getBlockingDesc().getOffsetPadding();
    auto dptr = static_cast<uint8_t*>(outBlob->buffer()) + outBlob->getTensorDesc().getBlockingDesc().getOffsetPadding();
    auto sstep = static_cast<const int>(inBlob->getTensorDesc().getBlockingDesc().getStrides()[2]);
    auto dstep = static_cast<const int>(outBlob->getTensorDesc().getBlockingDesc().getStrides()[2]);

    float scale_x = static_cast<float>(src_full_width) / dst_full_width;
    float scale_y = static_cast<float>(src_full_height) / dst_full_height;
    int x_max_count = getResizeAreaTabSize(dst_go_x, src_full_width, dwidth, scale_x);
    int y_max_count = getResizeAreaTabSize(dst_go_y, src_full_height, dheight, scale_y);

    auto* xsi = reinterpret_cast<uint16_t*>(buffer);
    auto* ysi = xsi + dwidth;
    auto* xalpha = ysi + dheight;
    auto* yalpha = xalpha + dwidth*x_max_count + 8*16;

    computeResizeAreaTab(src_go_x, dst_go_x, src_full_width, dwidth, scale_x, xsi, xalpha, x_max_count);
    computeResizeAreaTab(src_go_y, dst_go_y, src_full_height, dheight, scale_y, ysi, yalpha, y_max_count);
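
    // Per-tap weight streams (alpha0..alpha3) and _mm_shuffle_epi8 index
    // tables (sxid0..sxid3) are generated from xalpha/xsi; the sxid tables
    // let the specialized kernels below gather tap i for 8 consecutive
    // outputs from the vert_sum line with byte shuffles instead of scalar
    // loads.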
    int vert_sum_size = 2*swidth;
    uint16_t* vert_sum = yalpha + dheight*y_max_count;
    uint16_t* alpha0 = vert_sum + vert_sum_size;
    uint16_t* alpha1 = alpha0 + dwidth;
    uint16_t* alpha2 = alpha1 + dwidth;
    uint16_t* alpha3 = alpha2 + dwidth;
    uint16_t* sxid0 = alpha3 + dwidth;
    uint16_t* sxid1 = sxid0 + 4*dwidth;
    uint16_t* sxid2 = sxid1 + 4*dwidth;
    uint16_t* sxid3 = sxid2 + 4*dwidth;

    uint16_t* alpha[] = {alpha0, alpha1, alpha2, alpha3};
    uint16_t* sxid[] = {sxid0, sxid1, sxid2, sxid3};
    generate_alpha_and_id_arrays(x_max_count, dwidth, xalpha, xsi, alpha, sxid);
    auto full_pass = [&](int c, int y) {
        uint8_t* pdst_row = dptr + (y * dstep) + c * origDstW * origDstH;
        uint16_t* vert_sum_ = vert_sum;

        int ysi_row = ysi[y];

        memset(vert_sum_, 0, swidth * sizeof(uint16_t));

        for (int dy = 0; dy < y_max_count; dy++) {
            uint16_t yalpha_dy = yalpha[y * y_max_count + dy];
            const uint8_t *sptr_dy = sptr + ((ysi_row + dy) * sstep) + c * origSrcW * origSrcH;
            if (ysi_row + dy >= sheight) break;

            int x = 0;

            __m128i yalpha_dy_sse = _mm_set1_epi16(yalpha_dy);
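
            // Accumulate this source row into vert_sum in Q8.8: unpacking
            // the bytes into the high half of each 16-bit lane multiplies
            // them by 256, and _mm_mulhi_epu16 then applies the Q0.16 row
            // weight (the scalar tail does the same with `<< 8` + mulq16).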
            for (; x <= swidth - 16; x += 16) {
                __m128i sval = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sptr_dy + x));

                __m128i sval_Q16_lo = _mm_unpacklo_epi8(_mm_setzero_si128(), sval);
                __m128i sval_Q16_hi = _mm_unpackhi_epi8(_mm_setzero_si128(), sval);

                __m128i vert_sum_lo = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + x + 0));
                __m128i vert_sum_hi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + x + 8));

                vert_sum_lo = _mm_add_epi16(vert_sum_lo, _mm_mulhi_epu16(yalpha_dy_sse, sval_Q16_lo));
                vert_sum_hi = _mm_add_epi16(vert_sum_hi, _mm_mulhi_epu16(yalpha_dy_sse, sval_Q16_hi));

                _mm_storeu_si128(reinterpret_cast<__m128i*>(vert_sum_ + x + 0), vert_sum_lo);
                _mm_storeu_si128(reinterpret_cast<__m128i*>(vert_sum_ + x + 8), vert_sum_hi);
            }

            for (; x < swidth; x++) {
                vert_sum_[x] += mulq16(yalpha_dy, static_cast<uint16_t>(sptr_dy[x] << 8));
            }
        }
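
        // Horizontal reduction, specialized by tap count. The 2/3/4-tap
        // kernels load a window of vert_sum and use the precomputed sxid
        // shuffle tables to gather each tap for 8 outputs at once;
        // 1 << (8 - 1) is the rounding term for the final >> 8 back to u8.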
        if (x_max_count == 2) {
            int x = 0;
            for (; x <= dwidth - 8; x += 8) {
                __m128i res = _mm_set1_epi16(1 << (8 - 1));

                int id0 = xsi[x];

                __m128i chunk0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0));
                __m128i chunk1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 8));

                __m128i sx0_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 2));
                __m128i sx0_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 2 + 8));

                __m128i sx1_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 2));
                __m128i sx1_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 2 + 8));

                __m128i vert_sum0 = _mm_or_si128(_mm_shuffle_epi8(chunk0, sx0_id0),
                                                 _mm_shuffle_epi8(chunk1, sx0_id1));
                __m128i vert_sum1 = _mm_or_si128(_mm_shuffle_epi8(chunk0, sx1_id0),
                                                 _mm_shuffle_epi8(chunk1, sx1_id1));

                res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha0 + x)), vert_sum0));
                res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha1 + x)), vert_sum1));

                res = _mm_srli_epi16(res, 8);
                res = _mm_packus_epi16(res, res);
                _mm_storel_epi64(reinterpret_cast<__m128i*>(pdst_row + x), res);
            }

            for (; x < dwidth; x++) {
                uint16_t res = 1 << (8 - 1);
                int id = xsi[x];
                res += mulq16(alpha0[x], vert_sum_[id + 0]);
                res += mulq16(alpha1[x], vert_sum_[id + 1]);
                pdst_row[x] = saturateU32toU8(res >> 8);
            }
        } else if (x_max_count == 3) {
            int x = 0;
            for (; x <= dwidth - 8; x += 8) {
                __m128i res = _mm_set1_epi16(1 << (8 - 1));

                int id0 = xsi[x];

                __m128i chunk0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0));
                __m128i chunk1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 8));
                __m128i chunk2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 16));

                __m128i sx0_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 3));
                __m128i sx0_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 3 + 8));
                __m128i sx0_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 3 + 16));

                __m128i sx1_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 3));
                __m128i sx1_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 3 + 8));
                __m128i sx1_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 3 + 16));

                __m128i sx2_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 3));
                __m128i sx2_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 3 + 8));
                __m128i sx2_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 3 + 16));

                __m128i vert_sum0 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx0_id0),
                                                              _mm_shuffle_epi8(chunk1, sx0_id1)),
                                                 _mm_shuffle_epi8(chunk2, sx0_id2));
                __m128i vert_sum1 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx1_id0),
                                                              _mm_shuffle_epi8(chunk1, sx1_id1)),
                                                 _mm_shuffle_epi8(chunk2, sx1_id2));
                __m128i vert_sum2 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx2_id0),
                                                              _mm_shuffle_epi8(chunk1, sx2_id1)),
                                                 _mm_shuffle_epi8(chunk2, sx2_id2));

                res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha0 + x)), vert_sum0));
                res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha1 + x)), vert_sum1));
                res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha2 + x)), vert_sum2));

                res = _mm_srli_epi16(res, 8);
                res = _mm_packus_epi16(res, res);
                _mm_storel_epi64(reinterpret_cast<__m128i*>(pdst_row + x), res);
            }

            for (; x < dwidth; x++) {
                uint16_t res = 1 << (8 - 1);
                int id = xsi[x];
                res += mulq16(alpha0[x], vert_sum_[id + 0]);
                res += mulq16(alpha1[x], vert_sum_[id + 1]);
                res += mulq16(alpha2[x], vert_sum_[id + 2]);
                pdst_row[x] = saturateU32toU8(res >> 8);
            }
        } else if (x_max_count == 4) {
            int x = 0;
            for (; x <= dwidth - 8; x += 8) {
                __m128i res = _mm_set1_epi16(1 << (8 - 1));

                int id0 = xsi[x];

                __m128i chunk0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0));
                __m128i chunk1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 8));
                __m128i chunk2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 16));
                __m128i chunk3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 24));

                __m128i sx0_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 4));
                __m128i sx0_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 4 + 8));
                __m128i sx0_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 4 + 16));
                __m128i sx0_id3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 4 + 24));

                __m128i sx1_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 4));
                __m128i sx1_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 4 + 8));
                __m128i sx1_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 4 + 16));
                __m128i sx1_id3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 4 + 24));

                __m128i sx2_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 4));
                __m128i sx2_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 4 + 8));
                __m128i sx2_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 4 + 16));
                __m128i sx2_id3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 4 + 24));

                __m128i sx3_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid3 + x * 4));
                __m128i sx3_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid3 + x * 4 + 8));
                __m128i sx3_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid3 + x * 4 + 16));
                __m128i sx3_id3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid3 + x * 4 + 24));

                __m128i vert_sum0 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx0_id0),
                                                              _mm_shuffle_epi8(chunk1, sx0_id1)),
                                                 _mm_or_si128(_mm_shuffle_epi8(chunk2, sx0_id2),
                                                              _mm_shuffle_epi8(chunk3, sx0_id3)));
                __m128i vert_sum1 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx1_id0),
                                                              _mm_shuffle_epi8(chunk1, sx1_id1)),
                                                 _mm_or_si128(_mm_shuffle_epi8(chunk2, sx1_id2),
                                                              _mm_shuffle_epi8(chunk3, sx1_id3)));
                __m128i vert_sum2 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx2_id0),
                                                              _mm_shuffle_epi8(chunk1, sx2_id1)),
                                                 _mm_or_si128(_mm_shuffle_epi8(chunk2, sx2_id2),
                                                              _mm_shuffle_epi8(chunk3, sx2_id3)));
                __m128i vert_sum3 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx3_id0),
                                                              _mm_shuffle_epi8(chunk1, sx3_id1)),
                                                 _mm_or_si128(_mm_shuffle_epi8(chunk2, sx3_id2),
                                                              _mm_shuffle_epi8(chunk3, sx3_id3)));

                res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha0 + x)), vert_sum0));
                res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha1 + x)), vert_sum1));
                res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha2 + x)), vert_sum2));
                res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha3 + x)), vert_sum3));

                res = _mm_srli_epi16(res, 8);
                res = _mm_packus_epi16(res, res);
                _mm_storel_epi64(reinterpret_cast<__m128i*>(pdst_row + x), res);
            }

            for (; x < dwidth; x++) {
                uint16_t res = 1 << (8 - 1);
                int id = xsi[x];
                res += mulq16(alpha0[x], vert_sum_[id + 0]);
                res += mulq16(alpha1[x], vert_sum_[id + 1]);
                res += mulq16(alpha2[x], vert_sum_[id + 2]);
                res += mulq16(alpha3[x], vert_sum_[id + 3]);
                pdst_row[x] = saturateU32toU8(res >> 8);
            }
        } else if (x_max_count <= 7) {
            int x = 0;
            for (; x <= dwidth - 8; x += 8) {
                __m128i res = _mm_set1_epi16(1 << (16 - 8 - 1));

                for (int i = 0; i < x_max_count; i++) {
                    __m128i valpha = _mm_setr_epi16(xalpha[x * x_max_count + x_max_count * 0 + i],
                                                    xalpha[x * x_max_count + x_max_count * 1 + i],
                                                    xalpha[x * x_max_count + x_max_count * 2 + i],
                                                    xalpha[x * x_max_count + x_max_count * 3 + i],
                                                    xalpha[x * x_max_count + x_max_count * 4 + i],
                                                    xalpha[x * x_max_count + x_max_count * 5 + i],
                                                    xalpha[x * x_max_count + x_max_count * 6 + i],
                                                    xalpha[x * x_max_count + x_max_count * 7 + i]);
                    __m128i vvert_sum = _mm_setr_epi16(vert_sum_[xsi[x + 0] + i],
                                                       vert_sum_[xsi[x + 1] + i],
                                                       vert_sum_[xsi[x + 2] + i],
                                                       vert_sum_[xsi[x + 3] + i],
                                                       vert_sum_[xsi[x + 4] + i],
                                                       vert_sum_[xsi[x + 5] + i],
                                                       vert_sum_[xsi[x + 6] + i],
                                                       vert_sum_[xsi[x + 7] + i]);

                    res = _mm_add_epi16(res, _mm_mulhi_epu16(valpha, vvert_sum));
                }

                res = _mm_srli_epi16(res, 8);
                res = _mm_packus_epi16(res, res);
                _mm_storel_epi64(reinterpret_cast<__m128i*>(pdst_row + x), res);
            }

            for (; x < dwidth; x++) {
                uint16_t res = 1 << (8 - 1);
                for (int i = 0; i < x_max_count; i++) {
                    uint16_t a = xalpha[x * x_max_count + i];
                    int sx = xsi[x] + i;

                    res += mulq16(a, vert_sum_[sx]);
                }
                pdst_row[x] = saturateU32toU8(res >> 8);
            }
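
        // Wide kernels (8+ taps): per output, 8 taps at a time are multiplied
        // and summed into vres, whose lanes are then collapsed with a
        // shift/add tree so lane 7 holds the total; leftover taps are folded
        // in scalar code.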
        } else {
            for (int x = 0; x < dwidth; x++) {
                uint16_t res = 1 << (8 - 1);
                __m128i vres = _mm_setzero_si128();

                int id = xsi[x];

                int i = 0;
                for (; i <= x_max_count - 8; i += 8) {
                    __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(xalpha + x * x_max_count + i));
                    __m128i s = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id + i));

                    vres = _mm_add_epi16(vres, _mm_mulhi_epu16(a, s));
                }

                vres = _mm_add_epi16(vres, _mm_slli_si128(vres, 2));
                vres = _mm_add_epi16(vres, _mm_slli_si128(vres, 4));
                vres = _mm_add_epi16(vres, _mm_slli_si128(vres, 8));
                res += static_cast<uint16_t>(_mm_extract_epi16(vres, 7));

                for (; i < x_max_count; i++) {
                    uint16_t a = xalpha[x * x_max_count + i];
                    uint16_t s = vert_sum_[id + i];

                    res += mulq16(a, s);
                }

                pdst_row[x] = saturateU32toU8(res >> 8);
            }
        }
    };
    for (int c = 0; c < channels; c++) {
        for (int y = 0; y < dheight; y++) {
            full_pass(c, y);
        }
    }
}

}  // namespace Resize
}  // namespace InferenceEngine