// dldt: inference-engine/src/inference_engine/cpu_x86_sse42/ie_preprocess_data_sse42.cpp
// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "ie_preprocess_data.hpp"
#include "ie_preprocess_data_sse42.hpp"

#include <nmmintrin.h>  // SSE 4.2

#include <stdint.h>
#include <string.h>     // memset
#include <algorithm>    // std::max

namespace InferenceEngine {
namespace Resize {

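// Branchless ceil()/floor() built on SSE2 scalar-double conversions:
// _mm_cvtsd_si32 rounds to nearest under the default MXCSR rounding mode, and
// the sign bit of an ordered compare (via _mm_movemask_pd) nudges the result
// by one when the rounded value landed on the wrong side of the input.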
static inline int ceil(double value) {
    __m128d t = _mm_set_sd(value);
    int i = _mm_cvtsd_si32(t);
    return i + _mm_movemask_pd(_mm_cmplt_sd(_mm_cvtsi32_sd(t, i), t));
}

static inline int floor(double value) {
    __m128d t = _mm_set_sd(value);
    int i = _mm_cvtsd_si32(t);
    return i - _mm_movemask_pd(_mm_cmplt_sd(t, _mm_cvtsi32_sd(t, i)));
}

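// Scalar reference for _mm_mulhrs_epi16: Q15 multiply with rounding,
// (a * b + 2^14) >> 15.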
static inline int16_t mulq15(int16_t a, int16_t b) {
    return static_cast<int16_t>(((1 << 14) + (int32_t)a * (int32_t)b) >> 15);
}

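// Scalar reference for _mm_mulhi_epu16: unsigned Q16 multiply, truncating.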
static inline uint16_t mulq16(uint16_t a, uint16_t b) {
    return static_cast<uint16_t>(((uint32_t)a * (uint32_t)b) >> 16);
}

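// Separable bilinear resize for U8 planar (NCHW) blobs. Interpolation weights
// are kept in Q15 fixed point so the inner loops can use _mm_mulhrs_epi16; the
// caller supplies a scratch buffer that is carved below into coefficient
// tables plus an intermediate row buffer.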
void resize_bilinear_u8(const Blob::Ptr inBlob, Blob::Ptr outBlob, uint8_t* buffer) {
    Border border = {BORDER_REPLICATE, 0};

    auto dstDims = outBlob->getTensorDesc().getDims();
    auto srcDims = inBlob->getTensorDesc().getDims();

    auto dwidth = static_cast<const int>(dstDims[3]);
    auto dheight = static_cast<const int>(dstDims[2]);
    auto swidth = static_cast<const int>(srcDims[3]);
    auto channels = static_cast<const int>(srcDims[1]);

    auto src_strides = inBlob->getTensorDesc().getBlockingDesc().getStrides();
    auto dst_strides = outBlob->getTensorDesc().getBlockingDesc().getStrides();
    auto origSrcW = src_strides[2];
    auto origSrcH = src_strides[1] / src_strides[2];
    auto origDstW = dst_strides[2];
    auto origDstH = dst_strides[1] / dst_strides[2];

    const int src_go_x = 0;
    const int src_go_y = 0;
    const int dst_go_x = 0;
    const int dst_go_y = 0;
    auto src_full_width = static_cast<const int>(srcDims[3]);
    auto src_full_height = static_cast<const int>(srcDims[2]);
    auto dst_full_width = static_cast<const int>(dstDims[3]);
    auto dst_full_height = static_cast<const int>(dstDims[2]);

    const uint8_t *sptr = static_cast<uint8_t *>(inBlob->buffer()) +
                          inBlob->getTensorDesc().getBlockingDesc().getOffsetPadding();
    uint8_t *dptr = static_cast<uint8_t *>(outBlob->buffer()) +
                    outBlob->getTensorDesc().getBlockingDesc().getOffsetPadding();

    auto sstep = static_cast<const int>(inBlob->getTensorDesc().getBlockingDesc().getStrides()[2]);
    auto dstep = static_cast<const int>(outBlob->getTensorDesc().getBlockingDesc().getStrides()[2]);
    auto scale_x = static_cast<float>(src_full_width) / dst_full_width;
    auto scale_y = static_cast<float>(src_full_height) / dst_full_height;

    const int BITS = 15;
    const int SCALE = (1 << BITS);
    const int alpha_clones_num = 4;
    const int cols_block_size = 8;
    const int kRowsBlockSize = 4;

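    // Carve the scratch buffer: per-output-column source offsets (pre-scaled
    // by kRowsBlockSize to index the interleaved row buffer), Q15 x-weights
    // replicated alpha_clones_num times (so one 16-byte load yields weights
    // for two output pixels across four rows), per-output-row source offsets
    // (pre-scaled by sstep), Q15 y-weights, and the row buffer tptr itself.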
    auto *pxofs1 = reinterpret_cast<int32_t *>(buffer);
    auto *alpha = reinterpret_cast<int16_t *>(pxofs1 + dwidth);
    auto *yofs = reinterpret_cast<int32_t *>(alpha + dwidth * alpha_clones_num);
    auto *beta = reinterpret_cast<int16_t *>(yofs + dheight);
    auto *tptr = reinterpret_cast<uint8_t *>(beta + dheight);

    auto tptr_ = tptr;

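    // Seed the guard bytes of the row buffer with the border value: four
    // bytes in front, plus four past the end of both the one-byte-per-column
    // (scalar) and four-bytes-per-column (interleaved) layouts, so edge
    // offsets read a defined value instead of stale data.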
    tptr_[0] = (uint8_t) border.value;
    tptr_[1] = (uint8_t) border.value;
    tptr_[2] = (uint8_t) border.value;
    tptr_[3] = (uint8_t) border.value;
    tptr_[swidth + 0 + 4] = (uint8_t) border.value;
    tptr_[swidth + 1 + 4] = (uint8_t) border.value;
    tptr_[swidth + 2 + 4] = (uint8_t) border.value;
    tptr_[swidth + 3 + 4] = (uint8_t) border.value;
    tptr_[swidth * kRowsBlockSize + 0 + 4] = (uint8_t) border.value;
    tptr_[swidth * kRowsBlockSize + 1 + 4] = (uint8_t) border.value;
    tptr_[swidth * kRowsBlockSize + 2 + 4] = (uint8_t) border.value;
    tptr_[swidth * kRowsBlockSize + 3 + 4] = (uint8_t) border.value;

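    // Horizontal coefficients: for each output column compute the left source
    // neighbour and its Q15 blend weight using half-pixel centre alignment;
    // offsets and weights are clamped at the edges for BORDER_REPLICATE.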
    for (int dx = dst_go_x; dx < dst_go_x + dwidth; dx++) {
        auto fx = static_cast<float>((dx + 0.5) * scale_x - 0.5);
        int32_t sx = floor(fx);
        fx -= sx;

        int32_t sx0 = sx;
        if (sx < 0 && border.type == BORDER_REPLICATE) {
            fx = 0;
            sx0 = 0;
        }

        fx = fx * SCALE;

        if (sx >= src_full_width - 1 && border.type == BORDER_REPLICATE) {
            fx = 1.f * SCALE - 1;
            sx0 = (std::max)(src_full_width - 2, 0);
        }

        pxofs1[dx - dst_go_x] = kRowsBlockSize * (sx0 - src_go_x);
        for (int i = 0; i < alpha_clones_num; i++) {
            alpha[(dx - dst_go_x) * alpha_clones_num + i] = (int16_t) fx;
        }
    }

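    // Vertical coefficients: same scheme per output row; yofs holds the top
    // source row offset in elements, beta the Q15 blend weight.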
    for (int dy = dst_go_y; dy < dst_go_y + dheight; dy++) {
        float fy = static_cast<float>((dy + 0.5) * scale_y - 0.5);
        int32_t sy = floor(fy);
        fy -= sy;

        int32_t sy0 = sy;
        if (sy < 0 && border.type == BORDER_REPLICATE) {
            fy = 0;
            sy0 = 0;
        }

        fy = fy * SCALE;

        if (sy >= src_full_height - 1 && border.type == BORDER_REPLICATE) {
            fy = 1.f * SCALE - 1;
            sy0 = (std::max)(src_full_height - 2, 0);
        }

        yofs[dy - dst_go_y] = (sy0 - src_go_y) * sstep;
        beta[dy - dst_go_y] = (int16_t) fy;
    }

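    // Images too small for the 8-column / 4-row vector blocks take a plain
    // scalar path: vertical Q15 lerp into tptr, then horizontal lerp into the
    // destination row.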
    if (swidth < cols_block_size || dwidth < cols_block_size || dheight < kRowsBlockSize) {
        auto full_pass = [&](int c, int y) {
            auto sptr_ = sptr + c * origSrcW * origSrcH;
            auto dptr_ = dptr + c * origDstW * origDstH;
            auto tptr_ = tptr;

            for (int x = 0; x < swidth; x++) {
                int val0 = (yofs[y] < 0) ? border.value : sptr_[yofs[y] + x + 0];
                int val1 = (yofs[y] / sstep + 1 >= src_full_height - src_go_y) ? border.value
                                                                               : sptr_[yofs[y] + x + sstep];

                int res = val0 + mulq15(beta[y], (int16_t) (val1 - val0));
                tptr_[x + 4] = (uint8_t) res;
            }

            for (int x = 0; x < dwidth; x++) {
                int val0 = tptr_[pxofs1[x] / kRowsBlockSize + 0 + 4];
                int val1 = tptr_[pxofs1[x] / kRowsBlockSize + 1 + 4];

                int res = val0 + mulq15(alpha[x * alpha_clones_num], (int16_t) (val1 - val0));
                dptr_[y * dstep + x] = (uint8_t) res;
            }
        };

        for (int c = 0; c < channels; c++) {
            for (int y = 0; y < dheight; y++) {
                full_pass(c, y);
            }
        }

        return;
    }

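    // Vectorized pass over kRowsBlockSize destination rows at a time. The
    // vertical lerp writes a column-interleaved row buffer (four bytes per
    // source column, one per row), so the horizontal pass can fetch both
    // neighbours for all four rows with a single 8-byte load. Rows whose
    // lower neighbour would fall outside the image are redirected to offset 0
    // to keep loads in bounds; BORDER_CONSTANT results are patched afterwards.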
    auto full_pass_vec = [&](const uint8_t* sptr_, uint8_t* dptr_, uint8_t* tptr_, int y) {
        int32_t filtered_rows_id[4];
        for (int i = 0; i < 4; i++) {
            filtered_rows_id[i] = (yofs[y + i] < 0) ? 0 :
                                  (yofs[y + i] / sstep >= src_full_height - src_go_y - 1) ? 0 : yofs[y + i];
        }

        __m128i b0 = _mm_set1_epi16(beta[y + 0]);
        __m128i b1 = _mm_set1_epi16(beta[y + 1]);
        __m128i b2 = _mm_set1_epi16(beta[y + 2]);
        __m128i b3 = _mm_set1_epi16(beta[y + 3]);

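        // Vertical pass, 8 source columns per iteration: widen u8 to u16,
        // Q15-lerp the four row pairs with _mm_mulhrs_epi16, pack back to u8
        // and transpose into the column-interleaved layout of tptr_.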
        int x = 0;
        vertical_pass:
        for (; x <= swidth - cols_block_size; x += cols_block_size) {
            __m128i val0lo = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(sptr_ + x + filtered_rows_id[0])),
                                              *(reinterpret_cast<const int64_t *>(sptr_ + x + filtered_rows_id[1])), 1);
            __m128i val0hi = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(sptr_ + x + filtered_rows_id[2])),
                                              *(reinterpret_cast<const int64_t *>(sptr_ + x + filtered_rows_id[3])), 1);
            __m128i val1lo = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(sptr_ + x + filtered_rows_id[0] + sstep)),
                                              *(reinterpret_cast<const int64_t *>(sptr_ + x + filtered_rows_id[1] + sstep)), 1);
            __m128i val1hi = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(sptr_ + x + filtered_rows_id[2] + sstep)),
                                              *(reinterpret_cast<const int64_t *>(sptr_ + x + filtered_rows_id[3] + sstep)), 1);

            __m128i val0_0 = _mm_unpacklo_epi8(val0lo, _mm_setzero_si128());
            __m128i val0_1 = _mm_unpackhi_epi8(val0lo, _mm_setzero_si128());
            __m128i val0_2 = _mm_unpacklo_epi8(val0hi, _mm_setzero_si128());
            __m128i val0_3 = _mm_unpackhi_epi8(val0hi, _mm_setzero_si128());

            __m128i val1_0 = _mm_unpacklo_epi8(val1lo, _mm_setzero_si128());
            __m128i val1_1 = _mm_unpackhi_epi8(val1lo, _mm_setzero_si128());
            __m128i val1_2 = _mm_unpacklo_epi8(val1hi, _mm_setzero_si128());
            __m128i val1_3 = _mm_unpackhi_epi8(val1hi, _mm_setzero_si128());

            __m128i s0_0 = _mm_sub_epi16(val1_0, val0_0);
            __m128i s0_1 = _mm_sub_epi16(val1_1, val0_1);
            __m128i s0_2 = _mm_sub_epi16(val1_2, val0_2);
            __m128i s0_3 = _mm_sub_epi16(val1_3, val0_3);

            __m128i t0 = _mm_mulhrs_epi16(s0_0, b0);
            __m128i t1 = _mm_mulhrs_epi16(s0_1, b1);
            __m128i t2 = _mm_mulhrs_epi16(s0_2, b2);
            __m128i t3 = _mm_mulhrs_epi16(s0_3, b3);

            __m128i r0 = _mm_add_epi16(val0_0, t0);
            __m128i r1 = _mm_add_epi16(val0_1, t1);
            __m128i r2 = _mm_add_epi16(val0_2, t2);
            __m128i r3 = _mm_add_epi16(val0_3, t3);

            __m128i q0 = _mm_packus_epi16(r0, r1);
            __m128i q1 = _mm_packus_epi16(r2, r3);

            __m128i q2 = _mm_blend_epi16(q0, _mm_slli_si128(q1, 4), 0xCC /*0b11001100*/);
            __m128i q3 = _mm_blend_epi16(_mm_srli_si128(q0, 4), q1, 0xCC /*0b11001100*/);

            __m128i q4 = _mm_shuffle_epi8(q2, _mm_setr_epi8(0, 8, 4, 12, 1, 9, 5, 13, 2, 10, 6, 14, 3, 11, 7, 15));
            __m128i q5 = _mm_shuffle_epi8(q3, _mm_setr_epi8(0, 8, 4, 12, 1, 9, 5, 13, 2, 10, 6, 14, 3, 11, 7, 15));

            _mm_storeu_si128(reinterpret_cast<__m128i *>(tptr_ + (x + 0) * kRowsBlockSize + 4), q4);
            _mm_storeu_si128(reinterpret_cast<__m128i *>(tptr_ + (x + 4) * kRowsBlockSize + 4), q5);
        }

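        // Ragged tail: instead of a scalar epilogue, rewind to the last full
        // 8-column block and run it once more; the overlapping stores rewrite
        // values that are already correct. The guard at the top of the
        // function ensures swidth >= cols_block_size here. The horizontal
        // pass below uses the same trick.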
        if (x < swidth) {
            x = swidth - cols_block_size;
            goto vertical_pass;
        }

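        // For BORDER_CONSTANT, redo in scalar code the rows whose vertical
        // neighbour was out of bounds, blending against the border value.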
        if (border.type == BORDER_CONSTANT) {
            for (int i = 0; i < kRowsBlockSize; i++) {
                if (yofs[y + i] < 0) {
                    for (x = 0; x < swidth; x++) {
                        int val0 = border.value;
                        int val1 = sptr_[yofs[y + i] + x + sstep];

                        int res = val0 + mulq15(beta[y + i], (int16_t) (val1 - val0));
                        tptr_[x * 4 + i + 4] = (uint8_t) res;
                    }
                }

                if (yofs[y + i] / sstep >= src_full_height - src_go_y - 1) {
                    for (x = 0; x < swidth; x++) {
                        int val0 = sptr_[yofs[y + i] + x];
                        int val1 = border.value;

                        int res = val0 + mulq15(beta[y + i], (int16_t) (val1 - val0));
                        tptr_[x * 4 + i + 4] = (uint8_t) res;
                    }
                }
            }
        }

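        // Horizontal pass, 8 output columns per iteration: each 8-byte load
        // from tptr_ carries both source neighbours for all four rows; split
        // them apart, Q15-lerp with the replicated alpha weights, then
        // transpose back to row-major and store 8 pixels into each of the
        // four destination rows.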
        x = 0;
        horizontal_pass:
        for (; x <= dwidth - cols_block_size; x += cols_block_size) {
            __m128i a10 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(alpha + (x + 0) * alpha_clones_num));
            __m128i a32 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(alpha + (x + 2) * alpha_clones_num));
            __m128i a54 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(alpha + (x + 4) * alpha_clones_num));
            __m128i a76 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(alpha + (x + 6) * alpha_clones_num));

            __m128i val_0 = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(tptr_ + pxofs1[x + 0] + 4)),
                                             *(reinterpret_cast<const int64_t *>(tptr_ + pxofs1[x + 1] + 4)), 1);
            __m128i val_1 = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(tptr_ + pxofs1[x + 2] + 4)),
                                             *(reinterpret_cast<const int64_t *>(tptr_ + pxofs1[x + 3] + 4)), 1);
            __m128i val_2 = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(tptr_ + pxofs1[x + 4] + 4)),
                                             *(reinterpret_cast<const int64_t *>(tptr_ + pxofs1[x + 5] + 4)), 1);
            __m128i val_3 = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(tptr_ + pxofs1[x + 6] + 4)),
                                             *(reinterpret_cast<const int64_t *>(tptr_ + pxofs1[x + 7] + 4)), 1);

            val_0 = _mm_shuffle_epi32(val_0, _MM_SHUFFLE(3, 1, 2, 0));
            val_1 = _mm_shuffle_epi32(val_1, _MM_SHUFFLE(3, 1, 2, 0));
            val_2 = _mm_shuffle_epi32(val_2, _MM_SHUFFLE(3, 1, 2, 0));
            val_3 = _mm_shuffle_epi32(val_3, _MM_SHUFFLE(3, 1, 2, 0));

            __m128i val0_0 = _mm_unpacklo_epi8(val_0, _mm_setzero_si128());
            __m128i val0_1 = _mm_unpacklo_epi8(val_1, _mm_setzero_si128());
            __m128i val0_2 = _mm_unpacklo_epi8(val_2, _mm_setzero_si128());
            __m128i val0_3 = _mm_unpacklo_epi8(val_3, _mm_setzero_si128());

            __m128i val1_0 = _mm_unpackhi_epi8(val_0, _mm_setzero_si128());
            __m128i val1_1 = _mm_unpackhi_epi8(val_1, _mm_setzero_si128());
            __m128i val1_2 = _mm_unpackhi_epi8(val_2, _mm_setzero_si128());
            __m128i val1_3 = _mm_unpackhi_epi8(val_3, _mm_setzero_si128());

            val1_0 = _mm_sub_epi16(val1_0, val0_0);
            val1_1 = _mm_sub_epi16(val1_1, val0_1);
            val1_2 = _mm_sub_epi16(val1_2, val0_2);
            val1_3 = _mm_sub_epi16(val1_3, val0_3);

            __m128i t0 = _mm_mulhrs_epi16(val1_0, a10);
            __m128i t1 = _mm_mulhrs_epi16(val1_1, a32);
            __m128i t2 = _mm_mulhrs_epi16(val1_2, a54);
            __m128i t3 = _mm_mulhrs_epi16(val1_3, a76);

            __m128i r0 = _mm_add_epi16(val0_0, t0);
            __m128i r1 = _mm_add_epi16(val0_1, t1);
            __m128i r2 = _mm_add_epi16(val0_2, t2);
            __m128i r3 = _mm_add_epi16(val0_3, t3);

            __m128i q0 = _mm_packus_epi16(r0, r1);
            __m128i q1 = _mm_packus_epi16(r2, r3);

            __m128i q2 = _mm_shuffle_epi8(q0, _mm_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15));
            __m128i q3 = _mm_shuffle_epi8(q1, _mm_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15));

            __m128i q4 = _mm_blend_epi16(q2, _mm_slli_si128(q3, 4), 0xCC /*0b11001100*/);
            __m128i q5 = _mm_blend_epi16(_mm_srli_si128(q2, 4), q3, 0xCC /*0b11001100*/);

            _mm_storel_epi64(reinterpret_cast<__m128i *>(dptr_ + (y + 0) * dstep + x), q4);
            _mm_storel_epi64(reinterpret_cast<__m128i *>(dptr_ + (y + 1) * dstep + x), _mm_srli_si128(q4, 8));
            _mm_storel_epi64(reinterpret_cast<__m128i *>(dptr_ + (y + 2) * dstep + x), q5);
            _mm_storel_epi64(reinterpret_cast<__m128i *>(dptr_ + (y + 3) * dstep + x), _mm_srli_si128(q5, 8));
        }

        if (x < dwidth) {
            x = dwidth - cols_block_size;
            goto horizontal_pass;
        }
    };

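    // Each channel is processed in blocks of kRowsBlockSize rows; a trailing
    // partial block is handled by re-running the pass on the last full block
    // (the overlap is harmless, and dheight >= kRowsBlockSize is guaranteed).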
    for (int c = 0; c < channels; c++) {
        for (int y = 0; y <= dheight - kRowsBlockSize; y += kRowsBlockSize) {
            auto sptr_ = sptr + c * origSrcW * origSrcH;
            auto dptr_ = dptr + c * origDstW * origDstH;
            auto tptr_ = tptr;

            full_pass_vec(sptr_, dptr_, tptr_, y);

            if (y + kRowsBlockSize > dheight - kRowsBlockSize)
                full_pass_vec(sptr_, dptr_, tptr_, dheight - kRowsBlockSize);
        }
    }
}

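// Area (pixel-averaging) downscale for U8 planar (NCHW) blobs. Per-axis tap
// index and Q16 weight tables come from getResizeAreaTabSize() /
// computeResizeAreaTab(); x_max_count / y_max_count bound the number of
// source taps per output pixel along each axis. Each output row is produced
// in two stages: a weighted vertical accumulation into vert_sum, then a
// weighted horizontal reduction of vert_sum.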
void resize_area_u8_downscale(const Blob::Ptr inBlob, Blob::Ptr outBlob, uint8_t* buffer) {
    auto dstDims = outBlob->getTensorDesc().getDims();
    auto srcDims = inBlob->getTensorDesc().getDims();

    auto dwidth = static_cast<const int>(dstDims[3]);
    auto dheight = static_cast<const int>(dstDims[2]);
    auto swidth = static_cast<const int>(srcDims[3]);
    auto sheight = static_cast<const int>(srcDims[2]);
    auto channels = static_cast<const int>(srcDims[1]);

    auto src_strides = inBlob->getTensorDesc().getBlockingDesc().getStrides();
    auto dst_strides = outBlob->getTensorDesc().getBlockingDesc().getStrides();
    auto origSrcW = src_strides[2];
    auto origSrcH = src_strides[1] / src_strides[2];
    auto origDstW = dst_strides[2];
    auto origDstH = dst_strides[1] / dst_strides[2];

    const int src_go_x = 0;
    const int src_go_y = 0;
    const int dst_go_x = 0;
    const int dst_go_y = 0;

    auto src_full_width = static_cast<const int>(srcDims[3]);
    auto src_full_height = static_cast<const int>(srcDims[2]);
    auto dst_full_width = static_cast<const int>(dstDims[3]);
    auto dst_full_height = static_cast<const int>(dstDims[2]);

    auto sptr = static_cast<uint8_t*>(inBlob->buffer()) + inBlob->getTensorDesc().getBlockingDesc().getOffsetPadding();
    auto dptr = static_cast<uint8_t*>(outBlob->buffer()) + outBlob->getTensorDesc().getBlockingDesc().getOffsetPadding();
    auto sstep = static_cast<const int>(inBlob->getTensorDesc().getBlockingDesc().getStrides()[2]);
    auto dstep = static_cast<const int>(outBlob->getTensorDesc().getBlockingDesc().getStrides()[2]);

    float scale_x = static_cast<float>(src_full_width) / dst_full_width;
    float scale_y = static_cast<float>(src_full_height) / dst_full_height;

    int x_max_count = getResizeAreaTabSize(dst_go_x, src_full_width,  dwidth,  scale_x);
    int y_max_count = getResizeAreaTabSize(dst_go_y, src_full_height, dheight, scale_y);

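    // Carve the scratch buffer: tap indices and Q16 weights for both axes,
    // the vertical accumulator row, then per-tap weight arrays
    // (alpha0..alpha3) and pshufb gather masks (sxid0..sxid3), filled by
    // generate_alpha_and_id_arrays() for the specialized 2/3/4-tap kernels.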
    auto* xsi = reinterpret_cast<uint16_t*>(buffer);
    auto* ysi = xsi + dwidth;
    auto* xalpha = ysi + dheight;
    auto* yalpha = xalpha + dwidth*x_max_count + 8*16;

    computeResizeAreaTab(src_go_x, dst_go_x, src_full_width,   dwidth, scale_x, xsi, xalpha, x_max_count);
    computeResizeAreaTab(src_go_y, dst_go_y, src_full_height, dheight, scale_y, ysi, yalpha, y_max_count);

    int vert_sum_size = 2*swidth;
    uint16_t* vert_sum = yalpha + dheight*y_max_count;
    uint16_t* alpha0 = vert_sum + vert_sum_size;
    uint16_t* alpha1 = alpha0 + dwidth;
    uint16_t* alpha2 = alpha1 + dwidth;
    uint16_t* alpha3 = alpha2 + dwidth;
    uint16_t* sxid0 = alpha3 + dwidth;
    uint16_t* sxid1 = sxid0 + 4*dwidth;
    uint16_t* sxid2 = sxid1 + 4*dwidth;
    uint16_t* sxid3 = sxid2 + 4*dwidth;

    uint16_t* alpha[] = {alpha0, alpha1, alpha2, alpha3};
    uint16_t* sxid[] = {sxid0, sxid1, sxid2, sxid3};
    generate_alpha_and_id_arrays(x_max_count, dwidth, xalpha, xsi, alpha, sxid);

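    // Produce one destination row: accumulate the weighted source rows into
    // vert_sum in Q8.8, then reduce horizontally with the x weights and shift
    // back down to u8 with rounding.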
    auto full_pass = [&](int c, int y) {
        uint8_t* pdst_row = dptr + (y * dstep) + c * origDstW * origDstH;
        uint16_t* vert_sum_ = vert_sum;

        int ysi_row = ysi[y];

        memset(vert_sum_, 0, swidth * sizeof(uint16_t));

        for (int dy = 0; dy < y_max_count; dy++) {
            if (ysi_row + dy >= sheight) break;

            uint16_t yalpha_dy = yalpha[y * y_max_count + dy];
            const uint8_t *sptr_dy = sptr + ((ysi_row + dy) * sstep) + c * origSrcW * origSrcH;

            int x = 0;

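            // Vertical accumulation, 16 pixels per iteration: promote u8 to
            // Q8.8 by placing each byte in the high half of a u16 lane, then
            // vert_sum += (yalpha * value) >> 16 via _mm_mulhi_epu16.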
            __m128i yalpha_dy_sse = _mm_set1_epi16(yalpha_dy);
            for (; x <= swidth - 16; x += 16) {
                __m128i sval = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sptr_dy + x));

                // sptr_dy[x] << 8
                __m128i sval_Q16_lo = _mm_unpacklo_epi8(_mm_setzero_si128(), sval);
                __m128i sval_Q16_hi = _mm_unpackhi_epi8(_mm_setzero_si128(), sval);

                __m128i vert_sum_lo = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + x + 0));
                __m128i vert_sum_hi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + x + 8));

                vert_sum_lo = _mm_add_epi16(vert_sum_lo, _mm_mulhi_epu16(yalpha_dy_sse, sval_Q16_lo));
                vert_sum_hi = _mm_add_epi16(vert_sum_hi, _mm_mulhi_epu16(yalpha_dy_sse, sval_Q16_hi));

                _mm_storeu_si128(reinterpret_cast<__m128i*>(vert_sum_ + x + 0), vert_sum_lo);
                _mm_storeu_si128(reinterpret_cast<__m128i*>(vert_sum_ + x + 8), vert_sum_hi);
            }

            for (; x < swidth; x++) {
                vert_sum_[x] += mulq16(yalpha_dy, static_cast<uint16_t>(sptr_dy[x] << 8));
            }
        }

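        // Horizontal reduction. The 2-, 3- and 4-tap cases gather neighbouring
        // vert_sum entries with the precomputed pshufb masks; up to 7 taps
        // uses scalar gathers into SSE registers; wider filters accumulate 8
        // taps per iteration and finish with an in-register horizontal sum.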
        if (x_max_count == 2) {
            int x = 0;
            for (; x <= dwidth - 8; x += 8) {
                __m128i res = _mm_set1_epi16(1 << (8 - 1));

                int id0 = xsi[x];

                __m128i chunk0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0));
                __m128i chunk1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 8));

                __m128i sx0_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 2));
                __m128i sx0_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 2 + 8));

                __m128i sx1_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 2));
                __m128i sx1_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 2 + 8));

                __m128i vert_sum0 = _mm_or_si128(_mm_shuffle_epi8(chunk0, sx0_id0),
                                                 _mm_shuffle_epi8(chunk1, sx0_id1));
                __m128i vert_sum1 = _mm_or_si128(_mm_shuffle_epi8(chunk0, sx1_id0),
                                                 _mm_shuffle_epi8(chunk1, sx1_id1));

                res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha0 + x)), vert_sum0));
                res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha1 + x)), vert_sum1));

                res = _mm_srli_epi16(res, 8);
                res = _mm_packus_epi16(res, res);
                _mm_storel_epi64(reinterpret_cast<__m128i*>(pdst_row + x), res);
            }

            for (; x < dwidth; x++) {
                uint16_t res = 1 << (8 - 1);
                int id = xsi[x];
                res += mulq16(alpha0[x], vert_sum_[id + 0]);
                res += mulq16(alpha1[x], vert_sum_[id + 1]);
                pdst_row[x] = saturateU32toU8(res >> 8);
            }
        } else if (x_max_count == 3) {
            int x = 0;
            for (; x <= dwidth - 8; x += 8) {
                __m128i res = _mm_set1_epi16(1 << (8 - 1));

                int id0 = xsi[x];

                __m128i chunk0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0));
                __m128i chunk1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 8));
                __m128i chunk2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 16));

                __m128i sx0_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 3));
                __m128i sx0_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 3 + 8));
                __m128i sx0_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 3 + 16));

                __m128i sx1_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 3));
                __m128i sx1_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 3 + 8));
                __m128i sx1_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 3 + 16));

                __m128i sx2_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 3));
                __m128i sx2_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 3 + 8));
                __m128i sx2_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 3 + 16));

                __m128i vert_sum0 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx0_id0),
                                                              _mm_shuffle_epi8(chunk1, sx0_id1)),
                                                 _mm_shuffle_epi8(chunk2, sx0_id2));
                __m128i vert_sum1 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx1_id0),
                                                              _mm_shuffle_epi8(chunk1, sx1_id1)),
                                                 _mm_shuffle_epi8(chunk2, sx1_id2));
                __m128i vert_sum2 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx2_id0),
                                                              _mm_shuffle_epi8(chunk1, sx2_id1)),
                                                 _mm_shuffle_epi8(chunk2, sx2_id2));

                res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha0 + x)), vert_sum0));
                res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha1 + x)), vert_sum1));
                res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha2 + x)), vert_sum2));

                res = _mm_srli_epi16(res, 8);
                res = _mm_packus_epi16(res, res);
                _mm_storel_epi64(reinterpret_cast<__m128i*>(pdst_row + x), res);
            }

            for (; x < dwidth; x++) {
                uint16_t res = 1 << (8 - 1);
                int id = xsi[x];
                res += mulq16(alpha0[x], vert_sum_[id + 0]);
                res += mulq16(alpha1[x], vert_sum_[id + 1]);
                res += mulq16(alpha2[x], vert_sum_[id + 2]);
                pdst_row[x] = saturateU32toU8(res >> 8);
            }
        } else if (x_max_count == 4) {
            int x = 0;
            for (; x <= dwidth - 8; x += 8) {
                __m128i res = _mm_set1_epi16(1 << (8 - 1));

                int id0 = xsi[x];

                __m128i chunk0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0));
                __m128i chunk1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 8));
                __m128i chunk2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 16));
                __m128i chunk3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 24));

                __m128i sx0_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 4));
                __m128i sx0_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 4 + 8));
                __m128i sx0_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 4 + 16));
                __m128i sx0_id3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 4 + 24));

                __m128i sx1_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 4));
                __m128i sx1_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 4 + 8));
                __m128i sx1_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 4 + 16));
                __m128i sx1_id3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 4 + 24));

                __m128i sx2_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 4));
                __m128i sx2_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 4 + 8));
                __m128i sx2_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 4 + 16));
                __m128i sx2_id3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 4 + 24));

                __m128i sx3_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid3 + x * 4));
                __m128i sx3_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid3 + x * 4 + 8));
                __m128i sx3_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid3 + x * 4 + 16));
                __m128i sx3_id3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid3 + x * 4 + 24));

                __m128i vert_sum0 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx0_id0),
                                                              _mm_shuffle_epi8(chunk1, sx0_id1)),
                                                 _mm_or_si128(_mm_shuffle_epi8(chunk2, sx0_id2),
                                                              _mm_shuffle_epi8(chunk3, sx0_id3)));
                __m128i vert_sum1 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx1_id0),
                                                              _mm_shuffle_epi8(chunk1, sx1_id1)),
                                                 _mm_or_si128(_mm_shuffle_epi8(chunk2, sx1_id2),
                                                              _mm_shuffle_epi8(chunk3, sx1_id3)));
                __m128i vert_sum2 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx2_id0),
                                                              _mm_shuffle_epi8(chunk1, sx2_id1)),
                                                 _mm_or_si128(_mm_shuffle_epi8(chunk2, sx2_id2),
                                                              _mm_shuffle_epi8(chunk3, sx2_id3)));
                __m128i vert_sum3 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx3_id0),
                                                              _mm_shuffle_epi8(chunk1, sx3_id1)),
                                                 _mm_or_si128(_mm_shuffle_epi8(chunk2, sx3_id2),
                                                              _mm_shuffle_epi8(chunk3, sx3_id3)));

                res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha0 + x)), vert_sum0));
                res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha1 + x)), vert_sum1));
                res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha2 + x)), vert_sum2));
                res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha3 + x)), vert_sum3));

                res = _mm_srli_epi16(res, 8);
                res = _mm_packus_epi16(res, res);
                _mm_storel_epi64(reinterpret_cast<__m128i*>(pdst_row + x), res);
            }

            for (; x < dwidth; x++) {
                uint16_t res = 1 << (8 - 1);
                int id = xsi[x];
                res += mulq16(alpha0[x], vert_sum_[id + 0]);
                res += mulq16(alpha1[x], vert_sum_[id + 1]);
                res += mulq16(alpha2[x], vert_sum_[id + 2]);
                res += mulq16(alpha3[x], vert_sum_[id + 3]);
                pdst_row[x] = saturateU32toU8(res >> 8);
            }
        } else if (x_max_count <= 7) {
            int x = 0;
            for (; x <= dwidth - 8; x += 8) {
                __m128i res = _mm_set1_epi16(1 << (16 - 8 - 1));
                for (int i = 0; i < x_max_count; i++) {
                    __m128i valpha = _mm_setr_epi16(xalpha[x * x_max_count + x_max_count * 0 + i],
                                                    xalpha[x * x_max_count + x_max_count * 1 + i],
                                                    xalpha[x * x_max_count + x_max_count * 2 + i],
                                                    xalpha[x * x_max_count + x_max_count * 3 + i],
                                                    xalpha[x * x_max_count + x_max_count * 4 + i],
                                                    xalpha[x * x_max_count + x_max_count * 5 + i],
                                                    xalpha[x * x_max_count + x_max_count * 6 + i],
                                                    xalpha[x * x_max_count + x_max_count * 7 + i]);
                    __m128i vvert_sum = _mm_setr_epi16(vert_sum_[xsi[x + 0] + i],
                                                       vert_sum_[xsi[x + 1] + i],
                                                       vert_sum_[xsi[x + 2] + i],
                                                       vert_sum_[xsi[x + 3] + i],
                                                       vert_sum_[xsi[x + 4] + i],
                                                       vert_sum_[xsi[x + 5] + i],
                                                       vert_sum_[xsi[x + 6] + i],
                                                       vert_sum_[xsi[x + 7] + i]);

                    res = _mm_add_epi16(res, _mm_mulhi_epu16(valpha, vvert_sum));
                }
                res = _mm_srli_epi16(res, 8);
                res = _mm_packus_epi16(res, res);
                _mm_storel_epi64(reinterpret_cast<__m128i*>(pdst_row + x), res);
            }

            for (; x < dwidth; x++) {
                uint16_t res = 1 << (8 - 1);
                for (int i = 0; i < x_max_count; i++) {
                    uint16_t a = xalpha[x * x_max_count + i];
                    int sx = xsi[x] + i;

                    res += mulq16(a, vert_sum_[sx]);
                }
                pdst_row[x] = saturateU32toU8(res >> 8);
            }
        } else {
            for (int x = 0; x < dwidth; x++) {
                uint16_t res = 1 << (8 - 1);
                __m128i vres = _mm_setzero_si128();
                int id = xsi[x];

                int i = 0;
                for (; i <= x_max_count - 8; i += 8) {
                    __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(xalpha + x * x_max_count + i));
                    __m128i s = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id + i));

                    vres = _mm_add_epi16(vres, _mm_mulhi_epu16(a, s));
                }
                vres = _mm_add_epi16(vres, _mm_slli_si128(vres, 2));
                vres = _mm_add_epi16(vres, _mm_slli_si128(vres, 4));
                vres = _mm_add_epi16(vres, _mm_slli_si128(vres, 8));
                res += static_cast<uint16_t>(_mm_extract_epi16(vres, 7));

                for (; i < x_max_count; i++) {
                    uint16_t a = xalpha[x * x_max_count + i];
                    uint16_t s = vert_sum_[id + i];

                    res += mulq16(a, s);
                }

                pdst_row[x] = saturateU32toU8(res >> 8);
            }
        }
    };

    for (int c = 0; c < channels; c++) {
        for (int y = 0; y < dheight; y++) {
            full_pass(c, y);
        }
    }
}

}  // namespace Resize
}  // namespace InferenceEngine