src/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c

   1 /*
   2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
   3  *
   4  *  Use of this source code is governed by a BSD-style license
   5  *  that can be found in the LICENSE file in the root of the source
   6  *  tree. An additional intellectual property rights grant can be found
   7  *  in the file PATENTS.  All contributing project authors may
   8  *  be found in the AUTHORS file in the root of the source tree.
   9  */
  10
  11 #include <immintrin.h>
  12 #include "vpx_ports/mem.h"
  13
  14 // Byte-shuffle control masks for the 16-wide 8-tap filters.
     // In the code visible in this file only vp9_filter_block1d16_h8_avx2 reads
     // them (via _mm256_load_si256, hence the 32-byte alignment); the v8 function
     // below does not reference them.
     //
     // Each table holds the same 16 indices in both 128-bit halves, so the same
     // selection is applied to each lane by _mm256_shuffle_epi8.  The indices
     // pick overlapping pairs of adjacent source bytes (k, k+1) for eight
     // consecutive output pixels; the pairs are then multiplied by one pair of
     // filter taps with _mm256_maddubs_epi16.
     //
     // filt1: pairs starting at byte offsets 0..7 (combined with the first tap pair)
  15 DECLARE_ALIGNED(32, const unsigned char, filt1_global_avx2[32])= {
  16   0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
  17   0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8};
  18
     // filt2: pairs starting at byte offsets 2..9 (combined with the second tap pair)
  19 DECLARE_ALIGNED(32, const unsigned char, filt2_global_avx2[32])= {
  20   2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
  21   2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10};
  22
     // filt3: pairs starting at byte offsets 4..11 (combined with the third tap pair)
  23 DECLARE_ALIGNED(32, const unsigned char, filt3_global_avx2[32])= {
  24   4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
  25   4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12};
  26
     // filt4: pairs starting at byte offsets 6..13 (combined with the fourth tap pair)
  27 DECLARE_ALIGNED(32, const unsigned char, filt4_global_avx2[32])= {
  28   6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
  29   6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14};
  30
  31
  32 void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr,
  33                                   unsigned int src_pixels_per_line,
  34                                   unsigned char *output_ptr,
  35                                   unsigned int  output_pitch,
  36                                   unsigned int  output_height,
  37                                   int16_t *filter) {
       // Horizontal 8-tap filter producing 16 output bytes per row for
       // output_height rows.
       //
       //   src_ptr             - first source row; each row is read from
       //                         src_ptr - 3 and again from src_ptr + 5 (16 bytes
       //                         per load), so bytes on both sides of the 16
       //                         output positions must be readable.
       //   src_pixels_per_line - distance in bytes between source rows.
       //   output_ptr          - destination; written with _mm_store_si128, an
       //                         aligned store, so output_ptr and
       //                         output_ptr + output_pitch must be 16-byte aligned.
       //   output_pitch        - distance in bytes between destination rows.
       //   output_height       - number of rows to produce.
       //   filter              - eight 16-bit taps; they are narrowed to signed
       //                         8 bit with saturation before use.
       //
       // Each result is (sum of 8 products + 64) >> 7, clamped to 0..255 by the
       // final unsigned pack.  Rows are processed two at a time in a 256-bit
       // register (one row per 128-bit lane); an odd last row is handled with
       // 128-bit operations after the loop.
  38   __m128i filtersReg;
  39   __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
  40   __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  41   __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
  42   __m256i srcReg32b1, srcReg32b2, filtersReg32;
  43   unsigned int i;
  44   unsigned int src_stride, dst_stride;
  45
  46   // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
       // (i.e. 64 in every 16-bit element: the rounding term for the >> 7 below)
  47   addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
       // unaligned load of the eight 16-bit taps
  48   filtersReg = _mm_loadu_si128((__m128i *)filter);
  49   // convert the 16 bit (short) taps to 8 bit (byte) with signed saturation;
  50   // the same 8 bytes end up in both halves of the 128 bit register.
  51   filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
  52   // have the same data in both lanes of a 256 bit register
       // NOTE(review): the GCC version tests below select between three
       // spellings of the broadcast intrinsic - presumably a workaround for
       // differing names/signatures across GCC releases; confirm before editing.
  53 #if defined (__GNUC__)
  54 #if ( __GNUC__ < 4 || (__GNUC__ == 4 && \
  55 (__GNUC_MINOR__ < 6 || (__GNUC_MINOR__ == 6 && __GNUC_PATCHLEVEL__ > 0))))
  56   filtersReg32 = _mm_broadcastsi128_si256((__m128i const *)&filtersReg);
  57 #elif(__GNUC__ == 4 && (__GNUC_MINOR__ == 7 && __GNUC_PATCHLEVEL__ > 0))
  58   filtersReg32 = _mm_broadcastsi128_si256(filtersReg);
  59 #else
  60   filtersReg32 = _mm256_broadcastsi128_si256(filtersReg);
  61 #endif
  62 #else
  63   filtersReg32 = _mm256_broadcastsi128_si256(filtersReg);
  64 #endif
  65
  66   // duplicate only the first 16 bits (first and second byte, taps 0 and 1)
  67   // across 256 bit register
  68   firstFilters = _mm256_shuffle_epi8(filtersReg32,
  69                  _mm256_set1_epi16(0x100u));
  70   // duplicate only the second 16 bits (third and fourth byte, taps 2 and 3)
  71   // across 256 bit register
  72   secondFilters = _mm256_shuffle_epi8(filtersReg32,
  73                   _mm256_set1_epi16(0x302u));
  74   // duplicate only the third 16 bits (fifth and sixth byte, taps 4 and 5)
  75   // across 256 bit register
  76   thirdFilters = _mm256_shuffle_epi8(filtersReg32,
  77                  _mm256_set1_epi16(0x504u));
  78   // duplicate only the fourth 16 bits (seventh and eighth byte, taps 6 and 7)
  79   // across 256 bit register
  80   forthFilters = _mm256_shuffle_epi8(filtersReg32,
  81                  _mm256_set1_epi16(0x706u));
  82
       // shuffle masks that gather adjacent source byte pairs for each tap pair
  83   filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2);
  84   filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2);
  85   filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2);
  86   filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2);
  87
  88   // double the source and destination strides: two rows per iteration
  89   src_stride = src_pixels_per_line << 1;
  90   dst_stride = output_pitch << 1;
  91   for (i = output_height; i > 1; i-=2) {
  92     // load two source rows: current row in the low lane, next row in the
         // high lane, both starting 3 bytes before the first output position
  93     srcReg32b1 = _mm256_castsi128_si256(
  94                  _mm_loadu_si128((__m128i *)(src_ptr-3)));
  95     srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
  96                  _mm_loadu_si128((__m128i *)
  97                  (src_ptr+src_pixels_per_line-3)), 1);
  98
  99     // gather the byte pairs for taps 0/1 and taps 2/3 (output pixels 0..7)
  100     srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
  101     srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
  102
  103     // multiply 2 adjacent elements with the filter and add the result
  104     srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
  105     srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters);
  106
  107     // add and saturate the results together
  108     srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
  109
  110     // gather the byte pairs for taps 6/7 and taps 4/5
  111     srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
  112     srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
  113
  114     // multiply 2 adjacent elements with the filter and add the result
  115     srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters);
  116     srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
  117
  118     // add and saturate: the smaller of the two remaining partial sums is
         // added first, the larger one further below (presumably to reduce the
         // chance of intermediate saturation - the order matters, keep it)
  119     srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,
  120                        _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
  121
  122     // reading 2 strides of the next 16 bytes, 8 bytes further on, for
  123     // output pixels 8..15 (part of it was being read by earlier read)
  124     srcReg32b2 = _mm256_castsi128_si256(
  125                  _mm_loadu_si128((__m128i *)(src_ptr+5)));
  126     srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
  127                  _mm_loadu_si128((__m128i *)
  128                  (src_ptr+src_pixels_per_line+5)), 1);
  129
  130     // add and saturate: now the larger of the two partial sums
  131     srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,
  132                        _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));
  133
  134     // same sequence for output pixels 8..15: taps 0/1 and taps 2/3
  135     srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
  136     srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
  137
  138     // multiply 2 adjacent elements with the filter and add the result
  139     srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
  140     srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters);
  141
  142     // add and saturate the results together
  143     srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);
  144
  145     // gather the byte pairs for taps 6/7 and taps 4/5
  146     srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt4Reg);
  147     srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
  148
  149     // multiply 2 adjacent elements with the filter and add the result
  150     srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters);
  151     srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
  152
  153     // add and saturate the results together (smaller first, then larger)
  154     srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1,
  155                        _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
  156     srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1,
  157                        _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));
  158
  159
         // add the rounding term (64) with saturation
  160     srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64);
  161
  162     srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64);
  163
  164     // arithmetic shift right by 7 bits of each 16 bit element
  165     srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7);
  166     srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7);
  167
  168     // shrink each 16 bit element to 8 bits with unsigned saturation; the
  169     // pack works per 128-bit lane, so the low lane holds the 16 pixels of
  170     // the first row and the high lane the 16 pixels of the second row
  171     srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1,
  172                                            srcRegFilt32b2_1);
  173
  174     src_ptr+=src_stride;
  175
  176     // save 16 bytes (first row; aligned store)
  177     _mm_store_si128((__m128i*)output_ptr,
  178     _mm256_castsi256_si128(srcRegFilt32b1_1));
  179
  180     // save the next 16 bytes (second row; aligned store)
  181     _mm_store_si128((__m128i*)(output_ptr+output_pitch),
  182     _mm256_extractf128_si256(srcRegFilt32b1_1, 1));
  183     output_ptr+=dst_stride;
  184   }
  185
  186   // if the number of rows is odd, one row is left:
  187   // process only 16 bytes using the low 128 bits of the constants
  188   if (i > 0) {
  189     __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
  190     __m128i srcRegFilt2, srcRegFilt3;
  191
  192     srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3));
  193
  194     // gather the byte pairs for taps 0/1 and taps 2/3 (output pixels 0..7)
  195     srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1,
  196                     _mm256_castsi256_si128(filt1Reg));
  197     srcRegFilt2 = _mm_shuffle_epi8(srcReg1,
  198                   _mm256_castsi256_si128(filt2Reg));
  199
  200     // multiply 2 adjacent elements with the filter and add the result
  201     srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1,
  202                     _mm256_castsi256_si128(firstFilters));
  203     srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
  204                   _mm256_castsi256_si128(secondFilters));
  205
  206     // add and saturate the results together
  207     srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
  208
  209     // gather the byte pairs for taps 6/7 and taps 4/5
  210     srcRegFilt3= _mm_shuffle_epi8(srcReg1,
  211                  _mm256_castsi256_si128(filt4Reg));
  212     srcRegFilt2= _mm_shuffle_epi8(srcReg1,
  213                  _mm256_castsi256_si128(filt3Reg));
  214
  215     // multiply 2 adjacent elements with the filter and add the result
  216     srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
  217                   _mm256_castsi256_si128(forthFilters));
  218     srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
  219                   _mm256_castsi256_si128(thirdFilters));
  220
  221     // add and saturate: smaller partial sum first
  222     srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
  223                     _mm_min_epi16(srcRegFilt3, srcRegFilt2));
  224
  225     // reading the next 16 bytes, 8 bytes further on, for output pixels
  226     // 8..15 (part of it was being read by earlier read)
  227     srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5));
  228
  229     // add and saturate: then the larger partial sum
  230     srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
  231                     _mm_max_epi16(srcRegFilt3, srcRegFilt2));
  232
  233     // same sequence for output pixels 8..15: taps 0/1 and taps 2/3
  234     srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2,
  235                     _mm256_castsi256_si128(filt1Reg));
  236     srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
  237                   _mm256_castsi256_si128(filt2Reg));
  238
  239     // multiply 2 adjacent elements with the filter and add the result
  240     srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1,
  241                     _mm256_castsi256_si128(firstFilters));
  242     srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
  243                   _mm256_castsi256_si128(secondFilters));
  244
  245     // add and saturate the results together
  246     srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
  247
  248     // gather the byte pairs for taps 6/7 and taps 4/5
  249     srcRegFilt3 = _mm_shuffle_epi8(srcReg2,
  250                   _mm256_castsi256_si128(filt4Reg));
  251     srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
  252                   _mm256_castsi256_si128(filt3Reg));
  253
  254     // multiply 2 adjacent elements with the filter and add the result
  255     srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
  256                   _mm256_castsi256_si128(forthFilters));
  257     srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
  258                   _mm256_castsi256_si128(thirdFilters));
  259
  260     // add and saturate the results together (smaller first, then larger)
  261     srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
  262                     _mm_min_epi16(srcRegFilt3, srcRegFilt2));
  263     srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
  264                     _mm_max_epi16(srcRegFilt3, srcRegFilt2));
  265
  266
         // add the rounding term (64) with saturation
  267     srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
  268                     _mm256_castsi256_si128(addFilterReg64));
  269
  270     srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
  271                     _mm256_castsi256_si128(addFilterReg64));
  272
  273     // arithmetic shift right by 7 bits of each 16 bit element
  274     srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
  275     srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);
  276
  277     // shrink each 16 bit element to 8 bits with unsigned saturation; the
  278     // low 8 bytes hold output pixels 0..7 and the high 8 bytes hold
  279     // output pixels 8..15 of this single row
  280     srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
  281
  282     // save 16 bytes (aligned store)
  283     _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1);
  284   }
  285 }
 286
  287 void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr,
  288                                   unsigned int src_pitch,
  289                                   unsigned char *output_ptr,
  290                                   unsigned int out_pitch,
  291                                   unsigned int output_height,
  292                                   int16_t *filter) {
        // Vertical 8-tap filter producing 16 output bytes per row for
        // output_height rows.
        //
        //   src_ptr       - first of the source rows that feed output row 0;
        //                   output row r is computed from the rows at
        //                   src_ptr + (r .. r+7) * src_pitch (no negative row
        //                   offset is applied inside this function).  The
        //                   two-row loop also loads the row at offset r+8.
        //   src_pitch     - distance in bytes between source rows.
        //   output_ptr    - destination; written with _mm_store_si128, an aligned
        //                   store, so output_ptr and output_ptr + out_pitch must
        //                   be 16-byte aligned.
        //   out_pitch     - distance in bytes between destination rows.
        //   output_height - number of rows to produce.
        //   filter        - eight 16-bit taps; they are narrowed to signed 8 bit
        //                   with saturation before use.
        //
        // Each result is (sum of 8 products + 64) >> 7, clamped to 0..255 by the
        // final unsigned pack.  Two output rows are produced per iteration: the
        // low 128-bit lane of every register works on output row r and the high
        // lane on output row r+1.  Interleaved row pairs are carried from one
        // iteration to the next, so only two new source rows are loaded each
        // time; the statement order of that hand-over must be preserved.
  293   __m128i filtersReg;
  294   __m256i addFilterReg64;
  295   __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
  296   __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
  297   __m256i srcReg32b11, srcReg32b12, srcReg32b13, filtersReg32;
  298   __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  299   unsigned int i;
  300   unsigned int src_stride, dst_stride;
  301
  302   // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
        // (i.e. 64 in every 16-bit element: the rounding term for the >> 7 below)
  303   addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
        // unaligned load of the eight 16-bit taps
  304   filtersReg = _mm_loadu_si128((__m128i *)filter);
  305   // convert the 16 bit (short) taps to 8 bit (byte) with signed saturation;
  306   // the same 8 bytes end up in both halves of the 128 bit register.
  307   filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
  308   // have the same data in both lanes of a 256 bit register
        // NOTE(review): the GCC version tests below select between three
        // spellings of the broadcast intrinsic - presumably a workaround for
        // differing names/signatures across GCC releases; confirm before editing.
  309 #if defined (__GNUC__)
  310 #if ( __GNUC__ < 4 || (__GNUC__ == 4 && \
  311 (__GNUC_MINOR__ < 6 || (__GNUC_MINOR__ == 6 && __GNUC_PATCHLEVEL__ > 0))))
  312   filtersReg32 = _mm_broadcastsi128_si256((__m128i const *)&filtersReg);
  313 #elif(__GNUC__ == 4 && (__GNUC_MINOR__ == 7 && __GNUC_PATCHLEVEL__ > 0))
  314   filtersReg32 = _mm_broadcastsi128_si256(filtersReg);
  315 #else
  316   filtersReg32 = _mm256_broadcastsi128_si256(filtersReg);
  317 #endif
  318 #else
  319   filtersReg32 = _mm256_broadcastsi128_si256(filtersReg);
  320 #endif
  321
  322   // duplicate only the first 16 bits (first and second byte, taps 0 and 1)
  323   // across 256 bit register
  324   firstFilters = _mm256_shuffle_epi8(filtersReg32,
  325                  _mm256_set1_epi16(0x100u));
  326   // duplicate only the second 16 bits (third and fourth byte, taps 2 and 3)
  327   // across 256 bit register
  328   secondFilters = _mm256_shuffle_epi8(filtersReg32,
  329                   _mm256_set1_epi16(0x302u));
  330   // duplicate only the third 16 bits (fifth and sixth byte, taps 4 and 5)
  331   // across 256 bit register
  332   thirdFilters = _mm256_shuffle_epi8(filtersReg32,
  333                  _mm256_set1_epi16(0x504u));
  334   // duplicate only the fourth 16 bits (seventh and eighth byte, taps 6 and 7)
  335   // across 256 bit register
  336   forthFilters = _mm256_shuffle_epi8(filtersReg32,
  337                  _mm256_set1_epi16(0x706u));
  338
  339   // double the source and destination strides: two rows per iteration
  340   src_stride = src_pitch << 1;
  341   dst_stride = out_pitch << 1;
  342
  343   // load 16 bytes 7 times in stride of src_pitch (source rows 0..6)
  344   srcReg32b1 = _mm256_castsi128_si256(
  345                _mm_loadu_si128((__m128i *)(src_ptr)));
  346   srcReg32b2 = _mm256_castsi128_si256(
  347                _mm_loadu_si128((__m128i *)(src_ptr+src_pitch)));
  348   srcReg32b3 = _mm256_castsi128_si256(
  349                _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2)));
  350   srcReg32b4 = _mm256_castsi128_si256(
  351                _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3)));
  352   srcReg32b5 = _mm256_castsi128_si256(
  353                _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4)));
  354   srcReg32b6 = _mm256_castsi128_si256(
  355                _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5)));
  356   srcReg32b7 = _mm256_castsi128_si256(
  357                _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6)));
  358
  359   // have each consecutive loads on the same 256 register:
        // register k now holds row k-1 in its low lane and row k in its high lane
  360   srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
  361                _mm256_castsi256_si128(srcReg32b2), 1);
  362   srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
  363                _mm256_castsi256_si128(srcReg32b3), 1);
  364   srcReg32b3 = _mm256_inserti128_si256(srcReg32b3,
  365                _mm256_castsi256_si128(srcReg32b4), 1);
  366   srcReg32b4 = _mm256_inserti128_si256(srcReg32b4,
  367                _mm256_castsi256_si128(srcReg32b5), 1);
  368   srcReg32b5 = _mm256_inserti128_si256(srcReg32b5,
  369                _mm256_castsi256_si128(srcReg32b6), 1);
  370   srcReg32b6 = _mm256_inserti128_si256(srcReg32b6,
  371                _mm256_castsi256_si128(srcReg32b7), 1);
  372
  373   // merge every two consecutive registers except the last one: bytes of
        // vertically adjacent rows are interleaved so one maddubs applies a tap
        // pair; unpacklo covers columns 0..7, unpackhi columns 8..15
        // (srcReg32b10 / srcReg32b1: tap pair 0/1)
  374   srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
  375   srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);
  376
  377   // save (columns 0..7 for tap pair 2/3)
  378   srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
  379
  380   // save (columns 8..15 for tap pair 2/3)
  381   srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);
  382
  383   // save (columns 0..7 for tap pair 4/5)
  384   srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
  385
  386   // save (columns 8..15 for tap pair 4/5)
  387   srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);
  388
  389
  390   for (i = output_height; i > 1; i-=2) {
  391      // load the last 2 loads of 16 bytes (rows 7 and 8) and have every two
  392      // consecutive loads in the same 256 bit register
  393      srcReg32b8 = _mm256_castsi128_si256(
  394      _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7)));
  395      srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
  396      _mm256_castsi256_si128(srcReg32b8), 1);
  397      srcReg32b9 = _mm256_castsi128_si256(
  398      _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*8)));
  399      srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
  400      _mm256_castsi256_si128(srcReg32b9), 1);
  401
  402      // merge every two consecutive registers
  403      // save (srcReg32b4 / srcReg32b7: columns 0..7 / 8..15 for tap pair 6/7)
  404      srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
  405      srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);
  406
  407      // multiply 2 adjacent elements with the filter and add the result
           // (srcReg32b10 and srcReg32b1 are overwritten with products here and
           // are reloaded from the saved pairs at the end of the iteration)
  408      srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
  409      srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
  410      srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
  411      srcReg32b8 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);
  412
  413      // add and saturate the results together (tap pairs 0/1 and 6/7)
  414      srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
  415      srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b8);
  416
  417
  418      // multiply 2 adjacent elements with the filter and add the result
  419      srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
  420      srcReg32b6 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
  421
  422      // multiply 2 adjacent elements with the filter and add the result
  423      srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
  424      srcReg32b13 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);
  425
  426
  427      // add and saturate: the smaller of the 2/3 and 4/5 partial sums first
           // (presumably to reduce the chance of intermediate saturation - the
           // order matters, keep it)
  428      srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
  429                    _mm256_min_epi16(srcReg32b8, srcReg32b12));
  430      srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
  431                   _mm256_min_epi16(srcReg32b6, srcReg32b13));
  432
  433      // add and saturate: then the larger of the two partial sums
  434      srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
  435                    _mm256_max_epi16(srcReg32b8, srcReg32b12));
  436      srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
  437                   _mm256_max_epi16(srcReg32b6, srcReg32b13));
  438
  439
           // add the rounding term (64) with saturation
  440      srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64);
  441      srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64);
  442
  443      // arithmetic shift right by 7 bits of each 16 bit element
  444      srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7);
  445      srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7);
  446
  447      // shrink each 16 bit element to 8 bits with unsigned saturation; the
  448      // pack works per 128-bit lane, so the low lane holds the 16 pixels of
  449      // the first output row and the high lane those of the second
  450      srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);
  451
  452      src_ptr+=src_stride;
  453
  454      // save 16 bytes (first output row; aligned store)
  455      _mm_store_si128((__m128i*)output_ptr,
  456      _mm256_castsi256_si128(srcReg32b1));
  457
  458      // save the next 16 bytes (second output row; aligned store)
  459      _mm_store_si128((__m128i*)(output_ptr+out_pitch),
  460      _mm256_extractf128_si256(srcReg32b1, 1));
  461
  462      output_ptr+=dst_stride;
  463
  464      // save part of the registers for next strides: after moving down two
           // rows each interleaved pair serves the previous tap pair, and the
           // row loaded at offset 8 becomes the new row 6 (low lane of srcReg32b7)
  465      srcReg32b10 = srcReg32b11;
  466      srcReg32b1 = srcReg32b3;
  467      srcReg32b11 = srcReg32b2;
  468      srcReg32b3 = srcReg32b5;
  469      srcReg32b2 = srcReg32b4;
  470      srcReg32b5 = srcReg32b7;
  471      srcReg32b7 = srcReg32b9;
  472   }
        // if the number of rows is odd, one output row is left: compute it with
        // 128-bit operations on the low lanes of the carried registers
  473   if (i > 0) {
  474     __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
  475     __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
  476     // load the last 16 bytes (row 7 relative to the current src_ptr)
  477     srcRegFilt8 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7));
  478
  479     // interleave row 6 (low lane of srcReg32b7) with row 7
  480     srcRegFilt4 = _mm_unpacklo_epi8(
  481                   _mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
  482     srcRegFilt7 = _mm_unpackhi_epi8(
  483                   _mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
  484
  485     // multiply 2 adjacent elements with the filter and add the result
  486     srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
  487                   _mm256_castsi256_si128(firstFilters));
  488     srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4,
  489                   _mm256_castsi256_si128(forthFilters));
  490     srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1),
  491                   _mm256_castsi256_si128(firstFilters));
  492     srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7,
  493                   _mm256_castsi256_si128(forthFilters));
  494
  495     // add and saturate the results together (tap pairs 0/1 and 6/7)
  496     srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
  497     srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7);
  498
  499
  500     // multiply 2 adjacent elements with the filter and add the result
  501     srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
  502                   _mm256_castsi256_si128(secondFilters));
  503     srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3),
  504                   _mm256_castsi256_si128(secondFilters));
  505
  506     // multiply 2 adjacent elements with the filter and add the result
  507     srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
  508                   _mm256_castsi256_si128(thirdFilters));
  509     srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5),
  510                   _mm256_castsi256_si128(thirdFilters));
  511
  512     // add and saturate: smaller of the 2/3 and 4/5 partial sums first
  513     srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
  514                   _mm_min_epi16(srcRegFilt4, srcRegFilt6));
  515     srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
  516                   _mm_min_epi16(srcRegFilt5, srcRegFilt7));
  517
  518     // add and saturate: then the larger of the two partial sums
  519     srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
  520                   _mm_max_epi16(srcRegFilt4, srcRegFilt6));
  521     srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
  522                   _mm_max_epi16(srcRegFilt5, srcRegFilt7));
  523
  524
          // add the rounding term (64) with saturation
  525     srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
  526                   _mm256_castsi256_si128(addFilterReg64));
  527     srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
  528                   _mm256_castsi256_si128(addFilterReg64));
  529
  530     // arithmetic shift right by 7 bits of each 16 bit element
  531     srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
  532     srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7);
  533
  534     // shrink each 16 bit element to 8 bits with unsigned saturation; the
  535     // low 8 bytes hold output columns 0..7 and the high 8 bytes hold
  536     // output columns 8..15 of this single row
  537     srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);
  538
  539     // save 16 bytes (aligned store)
  540     _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
  541   }
  542 }