/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))

// Offsets for source bytes 0 to 9
static uvec8 kShuf0 =
  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static uvec8 kShuf1 =
  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf2 =
  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 0 to 10
static uvec8 kShuf01 =
  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };

// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
static uvec8 kShuf11 =
  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf21 =
  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };

// Coefficients for source bytes 0 to 10
static uvec8 kMadd01 =
  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };

// Coefficients for source bytes 10 to 21
static uvec8 kMadd11 =
  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };

// Coefficients for source bytes 21 to 31
static uvec8 kMadd21 =
  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };

// Rounding constant for the 3/4 box filters (added before the >> 2).
static vec16 kRound34 =
  { 2, 2, 2, 2, 2, 2, 2, 2 };

static uvec8 kShuf38a =
  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

static uvec8 kShuf38b =
  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 0,1,2
static uvec8 kShufAc =
  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 3,4,5
static uvec8 kShufAc3 =
  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };

// Scaling values for boxes of 3x3 and 2x3
static uvec16 kScaleAc33 =
  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };

// Arrange first value for pixels 0,1,2,3,4,5
static uvec8 kShufAb0 =
  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };

// Arrange second value for pixels 0,1,2,3,4,5
static uvec8 kShufAb1 =
  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };

// Arrange third value for pixels 0,1,2,3,4,5
static uvec8 kShufAb2 =
  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };

// Scaling values for boxes of 3x2 and 2x2
static uvec16 kScaleAb2 =
  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };

// GCC versions of row functions are verbatim conversions from Visual C.
// Generated using gcc disassembly on Visual C object file:
// objdump -D yuvscaler.obj >yuvscaler.txt

void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
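
// Illustrative C sketch added in this edit (ScaleRowDown2_C_Sketch is not an
// upstream function): the loop above keeps the odd byte of each source pair.
// psrlw $0x8 moves it into the low byte of each word and packuswb repacks,
// 16 output pixels per iteration; one pixel at a time this is:
static void ScaleRowDown2_C_Sketch(const uint8* src_ptr, uint8* dst_ptr,
                                   int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[x * 2 + 1];  // point-sample the odd source pixel.
  }
}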

void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                              uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"

    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "movdqa    %%xmm1,%%xmm3                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "pand      %%xmm5,%%xmm2                   \n"
    "pand      %%xmm5,%%xmm3                   \n"
    "pavgw     %%xmm2,%%xmm0                   \n"
    "pavgw     %%xmm3,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}

void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"

    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqa,0x00,0,3,1,xmm2)           //  movdqa  (%0,%3,1),%%xmm2
    BUNDLEALIGN
    MEMOPREG(movdqa,0x10,0,3,1,xmm3)           //  movdqa  0x10(%0,%3,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "movdqa    %%xmm1,%%xmm3                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "pand      %%xmm5,%%xmm2                   \n"
    "pand      %%xmm5,%%xmm3                   \n"
    "pavgw     %%xmm2,%%xmm0                   \n"
    "pavgw     %%xmm3,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))   // %3
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
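
// Illustrative C sketch added in this edit (not an upstream function): a 2x2
// box filter. The pavgb/pavgw cascade above computes a rounded average of
// averages, which can differ from this exact rounded sum in the last bit:
static void ScaleRowDown2Box_C_Sketch(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8* dst_ptr, int dst_width) {
  const uint8* s = src_ptr;               // top row of each 2x2 box.
  const uint8* t = src_ptr + src_stride;  // bottom row.
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);
    s += 2;
    t += 2;
  }
}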

void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                                  uint8* dst_ptr, int dst_width) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}

void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
                                        ptrdiff_t src_stride,
                                        uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "movdqa    %%xmm1,%%xmm3                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "pand      %%xmm5,%%xmm2                   \n"
    "pand      %%xmm5,%%xmm3                   \n"
    "pavgw     %%xmm2,%%xmm0                   \n"
    "pavgw     %%xmm3,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}

void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
                                     ptrdiff_t src_stride,
                                     uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu  (%0,%3,1),%%xmm2
    BUNDLEALIGN
    MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu  0x10(%0,%3,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "movdqa    %%xmm1,%%xmm3                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "pand      %%xmm5,%%xmm2                   \n"
    "pand      %%xmm5,%%xmm3                   \n"
    "pavgw     %%xmm2,%%xmm0                   \n"
    "pavgw     %%xmm3,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))   // %3
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}

void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrld     $0x18,%%xmm5                    \n"
    "pslld     $0x10,%%xmm5                    \n"

    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
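
// Illustrative C sketch added in this edit (not an upstream function): the
// mask built in xmm5 is 0x00FF0000 per dword, so the asm keeps byte 2 of
// every group of 4 source bytes:
static void ScaleRowDown4_C_Sketch(const uint8* src_ptr, uint8* dst_ptr,
                                   int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[x * 4 + 2];  // point-sample the third pixel of 4.
  }
}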

void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  intptr_t stridex3 = 0;
  asm volatile (
    "pcmpeqb   %%xmm7,%%xmm7                   \n"
    "psrlw     $0x8,%%xmm7                     \n"
    "lea       " MEMLEA4(0x00,4,4,2) ",%3      \n"

    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqa,0x00,0,4,1,xmm2)           //  movdqa  (%0,%4,1),%%xmm2
    BUNDLEALIGN
    MEMOPREG(movdqa,0x10,0,4,1,xmm3)           //  movdqa  0x10(%0,%4,1),%%xmm3
    "pavgb     %%xmm2,%%xmm0                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    MEMOPREG(movdqa,0x00,0,4,2,xmm2)           //  movdqa  (%0,%4,2),%%xmm2
    BUNDLEALIGN
    MEMOPREG(movdqa,0x10,0,4,2,xmm3)           //  movdqa  0x10(%0,%4,2),%%xmm3
    MEMOPREG(movdqa,0x00,0,3,1,xmm4)           //  movdqa  (%0,%3,1),%%xmm4
    MEMOPREG(movdqa,0x10,0,3,1,xmm5)           //  movdqa  0x10(%0,%3,1),%%xmm5
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm4,%%xmm2                   \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "pavgb     %%xmm5,%%xmm3                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "movdqa    %%xmm1,%%xmm3                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "pand      %%xmm7,%%xmm2                   \n"
    "pand      %%xmm7,%%xmm3                   \n"
    "pavgw     %%xmm2,%%xmm0                   \n"
    "pavgw     %%xmm3,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "pand      %%xmm7,%%xmm2                   \n"
    "pavgw     %%xmm2,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(dst_width),   // %2
    "+r"(stridex3)     // %3
  : "r"((intptr_t)(src_stride))    // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7"
#endif
  );
}
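
// Illustrative C sketch added in this edit (not an upstream function): a
// full-precision 4x4 box average. The SSE2 code above approximates it with
// cascaded pavg rounding, which may differ from the exact sum in the last
// bit:
static void ScaleRowDown4Box_C_Sketch(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8* dst_ptr, int dst_width) {
  int x, i, j;
  for (x = 0; x < dst_width; ++x) {
    int sum = 0;
    for (i = 0; i < 4; ++i) {    // 4 rows...
      for (j = 0; j < 4; ++j) {  // ...of 4 pixels each.
        sum += src_ptr[x * 4 + i * src_stride + j];
      }
    }
    dst_ptr[x] = (uint8)((sum + 8) >> 4);  // round to nearest.
  }
}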

void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa    %0,%%xmm3                       \n"
    "movdqa    %1,%%xmm4                       \n"
    "movdqa    %2,%%xmm5                       \n"
  :
  : "m"(kShuf0),  // %0
    "m"(kShuf1),  // %1
    "m"(kShuf2)   // %2
  );
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm2   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "movdqa    %%xmm2,%%xmm1                   \n"
    "palignr   $0x8,%%xmm0,%%xmm1              \n"
    "pshufb    %%xmm3,%%xmm0                   \n"
    "pshufb    %%xmm4,%%xmm1                   \n"
    "pshufb    %%xmm5,%%xmm2                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "movq      %%xmm1," MEMACCESS2(0x8,1) "    \n"
    "movq      %%xmm2," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x18,1) ",%1           \n"
    "sub       $0x18,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa    %0,%%xmm2                       \n"  // kShuf01
    "movdqa    %1,%%xmm3                       \n"  // kShuf11
    "movdqa    %2,%%xmm4                       \n"  // kShuf21
  :
  : "m"(kShuf01),  // %0
    "m"(kShuf11),  // %1
    "m"(kShuf21)   // %2
  );
  asm volatile (
    "movdqa    %0,%%xmm5                       \n"  // kMadd01
    "movdqa    %1,%%xmm0                       \n"  // kMadd11
    "movdqa    %2,%%xmm1                       \n"  // kRound34
  :
  : "m"(kMadd01),  // %0
    "m"(kMadd11),  // %1
    "m"(kRound34)  // %2
  );
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm6         \n"
    MEMOPREG(movdqa,0x00,0,3,1,xmm7)           //  movdqa  (%0,%3),%%xmm7
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm5,%%xmm6                   \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS(1) "         \n"
    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm6    \n"
    MEMOPREG(movdqu,0x8,0,3,1,xmm7)            //  movdqu  0x8(%0,%3),%%xmm7
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm3,%%xmm6                   \n"
    "pmaddubsw %%xmm0,%%xmm6                   \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
    BUNDLEALIGN
    MEMOPREG(movdqa,0x10,0,3,1,xmm7)           //  movdqa  0x10(%0,%3),%%xmm7
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm4,%%xmm6                   \n"
    "pmaddubsw %4,%%xmm6                       \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x18,1) ",%1           \n"
    "sub       $0x18,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride)),  // %3
    "m"(kMadd21)     // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}

void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa    %0,%%xmm2                       \n"  // kShuf01
    "movdqa    %1,%%xmm3                       \n"  // kShuf11
    "movdqa    %2,%%xmm4                       \n"  // kShuf21
  :
  : "m"(kShuf01),  // %0
    "m"(kShuf11),  // %1
    "m"(kShuf21)   // %2
  );
  asm volatile (
    "movdqa    %0,%%xmm5                       \n"  // kMadd01
    "movdqa    %1,%%xmm0                       \n"  // kMadd11
    "movdqa    %2,%%xmm1                       \n"  // kRound34
  :
  : "m"(kMadd01),  // %0
    "m"(kMadd11),  // %1
    "m"(kRound34)  // %2
  );

  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm6         \n"
    MEMOPREG(movdqa,0x00,0,3,1,xmm7)           //  movdqa  (%0,%3,1),%%xmm7
    "pavgb     %%xmm6,%%xmm7                   \n"
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm5,%%xmm6                   \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS(1) "         \n"
    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm6    \n"
    MEMOPREG(movdqu,0x8,0,3,1,xmm7)            //  movdqu  0x8(%0,%3,1),%%xmm7
    "pavgb     %%xmm6,%%xmm7                   \n"
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm3,%%xmm6                   \n"
    "pmaddubsw %%xmm0,%%xmm6                   \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
    MEMOPREG(movdqa,0x10,0,3,1,xmm7)           //  movdqa  0x10(%0,%3,1),%%xmm7
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm6,%%xmm7                   \n"
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm4,%%xmm6                   \n"
    "pmaddubsw %4,%%xmm6                       \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x18,1) ",%1           \n"
    "sub       $0x18,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride)),  // %3
    "m"(kMadd21)     // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}

void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa    %3,%%xmm4                       \n"
    "movdqa    %4,%%xmm5                       \n"

    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pshufb    %%xmm4,%%xmm0                   \n"
    "pshufb    %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "movhlps   %%xmm0,%%xmm1                   \n"
    "movd      %%xmm1," MEMACCESS2(0x8,1) "    \n"
    "lea       " MEMLEA(0xc,1) ",%1            \n"
    "sub       $0xc,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "m"(kShuf38a),   // %3
    "m"(kShuf38b)    // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm4", "xmm5"
#endif
  );
}
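
// Illustrative C sketch added in this edit (not an upstream function): the
// kShuf38a/kShuf38b shuffles plus paddusb implement 3/8 point sampling,
// keeping source bytes 0, 3 and 6 out of every 8:
static void ScaleRowDown38_C_Sketch(const uint8* src_ptr, uint8* dst_ptr,
                                    int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 3) {
    dst_ptr[0] = src_ptr[0];
    dst_ptr[1] = src_ptr[3];
    dst_ptr[2] = src_ptr[6];
    dst_ptr += 3;
    src_ptr += 8;
  }
}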

void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa    %0,%%xmm2                       \n"
    "movdqa    %1,%%xmm3                       \n"
    "movdqa    %2,%%xmm4                       \n"
    "movdqa    %3,%%xmm5                       \n"
  :
  : "m"(kShufAb0),   // %0
    "m"(kShufAb1),   // %1
    "m"(kShufAb2),   // %2
    "m"(kScaleAb2)   // %3
  );
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(pavgb,0x00,0,3,1,xmm0)            //  pavgb   (%0,%3,1),%%xmm0
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pshufb    %%xmm2,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm6                   \n"
    "pshufb    %%xmm3,%%xmm6                   \n"
    "paddusw   %%xmm6,%%xmm1                   \n"
    "pshufb    %%xmm4,%%xmm0                   \n"
    "paddusw   %%xmm0,%%xmm1                   \n"
    "pmulhuw   %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "sub       $0x6,%2                         \n"
    "movd      %%xmm1," MEMACCESS(1) "         \n"
    "psrlq     $0x10,%%xmm1                    \n"
    "movd      %%xmm1," MEMACCESS2(0x2,1) "    \n"
    "lea       " MEMLEA(0x6,1) ",%1            \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(dst_width)    // %2
  : "r"((intptr_t)(src_stride))  // %3
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}

void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa    %0,%%xmm2                       \n"
    "movdqa    %1,%%xmm3                       \n"
    "movdqa    %2,%%xmm4                       \n"
    "pxor      %%xmm5,%%xmm5                   \n"
  :
  : "m"(kShufAc),    // %0
    "m"(kShufAc3),   // %1
    "m"(kScaleAc33)  // %2
  );
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movdqa,0x00,0,3,1,xmm6)           //  movdqa  (%0,%3,1),%%xmm6
    "movhlps   %%xmm0,%%xmm1                   \n"
    "movhlps   %%xmm6,%%xmm7                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm6                   \n"
    "punpcklbw %%xmm5,%%xmm7                   \n"
    "paddusw   %%xmm6,%%xmm0                   \n"
    "paddusw   %%xmm7,%%xmm1                   \n"
    MEMOPREG(movdqa,0x00,0,3,2,xmm6)           //  movdqa  (%0,%3,2),%%xmm6
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movhlps   %%xmm6,%%xmm7                   \n"
    "punpcklbw %%xmm5,%%xmm6                   \n"
    "punpcklbw %%xmm5,%%xmm7                   \n"
    "paddusw   %%xmm6,%%xmm0                   \n"
    "paddusw   %%xmm7,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm6                   \n"
    "psrldq    $0x2,%%xmm0                     \n"
    "paddusw   %%xmm0,%%xmm6                   \n"
    "psrldq    $0x2,%%xmm0                     \n"
    "paddusw   %%xmm0,%%xmm6                   \n"
    "pshufb    %%xmm2,%%xmm6                   \n"
    "movdqa    %%xmm1,%%xmm7                   \n"
    "psrldq    $0x2,%%xmm1                     \n"
    "paddusw   %%xmm1,%%xmm7                   \n"
    "psrldq    $0x2,%%xmm1                     \n"
    "paddusw   %%xmm1,%%xmm7                   \n"
    "pshufb    %%xmm3,%%xmm7                   \n"
    "paddusw   %%xmm7,%%xmm6                   \n"
    "pmulhuw   %%xmm4,%%xmm6                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "sub       $0x6,%2                         \n"
    "movd      %%xmm6," MEMACCESS(1) "         \n"
    "psrlq     $0x10,%%xmm6                    \n"
    "movd      %%xmm6," MEMACCESS2(0x2,1) "    \n"
    "lea       " MEMLEA(0x6,1) ",%1            \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))   // %3
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}

void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                       uint16* dst_ptr, int src_width, int src_height) {
  int tmp_height = 0;
  intptr_t tmp_src = 0;
  asm volatile (
    "pxor      %%xmm4,%%xmm4                   \n"
    "sub       $0x1,%5                         \n"

    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "mov       %0,%3                           \n"
    "add       %6,%0                           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm4,%%xmm0                   \n"
    "punpckhbw %%xmm4,%%xmm1                   \n"
    "mov       %5,%2                           \n"
    "test      %2,%2                           \n"
    "je        3f                              \n"

    LABELALIGN
  "2:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm2         \n"
    "add       %6,%0                           \n"
    "movdqa    %%xmm2,%%xmm3                   \n"
    "punpcklbw %%xmm4,%%xmm2                   \n"
    "punpckhbw %%xmm4,%%xmm3                   \n"
    "paddusw   %%xmm2,%%xmm0                   \n"
    "paddusw   %%xmm3,%%xmm1                   \n"
    "sub       $0x1,%2                         \n"
    "jg        2b                              \n"

    LABELALIGN
  "3:                                          \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x10,3) ",%0           \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x10,%4                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(tmp_height),  // %2
    "+r"(tmp_src),     // %3
    "+r"(src_width),   // %4
    "+rm"(src_height)  // %5
  : "rm"((intptr_t)(src_stride))  // %6
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
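
// Illustrative C sketch added in this edit (not an upstream function): sums
// a column of src_height bytes into each uint16 output. The asm accumulates
// with paddusw, so tall columns saturate at 65535 instead of wrapping:
static void ScaleAddRows_C_Sketch(const uint8* src_ptr, ptrdiff_t src_stride,
                                  uint16* dst_ptr, int src_width,
                                  int src_height) {
  int x, y;
  for (x = 0; x < src_width; ++x) {
    const uint8* s = src_ptr + x;
    unsigned int sum = 0u;
    for (y = 0; y < src_height; ++y) {
      sum += s[0];
      s += src_stride;
    }
    dst_ptr[x] = (uint16)(sum < 65535u ? sum : 65535u);  // saturate.
  }
}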

// Bilinear column filtering. SSSE3 version.
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                           int dst_width, int x, int dx) {
  intptr_t x0 = 0, x1 = 0, temp_pixel = 0;
  asm volatile (
    "movd      %6,%%xmm2                       \n"
    "movd      %7,%%xmm3                       \n"
    "movl      $0x04040000,%k2                 \n"
    "movd      %k2,%%xmm5                      \n"
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "psrlw     $0x9,%%xmm6                     \n"
    "pextrw    $0x1,%%xmm2,%k3                 \n"
    "subl      $0x2,%5                         \n"
    "jl        29f                             \n"
    "movdqa    %%xmm2,%%xmm0                   \n"
    "paddd     %%xmm3,%%xmm0                   \n"
    "punpckldq %%xmm0,%%xmm2                   \n"
    "punpckldq %%xmm3,%%xmm3                   \n"
    "paddd     %%xmm3,%%xmm3                   \n"
    "pextrw    $0x3,%%xmm2,%k4                 \n"

    LABELALIGN
  "2:                                          \n"
    "movdqa    %%xmm2,%%xmm1                   \n"
    "paddd     %%xmm3,%%xmm2                   \n"
    MEMOPARG(movzwl,0x00,1,3,1,k2)             //  movzwl  (%1,%3,1),%k2
    "movd      %k2,%%xmm0                      \n"
    "psrlw     $0x9,%%xmm1                     \n"
    BUNDLEALIGN
    MEMOPARG(movzwl,0x00,1,4,1,k2)             //  movzwl  (%1,%4,1),%k2
    "movd      %k2,%%xmm4                      \n"
    "pshufb    %%xmm5,%%xmm1                   \n"
    "punpcklwd %%xmm4,%%xmm0                   \n"
    "pxor      %%xmm6,%%xmm1                   \n"
    "pmaddubsw %%xmm1,%%xmm0                   \n"
    "pextrw    $0x1,%%xmm2,%k3                 \n"
    "pextrw    $0x3,%%xmm2,%k4                 \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movd      %%xmm0,%k2                      \n"
    "mov       %w2," MEMACCESS(0) "            \n"
    "lea       " MEMLEA(0x2,0) ",%0            \n"
    "sub       $0x2,%5                         \n"
    "jge       2b                              \n"

    LABELALIGN
  "29:                                         \n"
    "addl      $0x1,%5                         \n"
    "jl        99f                             \n"
    MEMOPARG(movzwl,0x00,1,3,1,k2)             //  movzwl  (%1,%3,1),%k2
    "movd      %k2,%%xmm0                      \n"
    "psrlw     $0x9,%%xmm2                     \n"
    "pshufb    %%xmm5,%%xmm2                   \n"
    "pxor      %%xmm6,%%xmm2                   \n"
    "pmaddubsw %%xmm2,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movd      %%xmm0,%k2                      \n"
    "mov       %b2," MEMACCESS(0) "            \n"
  "99:                                         \n"
  : "+r"(dst_ptr),     // %0
    "+r"(src_ptr),     // %1
    "+a"(temp_pixel),  // %2
    "+r"(x0),          // %3
    "+r"(x1),          // %4
    "+rm"(dst_width)   // %5
  : "rm"(x),           // %6
    "rm"(dx)           // %7
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
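
// Illustrative C sketch added in this edit (not an upstream function):
// bilinear blend of two neighboring pixels driven by the 16.16 fixed-point
// position x and step dx. The SSSE3 loop above does the same blend two
// pixels at a time with a 7-bit fraction:
static void ScaleFilterCols_C_Sketch(uint8* dst_ptr, const uint8* src_ptr,
                                     int dst_width, int x, int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;       // integer source position.
    int frac = x & 0xffff;  // fractional part in 16.16.
    int a = src_ptr[xi];
    int b = src_ptr[xi + 1];
    dst_ptr[j] = (uint8)((a * (65536 - frac) + b * frac) >> 16);
    x += dx;
  }
}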

// Reads 16 pixels, duplicates them and writes 32 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                       int dst_width, int x, int dx) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "punpckhbw %%xmm1,%%xmm1                   \n"
    "sub       $0x20,%2                        \n"
    "movdqa    %%xmm0," MEMACCESS(0) "         \n"
    "movdqa    %%xmm1," MEMACCESS2(0x10,0) "   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "jg        1b                              \n"

  : "+r"(dst_ptr),     // %0
    "+r"(src_ptr),     // %1
    "+r"(dst_width)    // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
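
// Illustrative C sketch added in this edit (not an upstream function): 2x
// horizontal upsample by duplication, which punpcklbw/punpckhbw perform 16
// source pixels at a time:
static void ScaleColsUp2_C_Sketch(uint8* dst_ptr, const uint8* src_ptr,
                                  int dst_width) {
  int j;
  for (j = 0; j < dst_width - 1; j += 2) {
    dst_ptr[j] = dst_ptr[j + 1] = src_ptr[j >> 1];  // write each pixel twice.
  }
}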

void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                            ptrdiff_t src_stride,
                            uint8* dst_argb, int dst_width) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "shufps    $0xdd,%%xmm1,%%xmm0             \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(dst_width)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}

void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8* dst_argb, int dst_width) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm2             \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(dst_width)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}

void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               uint8* dst_argb, int dst_width) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    BUNDLEALIGN
    MEMOPREG(movdqa,0x00,0,3,1,xmm2)           //  movdqa   (%0,%3,1),%%xmm2
    MEMOPREG(movdqa,0x10,0,3,1,xmm3)           //  movdqa   0x10(%0,%3,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm2             \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))   // %3
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
#endif
  );
}
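
// Illustrative C sketch added in this edit (not an upstream function): a
// per-channel 2x2 box average of ARGB pixels. The shufps pair above
// separates even and odd pixels so pavgb can average them channel-wise:
static void ScaleARGBRowDown2Box_C_Sketch(const uint8* src_argb,
                                          ptrdiff_t src_stride,
                                          uint8* dst_argb, int dst_width) {
  int x, c;
  for (x = 0; x < dst_width; ++x) {
    for (c = 0; c < 4; ++c) {  // blend B, G, R, A independently.
      dst_argb[x * 4 + c] =
          (uint8)((src_argb[x * 8 + c] + src_argb[x * 8 + c + 4] +
                   src_argb[x * 8 + src_stride + c] +
                   src_argb[x * 8 + src_stride + c + 4] + 2) >> 2);
    }
  }
}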

// Reads 4 pixels at a time.
// Alignment requirement: dst_argb 16 byte aligned.
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
                               int src_stepx,
                               uint8* dst_argb, int dst_width) {
  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
  intptr_t src_stepx_x12 = 0;
  asm volatile (
    "lea       " MEMLEA3(0x00,1,4) ",%1        \n"
    "lea       " MEMLEA4(0x00,1,1,2) ",%4      \n"
    LABELALIGN
  "1:                                          \n"
    "movd      " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movd,0x00,0,1,1,xmm1)             //  movd      (%0,%1,1),%%xmm1
    "punpckldq %%xmm1,%%xmm0                   \n"
    BUNDLEALIGN
    MEMOPREG(movd,0x00,0,1,2,xmm2)             //  movd      (%0,%1,2),%%xmm2
    MEMOPREG(movd,0x00,0,4,1,xmm3)             //  movd      (%0,%4,1),%%xmm3
    "lea       " MEMLEA4(0x00,0,1,4) ",%0      \n"
    "punpckldq %%xmm3,%%xmm2                   \n"
    "punpcklqdq %%xmm2,%%xmm0                  \n"
    "sub       $0x4,%3                         \n"
    "movdqa    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),      // %0
    "+r"(src_stepx_x4),  // %1
    "+r"(dst_argb),      // %2
    "+r"(dst_width),     // %3
    "+r"(src_stepx_x12)  // %4
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
#endif
  );
}
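
// Illustrative C sketch added in this edit (not an upstream function):
// copies every src_stepx-th ARGB pixel; the four movd loads above gather 4
// such pixels per iteration:
static void ScaleARGBRowDownEven_C_Sketch(const uint8* src_argb,
                                          int src_stepx,
                                          uint8* dst_argb, int dst_width) {
  const uint32* src = (const uint32*)(src_argb);  // 1 uint32 per ARGB pixel.
  uint32* dst = (uint32*)(dst_argb);
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = src[x * src_stepx];
  }
}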

// Blends four 2x2 to 4x1.
// Alignment requirement: dst_argb 16 byte aligned.
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride, int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
  intptr_t src_stepx_x12 = 0;
  intptr_t row1 = (intptr_t)(src_stride);
  asm volatile (
    "lea       " MEMLEA3(0x00,1,4) ",%1        \n"
    "lea       " MEMLEA4(0x00,1,1,2) ",%4      \n"
    "lea       " MEMLEA4(0x00,0,5,1) ",%5      \n"

    LABELALIGN
  "1:                                          \n"
    "movq      " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movhps,0x00,0,1,1,xmm0)           //  movhps    (%0,%1,1),%%xmm0
    MEMOPREG(movq,0x00,0,1,2,xmm1)             //  movq      (%0,%1,2),%%xmm1
    BUNDLEALIGN
    MEMOPREG(movhps,0x00,0,4,1,xmm1)           //  movhps    (%0,%4,1),%%xmm1
    "lea       " MEMLEA4(0x00,0,1,4) ",%0      \n"
    "movq      " MEMACCESS(5) ",%%xmm2         \n"
    BUNDLEALIGN
    MEMOPREG(movhps,0x00,5,1,1,xmm2)           //  movhps    (%5,%1,1),%%xmm2
    MEMOPREG(movq,0x00,5,1,2,xmm3)             //  movq      (%5,%1,2),%%xmm3
    MEMOPREG(movhps,0x00,5,4,1,xmm3)           //  movhps    (%5,%4,1),%%xmm3
    "lea       " MEMLEA4(0x00,5,1,4) ",%5      \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm2             \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "sub       $0x4,%3                         \n"
    "movdqa    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),       // %0
    "+r"(src_stepx_x4),   // %1
    "+r"(dst_argb),       // %2
    "+rm"(dst_width),     // %3
    "+r"(src_stepx_x12),  // %4
    "+r"(row1)            // %5
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
#endif
  );
}

void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
                        int dst_width, int x, int dx) {
  intptr_t x0 = 0, x1 = 0;
  asm volatile (
    "movd      %5,%%xmm2                       \n"
    "movd      %6,%%xmm3                       \n"
    "pshufd    $0x0,%%xmm2,%%xmm2              \n"
    "pshufd    $0x11,%%xmm3,%%xmm0             \n"
    "paddd     %%xmm0,%%xmm2                   \n"
    "paddd     %%xmm3,%%xmm3                   \n"
    "pshufd    $0x5,%%xmm3,%%xmm0              \n"
    "paddd     %%xmm0,%%xmm2                   \n"
    "paddd     %%xmm3,%%xmm3                   \n"
    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
    "pextrw    $0x1,%%xmm2,%k0                 \n"
    "pextrw    $0x3,%%xmm2,%k1                 \n"
    "cmp       $0x0,%4                         \n"
    "jl        99f                             \n"
    "sub       $0x4,%4                         \n"
    "jl        49f                             \n"

    LABELALIGN
  "40:                                         \n"
    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
    MEMOPREG(movd,0x00,3,1,4,xmm1)             //  movd      (%3,%1,4),%%xmm1
    "pextrw    $0x5,%%xmm2,%k0                 \n"
    "pextrw    $0x7,%%xmm2,%k1                 \n"
    "paddd     %%xmm3,%%xmm2                   \n"
    "punpckldq %%xmm1,%%xmm0                   \n"
    MEMOPREG(movd,0x00,3,0,4,xmm1)             //  movd      (%3,%0,4),%%xmm1
    MEMOPREG(movd,0x00,3,1,4,xmm4)             //  movd      (%3,%1,4),%%xmm4
    "pextrw    $0x1,%%xmm2,%k0                 \n"
    "pextrw    $0x3,%%xmm2,%k1                 \n"
    "punpckldq %%xmm4,%%xmm1                   \n"
    "punpcklqdq %%xmm1,%%xmm0                  \n"
    "sub       $0x4,%4                         \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "jge       40b                             \n"

  "49:                                         \n"
    "test      $0x2,%4                         \n"
    "je        29f                             \n"
    BUNDLEALIGN
    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
    MEMOPREG(movd,0x00,3,1,4,xmm1)             //  movd      (%3,%1,4),%%xmm1
    "pextrw    $0x5,%%xmm2,%k0                 \n"
    "punpckldq %%xmm1,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x8,2) ",%2            \n"
  "29:                                         \n"
    "test      $0x1,%4                         \n"
    "je        99f                             \n"
    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
    "movd      %%xmm0," MEMACCESS(2) "         \n"
  "99:                                         \n"
  : "+a"(x0),          // %0
    "+d"(x1),          // %1
    "+r"(dst_argb),    // %2
    "+r"(src_argb),    // %3
    "+r"(dst_width)    // %4
  : "rm"(x),           // %5
    "rm"(dx)           // %6
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}

// Reads 4 pixels, duplicates them and writes 8 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
                           int dst_width, int x, int dx) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpckldq %%xmm0,%%xmm0                   \n"
    "punpckhdq %%xmm1,%%xmm1                   \n"
    "sub       $0x8,%2                         \n"
    "movdqa    %%xmm0," MEMACCESS(0) "         \n"
    "movdqa    %%xmm1," MEMACCESS2(0x10,0) "   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "jg        1b                              \n"

  : "+r"(dst_argb),    // %0
    "+r"(src_argb),    // %1
    "+r"(dst_width)    // %2
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}

// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static uvec8 kShuffleColARGB = {
  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
static uvec8 kShuffleFractions = {
  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};

// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
                               int dst_width, int x, int dx) {
  intptr_t x0 = 0, x1 = 0;
  asm volatile (
    "movdqa    %0,%%xmm4                       \n"
    "movdqa    %1,%%xmm5                       \n"
  :
  : "m"(kShuffleColARGB),  // %0
    "m"(kShuffleFractions)  // %1
  );

  asm volatile (
    "movd      %5,%%xmm2                       \n"
    "movd      %6,%%xmm3                       \n"
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "psrlw     $0x9,%%xmm6                     \n"
    "pextrw    $0x1,%%xmm2,%k3                 \n"
    "sub       $0x2,%2                         \n"
    "jl        29f                             \n"
    "movdqa    %%xmm2,%%xmm0                   \n"
    "paddd     %%xmm3,%%xmm0                   \n"
    "punpckldq %%xmm0,%%xmm2                   \n"
    "punpckldq %%xmm3,%%xmm3                   \n"
    "paddd     %%xmm3,%%xmm3                   \n"
    "pextrw    $0x3,%%xmm2,%k4                 \n"

    LABELALIGN
  "2:                                          \n"
    "movdqa    %%xmm2,%%xmm1                   \n"
    "paddd     %%xmm3,%%xmm2                   \n"
    MEMOPREG(movq,0x00,1,3,4,xmm0)             //  movq      (%1,%3,4),%%xmm0
    "psrlw     $0x9,%%xmm1                     \n"
    BUNDLEALIGN
    MEMOPREG(movhps,0x00,1,4,4,xmm0)           //  movhps    (%1,%4,4),%%xmm0
    "pshufb    %%xmm5,%%xmm1                   \n"
    "pshufb    %%xmm4,%%xmm0                   \n"
    "pxor      %%xmm6,%%xmm1                   \n"
    "pmaddubsw %%xmm1,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "pextrw    $0x1,%%xmm2,%k3                 \n"
    "pextrw    $0x3,%%xmm2,%k4                 \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(0) "         \n"
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "sub       $0x2,%2                         \n"
    "jge       2b                              \n"

    LABELALIGN
  "29:                                         \n"
    "add       $0x1,%2                         \n"
    "jl        99f                             \n"
    "psrlw     $0x9,%%xmm2                     \n"
    BUNDLEALIGN
    MEMOPREG(movq,0x00,1,3,4,xmm0)             //  movq      (%1,%3,4),%%xmm0
    "pshufb    %%xmm5,%%xmm2                   \n"
    "pshufb    %%xmm4,%%xmm0                   \n"
    "pxor      %%xmm6,%%xmm2                   \n"
    "pmaddubsw %%xmm2,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movd      %%xmm0," MEMACCESS(0) "         \n"

    LABELALIGN
  "99:                                         \n"
  : "+r"(dst_argb),    // %0
    "+r"(src_argb),    // %1
    "+rm"(dst_width),  // %2
    "+r"(x0),          // %3
    "+r"(x1)           // %4
  : "rm"(x),           // %5
    "rm"(dx)           // %6
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}

// Divide num by div and return as 16.16 fixed point result.
int FixedDiv_X86(int num, int div) {
  asm volatile (
    "cdq                                       \n"
    "shld      $0x10,%%eax,%%edx               \n"
    "shl       $0x10,%%eax                     \n"
    "idiv      %1                              \n"
    "mov       %0, %%eax                       \n"
    : "+a"(num)  // %0
    : "c"(div)   // %1
    : "memory", "cc", "edx"
  );
  return num;
}
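
// Illustrative C equivalent added in this edit (not an upstream function) of
// the shld/shl/idiv sequence above: widen to 64 bits, shift up 16, divide.
static int FixedDiv_C_Sketch(int num, int div) {
  return (int)(((int64)(num) << 16) / div);
}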

// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
int FixedDiv1_X86(int num, int div) {
  asm volatile (
    "cdq                                       \n"
    "shld      $0x10,%%eax,%%edx               \n"
    "shl       $0x10,%%eax                     \n"
    "sub       $0x10001,%%eax                  \n"
    "sbb       $0x0,%%edx                      \n"
    "sub       $0x1,%1                         \n"
    "idiv      %1                              \n"
    "mov       %0, %%eax                       \n"
    : "+a"(num)  // %0
    : "c"(div)   // %1
    : "memory", "cc", "edx"
  );
  return num;
}
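
// Illustrative C equivalent added in this edit (not an upstream function):
// the sub/sbb pair subtracts 0x10001 from the widened numerator before
// dividing by div - 1.
static int FixedDiv1_C_Sketch(int num, int div) {
  return (int)((((int64)(num) << 16) - 0x00010001) / (div - 1));
}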

#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif