src/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_win.cc

   1 /*
   2  *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
   3  *
   4  *  Use of this source code is governed by a BSD-style license
   5  *  that can be found in the LICENSE file in the root of the source
   6  *  tree. An additional intellectual property rights grant can be found
   7  *  in the file PATENTS. All contributing project authors may
   8  *  be found in the AUTHORS file in the root of the source tree.
   9  */
  10
  11 #include "libyuv/row.h"
  12
  13 #ifdef __cplusplus
  14 namespace libyuv {
  15 extern "C" {
  16 #endif
  17
  18 // This module is for Visual C x86.
  19 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
  20
  21 // Offsets for source bytes 0 to 9
     // kShuf0/1/2 are loaded as pshufb masks by ScaleRowDown34_SSSE3. An index
     // of 128 (high bit set) makes pshufb write a zero byte to that lane.
  22 static uvec8 kShuf0 =
  23   { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
  24
  25 // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
  26 static uvec8 kShuf1 =
  27   { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
  28
  29 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
  30 static uvec8 kShuf2 =
  31   { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
  32
  33 // Offsets for source bytes 0 to 10
     // kShuf01/11/21 pair up neighbouring bytes for the pmaddubsw in the
     // ScaleRowDown34_*_Box_SSSE3 filters.
  34 static uvec8 kShuf01 =
  35   { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
  36
  37 // Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
  38 static uvec8 kShuf11 =
  39   { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
  40
  41 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
  42 static uvec8 kShuf21 =
  43   { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
  44
  45 // Coefficients for source bytes 0 to 10
     // Each pair of weights sums to 4; the box filters add kRound34 and then
     // shift right by 2.
  46 static uvec8 kMadd01 =
  47   { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
  48
  49 // Coefficients for source bytes 10 to 21
  50 static uvec8 kMadd11 =
  51   { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
  52
  53 // Coefficients for source bytes 21 to 31
  54 static uvec8 kMadd21 =
  55   { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
  56
  57 // Rounding term (2 = half of 4) added to each word before the >> 2.
  58 static vec16 kRound34 =
  59   { 2, 2, 2, 2, 2, 2, 2, 2 };
  60
     // Picks 6 of 16 source bytes into output bytes 0..5 (ScaleRowDown38_SSSE3).
  61 static uvec8 kShuf38a =
  62   { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
  63
     // Same selection as kShuf38a, placed in output bytes 6..11.
  64 static uvec8 kShuf38b =
  65   { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
  66
  67 // Arrange words 0,3,6 into 0,1,2
  68 static uvec8 kShufAc =
  69   { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
  70
  71 // Arrange words 0,3,6 into 3,4,5
  72 static uvec8 kShufAc3 =
  73   { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
  74
  75 // Scaling values for boxes of 3x3 and 2x3
     // Used with pmulhuw (high 16 bits of the product), i.e. sum * (65536 / n) >> 16.
  76 static uvec16 kScaleAc33 =
  77   { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
  78
  79 // Arrange first value for pixels 0,1,2,3,4,5
  80 static uvec8 kShufAb0 =
  81   { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
  82
  83 // Arrange second value for pixels 0,1,2,3,4,5
  84 static uvec8 kShufAb1 =
  85   { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
  86
  87 // Arrange third value for pixels 0,1,2,3,4,5
     // Pixels 2 and 5 have no third value (2-wide boxes), so those lanes are zeroed.
  88 static uvec8 kShufAb2 =
  89   { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
  90
  91 // Scaling values for boxes of 3x2 and 2x2
     // Rows are already averaged with pavgb, so only the column count is divided out.
  92 static uvec16 kScaleAb2 =
  93   { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
  94
  95 // Reads 32 pixels, throws half away and writes 16 pixels.
     // Point sampler: keeps the odd-indexed source bytes (1, 3, 5, ...).
     // Every pass stores a full 16 bytes and at least one pass always runs, so
     // dst_width is presumably a positive multiple of 16 - confirm with callers.
  96 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
  97 __declspec(naked) __declspec(align(16))
  98 void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
  99                         uint8* dst_ptr, int dst_width) {
 100   __asm {
 101     mov        eax, [esp + 4]        // src_ptr
 102                                      // src_stride ignored
 103     mov        edx, [esp + 12]       // dst_ptr
 104     mov        ecx, [esp + 16]       // dst_width
 105
 106     align      4
 107   wloop:
 108     movdqa     xmm0, [eax]
 109     movdqa     xmm1, [eax + 16]
 110     lea        eax,  [eax + 32]
 111     psrlw      xmm0, 8               // isolate odd pixels.
 112     psrlw      xmm1, 8
 113     packuswb   xmm0, xmm1            // 16 words -> 16 bytes
 114     sub        ecx, 16               // sets the flags tested by jg below
 115     movdqa     [edx], xmm0
 116     lea        edx, [edx + 16]       // lea/movdqa leave the flags untouched
 117     jg         wloop
 118
 119     ret
 120   }
 121 }
 122
 123 // Blends 32x1 rectangle to 16x1.
     // Each output byte is the pavgw (rounded) average of one even/odd byte pair.
     // Every pass stores a full 16 bytes and at least one pass always runs, so
     // dst_width is presumably a positive multiple of 16 - confirm with callers.
 124 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
 125 __declspec(naked) __declspec(align(16))
 126 void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
 127                               uint8* dst_ptr, int dst_width) {
 128   __asm {
 129     mov        eax, [esp + 4]        // src_ptr
 130                                      // src_stride ignored
 131     mov        edx, [esp + 12]       // dst_ptr
 132     mov        ecx, [esp + 16]       // dst_width
 133     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
 134     psrlw      xmm5, 8
 135
 136     align      4
 137   wloop:
 138     movdqa     xmm0, [eax]
 139     movdqa     xmm1, [eax + 16]
 140     lea        eax,  [eax + 32]
 141
 142     movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
 143     psrlw      xmm0, 8               // odd pixels as words
 144     movdqa     xmm3, xmm1
 145     psrlw      xmm1, 8
 146     pand       xmm2, xmm5            // even pixels as words
 147     pand       xmm3, xmm5
 148     pavgw      xmm0, xmm2
 149     pavgw      xmm1, xmm3
 150     packuswb   xmm0, xmm1
 151
 152     sub        ecx, 16               // sets the flags tested by jg below
 153     movdqa     [edx], xmm0
 154     lea        edx, [edx + 16]
 155     jg         wloop
 156
 157     ret
 158   }
 159 }
 160
 161 // Blends 32x2 rectangle to 16x1.
     // The second row is read at src_ptr + src_stride. Rows are averaged first
     // (pavgb), then adjacent columns (pavgw); both averages round up.
     // Every pass stores a full 16 bytes and at least one pass always runs, so
     // dst_width is presumably a positive multiple of 16 - confirm with callers.
 162 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
 163 __declspec(naked) __declspec(align(16))
 164 void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
 165                            uint8* dst_ptr, int dst_width) {
 166   __asm {
 167     push       esi                   // saved register; arguments shift by 4
 168     mov        eax, [esp + 4 + 4]    // src_ptr
 169     mov        esi, [esp + 4 + 8]    // src_stride
 170     mov        edx, [esp + 4 + 12]   // dst_ptr
 171     mov        ecx, [esp + 4 + 16]   // dst_width
 172     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
 173     psrlw      xmm5, 8
 174
 175     align      4
 176   wloop:
 177     movdqa     xmm0, [eax]
 178     movdqa     xmm1, [eax + 16]
 179     movdqa     xmm2, [eax + esi]     // second row; movdqa needs it 16 byte aligned too
 180     movdqa     xmm3, [eax + esi + 16]
 181     lea        eax,  [eax + 32]
 182     pavgb      xmm0, xmm2            // average rows
 183     pavgb      xmm1, xmm3
 184
 185     movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
 186     psrlw      xmm0, 8
 187     movdqa     xmm3, xmm1
 188     psrlw      xmm1, 8
 189     pand       xmm2, xmm5
 190     pand       xmm3, xmm5
 191     pavgw      xmm0, xmm2
 192     pavgw      xmm1, xmm3
 193     packuswb   xmm0, xmm1
 194
 195     sub        ecx, 16               // sets the flags tested by jg below
 196     movdqa     [edx], xmm0
 197     lea        edx, [edx + 16]
 198     jg         wloop
 199
 200     pop        esi
 201     ret
 202   }
 203 }
 204
 205 // Reads 32 pixels, throws half away and writes 16 pixels.
 206 // No alignment requirement: all loads and stores use movdqu.
     // Point sampler: keeps the odd-indexed source bytes (1, 3, 5, ...).
     // Every pass stores a full 16 bytes and at least one pass always runs, so
     // dst_width is presumably a positive multiple of 16 - confirm with callers.
 207 __declspec(naked) __declspec(align(16))
 208 void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
 209                                   ptrdiff_t src_stride,
 210                                   uint8* dst_ptr, int dst_width) {
 211   __asm {
 212     mov        eax, [esp + 4]        // src_ptr
 213                                      // src_stride ignored
 214     mov        edx, [esp + 12]       // dst_ptr
 215     mov        ecx, [esp + 16]       // dst_width
 216
 217     align      4
 218   wloop:
 219     movdqu     xmm0, [eax]
 220     movdqu     xmm1, [eax + 16]
 221     lea        eax,  [eax + 32]
 222     psrlw      xmm0, 8               // isolate odd pixels.
 223     psrlw      xmm1, 8
 224     packuswb   xmm0, xmm1            // 16 words -> 16 bytes
 225     sub        ecx, 16               // sets the flags tested by jg below
 226     movdqu     [edx], xmm0
 227     lea        edx, [edx + 16]
 228     jg         wloop
 229
 230     ret
 231   }
 232 }
 233
 234 // Blends 32x1 rectangle to 16x1.
 235 // No alignment requirement: all loads and stores use movdqu.
     // Each output byte is the pavgw (rounded) average of one even/odd byte pair.
     // Every pass stores a full 16 bytes and at least one pass always runs, so
     // dst_width is presumably a positive multiple of 16 - confirm with callers.
 236 __declspec(naked) __declspec(align(16))
 237 void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
 238                                         ptrdiff_t src_stride,
 239                                         uint8* dst_ptr, int dst_width) {
 240   __asm {
 241     mov        eax, [esp + 4]        // src_ptr
 242                                      // src_stride ignored
 243     mov        edx, [esp + 12]       // dst_ptr
 244     mov        ecx, [esp + 16]       // dst_width
 245     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
 246     psrlw      xmm5, 8
 247
 248     align      4
 249   wloop:
 250     movdqu     xmm0, [eax]
 251     movdqu     xmm1, [eax + 16]
 252     lea        eax,  [eax + 32]
 253
 254     movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
 255     psrlw      xmm0, 8               // odd pixels as words
 256     movdqa     xmm3, xmm1
 257     psrlw      xmm1, 8
 258     pand       xmm2, xmm5            // even pixels as words
 259     pand       xmm3, xmm5
 260     pavgw      xmm0, xmm2
 261     pavgw      xmm1, xmm3
 262     packuswb   xmm0, xmm1
 263
 264     sub        ecx, 16               // sets the flags tested by jg below
 265     movdqu     [edx], xmm0
 266     lea        edx, [edx + 16]
 267     jg         wloop
 268
 269     ret
 270   }
 271 }
 272
 273 // Blends 32x2 rectangle to 16x1.
 274 // No alignment requirement: all loads and stores use movdqu.
     // The second row is read at src_ptr + src_stride. Rows are averaged first
     // (pavgb), then adjacent columns (pavgw); both averages round up.
     // Every pass stores a full 16 bytes and at least one pass always runs, so
     // dst_width is presumably a positive multiple of 16 - confirm with callers.
 275 __declspec(naked) __declspec(align(16))
 276 void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
 277                                      ptrdiff_t src_stride,
 278                                      uint8* dst_ptr, int dst_width) {
 279   __asm {
 280     push       esi                   // saved register; arguments shift by 4
 281     mov        eax, [esp + 4 + 4]    // src_ptr
 282     mov        esi, [esp + 4 + 8]    // src_stride
 283     mov        edx, [esp + 4 + 12]   // dst_ptr
 284     mov        ecx, [esp + 4 + 16]   // dst_width
 285     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
 286     psrlw      xmm5, 8
 287
 288     align      4
 289   wloop:
 290     movdqu     xmm0, [eax]
 291     movdqu     xmm1, [eax + 16]
 292     movdqu     xmm2, [eax + esi]     // second row
 293     movdqu     xmm3, [eax + esi + 16]
 294     lea        eax,  [eax + 32]
 295     pavgb      xmm0, xmm2            // average rows
 296     pavgb      xmm1, xmm3
 297
 298     movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
 299     psrlw      xmm0, 8
 300     movdqa     xmm3, xmm1
 301     psrlw      xmm1, 8
 302     pand       xmm2, xmm5
 303     pand       xmm3, xmm5
 304     pavgw      xmm0, xmm2
 305     pavgw      xmm1, xmm3
 306     packuswb   xmm0, xmm1
 307
 308     sub        ecx, 16               // sets the flags tested by jg below
 309     movdqu     [edx], xmm0
 310     lea        edx, [edx + 16]
 311     jg         wloop
 312
 313     pop        esi
 314     ret
 315   }
 316 }
 317
 318 // Point samples 32 pixels to 8 pixels.
     // Keeps byte 2 of every group of 4 source bytes (source bytes 2, 6, 10, ...).
     // Every pass stores a full 8 bytes and at least one pass always runs, so
     // dst_width is presumably a positive multiple of 8 - confirm with callers.
 319 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
 320 __declspec(naked) __declspec(align(16))
 321 void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
 322                         uint8* dst_ptr, int dst_width) {
 323   __asm {
 324     mov        eax, [esp + 4]        // src_ptr
 325                                      // src_stride ignored
 326     mov        edx, [esp + 12]       // dst_ptr
 327     mov        ecx, [esp + 16]       // dst_width
 328     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff0000
 329     psrld      xmm5, 24
 330     pslld      xmm5, 16
 331
 332     align      4
 333   wloop:
 334     movdqa     xmm0, [eax]
 335     movdqa     xmm1, [eax + 16]
 336     lea        eax,  [eax + 32]
 337     pand       xmm0, xmm5            // keep byte 2 of each dword
 338     pand       xmm1, xmm5
 339     packuswb   xmm0, xmm1            // kept byte is now the high byte of each word
 340     psrlw      xmm0, 8               // move it to the low byte
 341     packuswb   xmm0, xmm0            // 8 words -> 8 bytes in the low qword
 342     sub        ecx, 8                // sets the flags tested by jg below
 343     movq       qword ptr [edx], xmm0
 344     lea        edx, [edx + 8]
 345     jg         wloop
 346
 347     ret
 348   }
 349 }
 350
 351 // Blends 32x4 rectangle to 8x1.
     // Rows at src_ptr + {0, 1, 2, 3} * src_stride are averaged pairwise with
     // pavgb, then columns are halved twice with pavgw. Every step rounds up.
     // Every pass stores a full 8 bytes and at least one pass always runs, so
     // dst_width is presumably a positive multiple of 8 - confirm with callers.
 352 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
 353 __declspec(naked) __declspec(align(16))
 354 void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
 355                            uint8* dst_ptr, int dst_width) {
 356   __asm {
 357     push       esi                   // two saved registers; arguments shift by 8
 358     push       edi
 359     mov        eax, [esp + 8 + 4]    // src_ptr
 360     mov        esi, [esp + 8 + 8]    // src_stride
 361     mov        edx, [esp + 8 + 12]   // dst_ptr
 362     mov        ecx, [esp + 8 + 16]   // dst_width
 363     lea        edi, [esi + esi * 2]  // src_stride * 3
 364     pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
 365     psrlw      xmm7, 8
 366
 367     align      4
 368   wloop:
 369     movdqa     xmm0, [eax]           // row 0
 370     movdqa     xmm1, [eax + 16]
 371     movdqa     xmm2, [eax + esi]     // row 1
 372     movdqa     xmm3, [eax + esi + 16]
 373     pavgb      xmm0, xmm2            // average rows
 374     pavgb      xmm1, xmm3
 375     movdqa     xmm2, [eax + esi * 2] // row 2
 376     movdqa     xmm3, [eax + esi * 2 + 16]
 377     movdqa     xmm4, [eax + edi]     // row 3
 378     movdqa     xmm5, [eax + edi + 16]
 379     lea        eax, [eax + 32]
 380     pavgb      xmm2, xmm4            // average rows 2 and 3
 381     pavgb      xmm3, xmm5
 382     pavgb      xmm0, xmm2            // average the two row pairs
 383     pavgb      xmm1, xmm3
 384
 385     movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
 386     psrlw      xmm0, 8
 387     movdqa     xmm3, xmm1
 388     psrlw      xmm1, 8
 389     pand       xmm2, xmm7
 390     pand       xmm3, xmm7
 391     pavgw      xmm0, xmm2
 392     pavgw      xmm1, xmm3
 393     packuswb   xmm0, xmm1
 394
 395     movdqa     xmm2, xmm0            // average columns (16 to 8 pixels)
 396     psrlw      xmm0, 8
 397     pand       xmm2, xmm7
 398     pavgw      xmm0, xmm2
 399     packuswb   xmm0, xmm0
 400
 401     sub        ecx, 8                // sets the flags tested by jg below
 402     movq       qword ptr [edx], xmm0
 403     lea        edx, [edx + 8]
 404     jg         wloop
 405
 406     pop        edi
 407     pop        esi
 408     ret
 409   }
 410 }
 411
 412 // Point samples 32 pixels to 24 pixels.
 413 // Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
 414 // Then shuffled to do the scaling.
 415
 416 // Note that movdqa+palign may be better than movdqu.
     // This routine uses the movdqa + palignr form for the middle 8 outputs.
     // Every pass stores 24 bytes and at least one pass always runs, so
     // dst_width is presumably a positive multiple of 24 - confirm with callers.
 417 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
 418 __declspec(naked) __declspec(align(16))
 419 void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
 420                           uint8* dst_ptr, int dst_width) {
 421   __asm {
 422     mov        eax, [esp + 4]        // src_ptr
 423                                      // src_stride ignored
 424     mov        edx, [esp + 12]       // dst_ptr
 425     mov        ecx, [esp + 16]       // dst_width
 426     movdqa     xmm3, kShuf0
 427     movdqa     xmm4, kShuf1
 428     movdqa     xmm5, kShuf2
 429
 430     align      4
 431   wloop:
 432     movdqa     xmm0, [eax]           // source bytes 0..15
 433     movdqa     xmm1, [eax + 16]      // source bytes 16..31
 434     lea        eax,  [eax + 32]
 435     movdqa     xmm2, xmm1
 436     palignr    xmm1, xmm0, 8         // source bytes 8..23
 437     pshufb     xmm0, xmm3
 438     pshufb     xmm1, xmm4
 439     pshufb     xmm2, xmm5
 440     movq       qword ptr [edx], xmm0
 441     movq       qword ptr [edx + 8], xmm1
 442     movq       qword ptr [edx + 16], xmm2
 443     lea        edx, [edx + 24]
 444     sub        ecx, 24               // sets the flags tested by jg below
 445     jg         wloop
 446
 447     ret
 448   }
 449 }
 450
 451 // Blends 32x2 rectangle to 24x1
 452 // Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
 453 // Then shuffled to do the scaling.
     // The two rows (src_ptr and src_ptr + src_stride) are weighted equally with
     // a single pavgb. Columns are filtered as (a * w0 + b * w1 + 2) >> 2 using
     // the kMadd weights and kRound34.
 454
 455 // Register usage:
 456 // xmm0 src_row 0
 457 // xmm1 src_row 1
 458 // xmm2 shuf 0
 459 // xmm3 shuf 1
 460 // xmm4 shuf 2
 461 // xmm5 madd 0
 462 // xmm6 madd 1
 463 // xmm7 kRound34
     // kMadd21 has no register left, so it is reloaded into xmm1 on each pass.
 464
 465 // Note that movdqa+palign may be better than movdqu.
 466 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
 467 __declspec(naked) __declspec(align(16))
 468 void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
 469                                 ptrdiff_t src_stride,
 470                                 uint8* dst_ptr, int dst_width) {
 471   __asm {
 472     push       esi                   // saved register; arguments shift by 4
 473     mov        eax, [esp + 4 + 4]    // src_ptr
 474     mov        esi, [esp + 4 + 8]    // src_stride
 475     mov        edx, [esp + 4 + 12]   // dst_ptr
 476     mov        ecx, [esp + 4 + 16]   // dst_width
 477     movdqa     xmm2, kShuf01
 478     movdqa     xmm3, kShuf11
 479     movdqa     xmm4, kShuf21
 480     movdqa     xmm5, kMadd01
 481     movdqa     xmm6, kMadd11
 482     movdqa     xmm7, kRound34
 483
 484     align      4
 485   wloop:
 486     movdqa     xmm0, [eax]           // pixels 0..7
 487     movdqa     xmm1, [eax + esi]
 488     pavgb      xmm0, xmm1            // average the two rows
 489     pshufb     xmm0, xmm2            // pair up neighbouring bytes
 490     pmaddubsw  xmm0, xmm5            // weighted sum of each pair
 491     paddsw     xmm0, xmm7            // + 2 for rounding
 492     psrlw      xmm0, 2               // / 4
 493     packuswb   xmm0, xmm0
 494     movq       qword ptr [edx], xmm0
 495     movdqu     xmm0, [eax + 8]       // pixels 8..15
 496     movdqu     xmm1, [eax + esi + 8]
 497     pavgb      xmm0, xmm1
 498     pshufb     xmm0, xmm3
 499     pmaddubsw  xmm0, xmm6
 500     paddsw     xmm0, xmm7
 501     psrlw      xmm0, 2
 502     packuswb   xmm0, xmm0
 503     movq       qword ptr [edx + 8], xmm0
 504     movdqa     xmm0, [eax + 16]      // pixels 16..23
 505     movdqa     xmm1, [eax + esi + 16]
 506     lea        eax, [eax + 32]
 507     pavgb      xmm0, xmm1
 508     pshufb     xmm0, xmm4
 509     movdqa     xmm1, kMadd21         // reloaded every pass; no spare register
 510     pmaddubsw  xmm0, xmm1
 511     paddsw     xmm0, xmm7
 512     psrlw      xmm0, 2
 513     packuswb   xmm0, xmm0
 514     sub        ecx, 24               // sets the flags tested by jg below
 515     movq       qword ptr [edx + 16], xmm0
 516     lea        edx, [edx + 24]
 517     jg         wloop
 518
 519     pop        esi
 520     ret
 521   }
 522 }
 523
 524 // Note that movdqa+palign may be better than movdqu.
     // Blends a 32x2 rectangle to 24x1. Same column filter as
     // ScaleRowDown34_1_Box_SSSE3, but two chained pavgb give roughly
     // 3/4 row 0 + 1/4 row 1, with rounding at each step.
 525 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
 526 __declspec(naked) __declspec(align(16))
 527 void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
 528                                 ptrdiff_t src_stride,
 529                                 uint8* dst_ptr, int dst_width) {
 530   __asm {
 531     push       esi                   // saved register; arguments shift by 4
 532     mov        eax, [esp + 4 + 4]    // src_ptr
 533     mov        esi, [esp + 4 + 8]    // src_stride
 534     mov        edx, [esp + 4 + 12]   // dst_ptr
 535     mov        ecx, [esp + 4 + 16]   // dst_width
 536     movdqa     xmm2, kShuf01
 537     movdqa     xmm3, kShuf11
 538     movdqa     xmm4, kShuf21
 539     movdqa     xmm5, kMadd01
 540     movdqa     xmm6, kMadd11
 541     movdqa     xmm7, kRound34
 542
 543     align      4
 544   wloop:
 545     movdqa     xmm0, [eax]           // pixels 0..7
 546     movdqa     xmm1, [eax + esi]
 547     pavgb      xmm1, xmm0            // avg(row0, row1)
 548     pavgb      xmm0, xmm1            // avg(row0, avg(row0, row1))
 549     pshufb     xmm0, xmm2
 550     pmaddubsw  xmm0, xmm5
 551     paddsw     xmm0, xmm7
 552     psrlw      xmm0, 2
 553     packuswb   xmm0, xmm0
 554     movq       qword ptr [edx], xmm0
 555     movdqu     xmm0, [eax + 8]       // pixels 8..15
 556     movdqu     xmm1, [eax + esi + 8]
 557     pavgb      xmm1, xmm0
 558     pavgb      xmm0, xmm1
 559     pshufb     xmm0, xmm3
 560     pmaddubsw  xmm0, xmm6
 561     paddsw     xmm0, xmm7
 562     psrlw      xmm0, 2
 563     packuswb   xmm0, xmm0
 564     movq       qword ptr [edx + 8], xmm0
 565     movdqa     xmm0, [eax + 16]      // pixels 16..23
 566     movdqa     xmm1, [eax + esi + 16]
 567     lea        eax, [eax + 32]
 568     pavgb      xmm1, xmm0
 569     pavgb      xmm0, xmm1
 570     pshufb     xmm0, xmm4
 571     movdqa     xmm1, kMadd21         // reloaded every pass; xmm1 was scratch above
 572     pmaddubsw  xmm0, xmm1
 573     paddsw     xmm0, xmm7
 574     psrlw      xmm0, 2
 575     packuswb   xmm0, xmm0
 576     sub        ecx, 24               // sets the flags tested by jg below
 577     movq       qword ptr [edx + 16], xmm0
 578     lea        edx, [edx+24]
 579     jg         wloop
 580
 581     pop        esi
 582     ret
 583   }
 584 }
 585
 586 // 3/8 point sampler
 587
 588 // Scale 32 pixels to 12
     // Uses movdqa loads, so src_ptr must be 16 byte aligned. The 12 output
     // bytes go out as an 8 byte and a 4 byte store with no alignment need.
     // At least one pass always runs; dst_width is presumably a positive
     // multiple of 12 - confirm with callers.
 589 __declspec(naked) __declspec(align(16))
 590 void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
 591                           uint8* dst_ptr, int dst_width) {
 592   __asm {
 593     mov        eax, [esp + 4]        // src_ptr
 594                                      // src_stride ignored
 595     mov        edx, [esp + 12]       // dst_ptr
 596     mov        ecx, [esp + 16]       // dst_width
 597     movdqa     xmm4, kShuf38a
 598     movdqa     xmm5, kShuf38b
 599
 600     align      4
 601   xloop:
 602     movdqa     xmm0, [eax]           // 16 pixels -> 0,1,2,3,4,5
 603     movdqa     xmm1, [eax + 16]      // 16 pixels -> 6,7,8,9,10,11
 604     lea        eax, [eax + 32]
 605     pshufb     xmm0, xmm4
 606     pshufb     xmm1, xmm5
 607     paddusb    xmm0, xmm1            // merge; the two halves fill disjoint lanes
 608
 609     sub        ecx, 12               // sets the flags tested by jg below
 610     movq       qword ptr [edx], xmm0  // write 12 pixels
 611     movhlps    xmm1, xmm0            // bring bytes 8..11 down to the low dword
 612     movd       [edx + 8], xmm1
 613     lea        edx, [edx + 12]
 614     jg         xloop
 615
 616     ret
 617   }
 618 }
 619
 620 // Scale 16x3 pixels to 6x1 with interpolation
     // Sums three rows (src_ptr + {0, 1, 2} * src_stride) into 16 bit lanes,
     // then sums 3 adjacent columns (2 for the third pixel of each half) and
     // divides with pmulhuw by kScaleAc33. Loads are movdqa, so each row
     // address must be 16 byte aligned.
     // At least one pass always runs; dst_width is presumably a positive
     // multiple of 6 - confirm with callers.
 621 __declspec(naked) __declspec(align(16))
 622 void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
 623                                 ptrdiff_t src_stride,
 624                                 uint8* dst_ptr, int dst_width) {
 625   __asm {
 626     push       esi                   // saved register; arguments shift by 4
 627     mov        eax, [esp + 4 + 4]    // src_ptr
 628     mov        esi, [esp + 4 + 8]    // src_stride
 629     mov        edx, [esp + 4 + 12]   // dst_ptr
 630     mov        ecx, [esp + 4 + 16]   // dst_width
 631     movdqa     xmm2, kShufAc
 632     movdqa     xmm3, kShufAc3
 633     movdqa     xmm4, kScaleAc33
 634     pxor       xmm5, xmm5            // zero, used to widen bytes to words
 635
 636     align      4
 637   xloop:
 638     movdqa     xmm0, [eax]           // sum up 3 rows into xmm0/1
 639     movdqa     xmm6, [eax + esi]
 640     movhlps    xmm1, xmm0            // high 8 bytes of row 0
 641     movhlps    xmm7, xmm6            // high 8 bytes of row 1
 642     punpcklbw  xmm0, xmm5
 643     punpcklbw  xmm1, xmm5
 644     punpcklbw  xmm6, xmm5
 645     punpcklbw  xmm7, xmm5
 646     paddusw    xmm0, xmm6
 647     paddusw    xmm1, xmm7
 648     movdqa     xmm6, [eax + esi * 2] // row 2
 649     lea        eax, [eax + 16]
 650     movhlps    xmm7, xmm6
 651     punpcklbw  xmm6, xmm5
 652     punpcklbw  xmm7, xmm5
 653     paddusw    xmm0, xmm6
 654     paddusw    xmm1, xmm7
 655
 656     movdqa     xmm6, xmm0            // 8 pixels -> 0,1,2 of xmm6
 657     psrldq     xmm0, 2               // shift by one word and add: 3 column sum
 658     paddusw    xmm6, xmm0
 659     psrldq     xmm0, 2
 660     paddusw    xmm6, xmm0
 661     pshufb     xmm6, xmm2
 662
 663     movdqa     xmm7, xmm1            // 8 pixels -> 3,4,5 of xmm6
 664     psrldq     xmm1, 2
 665     paddusw    xmm7, xmm1
 666     psrldq     xmm1, 2
 667     paddusw    xmm7, xmm1
 668     pshufb     xmm7, xmm3
 669     paddusw    xmm6, xmm7            // merge; the two halves fill disjoint lanes
 670
 671     pmulhuw    xmm6, xmm4            // divide by 9,9,6, 9,9,6
 672     packuswb   xmm6, xmm6
 673
 674     sub        ecx, 6                // sets the flags tested by jg below
 675     movd       [edx], xmm6           // write 6 pixels
 676     psrlq      xmm6, 16              // second, overlapping 4 byte store covers bytes 2..5
 677     movd       [edx + 2], xmm6
 678     lea        edx, [edx + 6]
 679     jg         xloop
 680
 681     pop        esi
 682     ret
 683   }
 684 }
 685
 686 // Scale 16x2 pixels to 6x1 with interpolation
     // Averages two rows (src_ptr and src_ptr + src_stride) with pavgb, then
     // sums 3 adjacent columns (2 for the third pixel of each half) and divides
     // with pmulhuw by kScaleAb2. Both row reads need 16 byte alignment
     // (movdqa and a pavgb memory operand).
     // At least one pass always runs; dst_width is presumably a positive
     // multiple of 6 - confirm with callers.
 687 __declspec(naked) __declspec(align(16))
 688 void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
 689                                 ptrdiff_t src_stride,
 690                                 uint8* dst_ptr, int dst_width) {
 691   __asm {
 692     push       esi                   // saved register; arguments shift by 4
 693     mov        eax, [esp + 4 + 4]    // src_ptr
 694     mov        esi, [esp + 4 + 8]    // src_stride
 695     mov        edx, [esp + 4 + 12]   // dst_ptr
 696     mov        ecx, [esp + 4 + 16]   // dst_width
 697     movdqa     xmm2, kShufAb0
 698     movdqa     xmm3, kShufAb1
 699     movdqa     xmm4, kShufAb2
 700     movdqa     xmm5, kScaleAb2
 701
 702     align      4
 703   xloop:
 704     movdqa     xmm0, [eax]           // average 2 rows into xmm0
 705     pavgb      xmm0, [eax + esi]
 706     lea        eax, [eax + 16]
 707
 708     movdqa     xmm1, xmm0            // 16 pixels -> 0,1,2,3,4,5 of xmm1
 709     pshufb     xmm1, xmm2            // first column of each box, as words
 710     movdqa     xmm6, xmm0
 711     pshufb     xmm6, xmm3            // second column
 712     paddusw    xmm1, xmm6
 713     pshufb     xmm0, xmm4            // third column (zero for the 2 wide boxes)
 714     paddusw    xmm1, xmm0
 715
 716     pmulhuw    xmm1, xmm5            // divide by 3,3,2, 3,3,2
 717     packuswb   xmm1, xmm1
 718
 719     sub        ecx, 6                // sets the flags tested by jg below
 720     movd       [edx], xmm1           // write 6 pixels
 721     psrlq      xmm1, 16              // second, overlapping 4 byte store covers bytes 2..5
 722     movd       [edx + 2], xmm1
 723     lea        edx, [edx + 6]
 724     jg         xloop
 725
 726     pop        esi
 727     ret
 728   }
 729 }
 730
 731 // Reads 16xN bytes and produces 16 shorts at a time.
     // For each 16 byte column group, sums src_height rows (stepping by
     // src_stride) into 16 bit lanes with saturating adds and stores 32 bytes to
     // dst_ptr. Loads and stores are movdqa, so every row address and dst_ptr
     // must be 16 byte aligned. At least one column group is always processed.
 732 // TODO(fbarchard): Make this handle 4xN bytes for any width ARGB.
 733 __declspec(naked) __declspec(align(16))
 734 void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
 735                        uint16* dst_ptr, int src_width,
 736                        int src_height) {
 737   __asm {
 738     push       esi                   // four saved registers; arguments shift by 16
 739     push       edi
 740     push       ebx
 741     push       ebp
 742     mov        esi, [esp + 16 + 4]   // src_ptr
 743     mov        edx, [esp + 16 + 8]   // src_stride
 744     mov        edi, [esp + 16 + 12]  // dst_ptr
 745     mov        ecx, [esp + 16 + 16]  // src_width
 746     mov        ebx, [esp + 16 + 20]  // src_height
 747     pxor       xmm4, xmm4            // zero, used to widen bytes to words
 748     dec        ebx                   // rows remaining after the first
 749
 750     align      4
 751   xloop:
 752     // first row
 753     movdqa     xmm0, [esi]
 754     lea        eax, [esi + edx]      // eax = second row of this column group
 755     movdqa     xmm1, xmm0
 756     punpcklbw  xmm0, xmm4
 757     punpckhbw  xmm1, xmm4
 758     lea        esi, [esi + 16]
 759     mov        ebp, ebx              // per-column row counter
 760     test       ebp, ebp
 761     je         ydone                 // src_height == 1: nothing more to add
 762
 763     // sum remaining rows
 764     align      4
 765   yloop:
 766     movdqa     xmm2, [eax]       // read 16 pixels
 767     lea        eax, [eax + edx]  // advance to next row
 768     movdqa     xmm3, xmm2
 769     punpcklbw  xmm2, xmm4
 770     punpckhbw  xmm3, xmm4
 771     paddusw    xmm0, xmm2        // sum 16 words
 772     paddusw    xmm1, xmm3
 773     sub        ebp, 1
 774     jg         yloop
 775
 776     align      4
 777   ydone:
 778     movdqa     [edi], xmm0
 779     movdqa     [edi + 16], xmm1
 780     lea        edi, [edi + 32]
 781
 782     sub        ecx, 16               // 16 source columns done
 783     jg         xloop
 784
 785     pop        ebp
 786     pop        ebx
 787     pop        edi
 788     pop        esi
 789     ret
 790   }
 791 }
 792
 793 // Bilinear column filtering. SSSE3 version.
     // x and dx are used as 16.16 fixed point: the high word indexes src_ptr and
     // the top 7 bits of the low word are the blend fraction f. Each output is
     // (p0 * (0x7f - f) + p1 * f) >> 7, where p0 and p1 are the two source bytes
     // at that index. Produces two pixels per pass plus an optional last pixel.
 794 // TODO(fbarchard): Port to Neon
 795 // TODO(fbarchard): Switch the following:
 796 //    xor        ebx, ebx
 797 //    mov        bx, word ptr [esi + eax]  // 2 source x0 pixels
 798 // To
 799 //    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
 800 // when drmemory bug fixed.
 801 // https://code.google.com/p/drmemory/issues/detail?id=1396
     // NOTE(review): the loads below already use the movzx form, so this TODO
     // looks stale - confirm before removing it.
 802
 803 __declspec(naked) __declspec(align(16))
 804 void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
 805                            int dst_width, int x, int dx) {
 806   __asm {
 807     push       ebx                    // three saved registers; arguments shift by 12
 808     push       esi
 809     push       edi
 810     mov        edi, [esp + 12 + 4]    // dst_ptr
 811     mov        esi, [esp + 12 + 8]    // src_ptr
 812     mov        ecx, [esp + 12 + 12]   // dst_width
 813     movd       xmm2, [esp + 12 + 16]  // x
 814     movd       xmm3, [esp + 12 + 20]  // dx
 815     mov        eax, 0x04040000      // shuffle to line up fractions with pixel.
 816     movd       xmm5, eax
 817     pcmpeqb    xmm6, xmm6           // generate 0x007f for inverting fraction.
 818     psrlw      xmm6, 9
 819     pextrw     eax, xmm2, 1         // get x0 integer. preroll
 820     sub        ecx, 2
 821     jl         xloop29              // fewer than 2 pixels: skip the pair loop
 822
 823     movdqa     xmm0, xmm2           // x1 = x0 + dx
 824     paddd      xmm0, xmm3
 825     punpckldq  xmm2, xmm0           // x0 x1
 826     punpckldq  xmm3, xmm3           // dx dx
 827     paddd      xmm3, xmm3           // dx * 2, dx * 2
 828     pextrw     edx, xmm2, 3         // get x1 integer. preroll
 829
 830     // 2 Pixel loop.
 831     align      4
 832   xloop2:
 833     movdqa     xmm1, xmm2           // x0, x1 fractions.
 834     paddd      xmm2, xmm3           // x += dx
 835     movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
 836     movd       xmm0, ebx
 837     psrlw      xmm1, 9              // 7 bit fractions.
 838     movzx      ebx, word ptr [esi + edx]  // 2 source x1 pixels
 839     movd       xmm4, ebx
 840     pshufb     xmm1, xmm5           // 0011
 841     punpcklwd  xmm0, xmm4
 842     pxor       xmm1, xmm6           // 0..7f and 7f..0
 843     pmaddubsw  xmm0, xmm1           // 16 bit, 2 pixels.
 844     pextrw     eax, xmm2, 1         // get x0 integer. next iteration.
 845     pextrw     edx, xmm2, 3         // get x1 integer. next iteration.
 846     psrlw      xmm0, 7              // 8.7 fixed point to low 8 bits.
 847     packuswb   xmm0, xmm0           // 8 bits, 2 pixels.
 848     movd       ebx, xmm0
 849     mov        [edi], bx
 850     lea        edi, [edi + 2]
 851     sub        ecx, 2               // 2 pixels
 852     jge        xloop2
 853
 854     align      4
 855  xloop29:
 856
 857     add        ecx, 2 - 1           // undo the -2 bias; negative means no odd pixel left
 858     jl         xloop99
 859
 860     // 1 pixel remainder
 861     movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
 862     movd       xmm0, ebx
 863     psrlw      xmm2, 9              // 7 bit fractions.
 864     pshufb     xmm2, xmm5           // 0011
 865     pxor       xmm2, xmm6           // 0..7f and 7f..0
 866     pmaddubsw  xmm0, xmm2           // 16 bit
 867     psrlw      xmm0, 7              // 8.7 fixed point to low 8 bits.
 868     packuswb   xmm0, xmm0           // 8 bits
 869     movd       ebx, xmm0
 870     mov        [edi], bl
 871
 872     align      4
 873  xloop99:
 874
 875     pop        edi
 876     pop        esi
 877     pop        ebx
 878     ret
 879   }
 880 }
 881
 882 // Reads 16 pixels, duplicates them and writes 32 pixels.
 883 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
     // The x and dx arguments are not read by this routine.
     // Every pass stores a full 32 bytes and at least one pass always runs, so
     // dst_width is presumably a positive multiple of 32 - confirm with callers.
 884 __declspec(naked) __declspec(align(16))
 885 void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
 886                        int dst_width, int x, int dx) {
 887   __asm {
 888     mov        edx, [esp + 4]    // dst_ptr
 889     mov        eax, [esp + 8]    // src_ptr
 890     mov        ecx, [esp + 12]   // dst_width
 891
 892     align      4
 893   wloop:
 894     movdqa     xmm0, [eax]
 895     lea        eax,  [eax + 16]
 896     movdqa     xmm1, xmm0
 897     punpcklbw  xmm0, xmm0        // duplicate each of the low 8 bytes
 898     punpckhbw  xmm1, xmm1        // duplicate each of the high 8 bytes
 899     sub        ecx, 32           // sets the flags tested by jg below
 900     movdqa     [edx], xmm0
 901     movdqa     [edx + 16], xmm1
 902     lea        edx, [edx + 32]
 903     jg         wloop
 904
 905     ret
 906   }
 907 }
 908
 909 // Reads 8 pixels, throws half away and writes 4 odd pixels (1, 3, 5, 7)
     // A pixel here is one 32 bit dword. shufps 0xdd keeps dwords 1 and 3 of
     // each source register. dst_width is counted in pixels (4 per pass).
     // At least one pass always runs; dst_width is presumably a positive
     // multiple of 4 - confirm with callers.
 910 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
 911 __declspec(naked) __declspec(align(16))
 912 void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
 913                             ptrdiff_t src_stride,
 914                             uint8* dst_argb, int dst_width) {
 915   __asm {
 916     mov        eax, [esp + 4]        // src_argb
 917                                      // src_stride ignored
 918     mov        edx, [esp + 12]       // dst_argb
 919     mov        ecx, [esp + 16]       // dst_width
 920
 921     align      4
 922   wloop:
 923     movdqa     xmm0, [eax]
 924     movdqa     xmm1, [eax + 16]
 925     lea        eax,  [eax + 32]
 926     shufps     xmm0, xmm1, 0xdd      // odd pixels
 927     sub        ecx, 4                // sets the flags tested by jg below
 928     movdqa     [edx], xmm0
 929     lea        edx, [edx + 16]
 930     jg         wloop
 931
 932     ret
 933   }
 934 }
 935
 936 // Blends 8x1 rectangle to 4x1: each output pixel is the pavgb (rounded) average of 2 adjacent pixels.
 937 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
 938 __declspec(naked) __declspec(align(16))
 939 void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
 940                                   ptrdiff_t src_stride,
 941                                   uint8* dst_argb, int dst_width) {
 942   __asm {
 943     mov        eax, [esp + 4]        // src_argb
 944                                      // src_stride ignored
 945     mov        edx, [esp + 12]       // dst_argb
 946     mov        ecx, [esp + 16]       // dst_width
 947     // The loop body runs before dst_width is tested; every pass stores 4 pixels.
 948     align      4
 949   wloop:
 950     movdqa     xmm0, [eax]           // pixels 0..3
 951     movdqa     xmm1, [eax + 16]      // pixels 4..7
 952     lea        eax,  [eax + 32]
 953     movdqa     xmm2, xmm0            // second copy of pixels 0..3 for the odd shuffle
 954     shufps     xmm0, xmm1, 0x88      // even pixels
 955     shufps     xmm2, xmm1, 0xdd      // odd pixels
 956     pavgb      xmm0, xmm2            // per-byte average of each even/odd neighbour pair
 957     sub        ecx, 4                // 4 output pixels; sets the flags tested by jg
 958     movdqa     [edx], xmm0
 959     lea        edx, [edx + 16]
 960     jg         wloop
 961
 962     ret
 963   }
 964 }
 965
 966 // Blends 8x2 rectangle to 4x1. The second row is read at src_argb + src_stride; rows are averaged first.
 967 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
 968 __declspec(naked) __declspec(align(16))
 969 void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
 970                                ptrdiff_t src_stride,
 971                                uint8* dst_argb, int dst_width) {
 972   __asm {
 973     push       esi
 974     mov        eax, [esp + 4 + 4]    // src_argb
 975     mov        esi, [esp + 4 + 8]    // src_stride
 976     mov        edx, [esp + 4 + 12]   // dst_argb
 977     mov        ecx, [esp + 4 + 16]   // dst_width
 978     // The extra 4 in each offset above skips the saved esi. The loop body runs before dst_width is tested.
 979     align      4
 980   wloop:
 981     movdqa     xmm0, [eax]           // row 0, pixels 0..3
 982     movdqa     xmm1, [eax + 16]      // row 0, pixels 4..7
 983     movdqa     xmm2, [eax + esi]     // row 1, pixels 0..3
 984     movdqa     xmm3, [eax + esi + 16]  // row 1, pixels 4..7
 985     lea        eax,  [eax + 32]
 986     pavgb      xmm0, xmm2            // average rows
 987     pavgb      xmm1, xmm3
 988     movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
 989     shufps     xmm0, xmm1, 0x88      // even pixels
 990     shufps     xmm2, xmm1, 0xdd      // odd pixels
 991     pavgb      xmm0, xmm2            // second pavgb pass, so rounding is applied twice
 992     sub        ecx, 4                // 4 output pixels; sets the flags tested by jg
 993     movdqa     [edx], xmm0
 994     lea        edx, [edx + 16]
 995     jg         wloop
 996
 997     pop        esi
 998     ret
 999   }
1000 }
1001
1002 // Point samples every src_stepx-th 4-byte pixel of the row, producing 4 output pixels per pass.
1003 // Alignment requirement: dst_argb 16 byte aligned.
1004 __declspec(naked) __declspec(align(16))
1005 void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
1006                                int src_stepx,
1007                                uint8* dst_argb, int dst_width) {
1008   __asm {
1009     push       ebx
1010     push       edi
1011     mov        eax, [esp + 8 + 4]    // src_argb
1012                                      // src_stride ignored
1013     mov        ebx, [esp + 8 + 12]   // src_stepx
1014     mov        edx, [esp + 8 + 16]   // dst_argb
1015     mov        ecx, [esp + 8 + 20]   // dst_width
1016     lea        ebx, [ebx * 4]        // step in bytes (4 bytes per pixel)
1017     lea        edi, [ebx + ebx * 2]  // 3 steps in bytes
1018     // The loop body runs before dst_width is tested; every pass stores 4 pixels.
1019     align      4
1020   wloop:
1021     movd       xmm0, [eax]           // pixel at step 0
1022     movd       xmm1, [eax + ebx]     // pixel at step 1
1023     punpckldq  xmm0, xmm1            // pixels 0 1
1024     movd       xmm2, [eax + ebx * 2]  // pixel at step 2
1025     movd       xmm3, [eax + edi]     // pixel at step 3
1026     lea        eax,  [eax + ebx * 4]  // advance source by 4 steps
1027     punpckldq  xmm2, xmm3            // pixels 2 3
1028     punpcklqdq xmm0, xmm2            // pixels 0 1 2 3
1029     sub        ecx, 4                // 4 output pixels; sets the flags tested by jg
1030     movdqa     [edx], xmm0
1031     lea        edx, [edx + 16]
1032     jg         wloop
1033
1034     pop        edi
1035     pop        ebx
1036     ret
1037   }
1038 }
1039
1040 // Blends four 2x2 boxes, spaced src_stepx pixels apart, to 4x1. Row 1 is read at src_argb + src_stride.
1041 // Alignment requirement: dst_argb 16 byte aligned.
1042 __declspec(naked) __declspec(align(16))
1043 void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
1044                                   ptrdiff_t src_stride,
1045                                   int src_stepx,
1046                                   uint8* dst_argb, int dst_width) {
1047   __asm {
1048     push       ebx
1049     push       esi
1050     push       edi
1051     mov        eax, [esp + 12 + 4]    // src_argb
1052     mov        esi, [esp + 12 + 8]    // src_stride
1053     mov        ebx, [esp + 12 + 12]   // src_stepx
1054     mov        edx, [esp + 12 + 16]   // dst_argb
1055     mov        ecx, [esp + 12 + 20]   // dst_width
1056     lea        esi, [eax + esi]       // row1 pointer
1057     lea        ebx, [ebx * 4]         // step in bytes (4 bytes per pixel)
1058     lea        edi, [ebx + ebx * 2]   // 3 steps in bytes
1059     // Each movq/movhps below loads 8 bytes, i.e. one pair of adjacent pixels. The loop body runs before dst_width is tested.
1060     align      4
1061   wloop:
1062     movq       xmm0, qword ptr [eax]  // row0 4 pairs
1063     movhps     xmm0, qword ptr [eax + ebx]
1064     movq       xmm1, qword ptr [eax + ebx * 2]
1065     movhps     xmm1, qword ptr [eax + edi]
1066     lea        eax,  [eax + ebx * 4]  // advance row0 by 4 steps
1067     movq       xmm2, qword ptr [esi]  // row1 4 pairs
1068     movhps     xmm2, qword ptr [esi + ebx]
1069     movq       xmm3, qword ptr [esi + ebx * 2]
1070     movhps     xmm3, qword ptr [esi + edi]
1071     lea        esi,  [esi + ebx * 4]  // advance row1 by 4 steps
1072     pavgb      xmm0, xmm2            // average rows
1073     pavgb      xmm1, xmm3
1074     movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
1075     shufps     xmm0, xmm1, 0x88      // even pixels
1076     shufps     xmm2, xmm1, 0xdd      // odd pixels
1077     pavgb      xmm0, xmm2            // first and second pixel of each pair averaged
1078     sub        ecx, 4                // 4 output pixels; sets the flags tested by jg
1079     movdqa     [edx], xmm0
1080     lea        edx, [edx + 16]
1081     jg         wloop
1082
1083     pop        edi
1084     pop        esi
1085     pop        ebx
1086     ret
1087   }
1088 }
1089
1090 // Column scaling unfiltered. SSE2 version. Copies the 4-byte pixel indexed by the upper 16 bits of x, then x += dx.
1091 __declspec(naked) __declspec(align(16))
1092 void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
1093                         int dst_width, int x, int dx) {
1094   __asm {
1095     push       edi
1096     push       esi
1097     mov        edi, [esp + 8 + 4]    // dst_argb
1098     mov        esi, [esp + 8 + 8]    // src_argb
1099     mov        ecx, [esp + 8 + 12]   // dst_width
1100     movd       xmm2, [esp + 8 + 16]  // x
1101     movd       xmm3, [esp + 8 + 20]  // dx
1102     // Build xmm2 = x, x + dx, x + 2 * dx, x + 3 * dx and xmm3 = 4 * dx in every lane.
1103     pshufd     xmm2, xmm2, 0         // x0 x0 x0 x0
1104     pshufd     xmm0, xmm3, 0x11      // dx  0 dx  0
1105     paddd      xmm2, xmm0
1106     paddd      xmm3, xmm3            // 0, 0, 0,  dx * 2
1107     pshufd     xmm0, xmm3, 0x05      // dx * 2, dx * 2, 0, 0
1108     paddd      xmm2, xmm0            // x3 x2 x1 x0
1109     paddd      xmm3, xmm3            // 0, 0, 0,  dx * 4
1110     pshufd     xmm3, xmm3, 0         // dx * 4, dx * 4, dx * 4, dx * 4
1111     // Preroll: the odd 16-bit words of xmm2 are the upper halves of x0..x3.
1112     pextrw     eax, xmm2, 1          // get x0 integer.
1113     pextrw     edx, xmm2, 3          // get x1 integer.
1114     // Nothing to do for dst_width <= 0; fewer than 4 pixels go straight to the remainder code.
1115     cmp        ecx, 0
1116     jle        xloop99
1117     sub        ecx, 4
1118     jl         xloop49
1119
1120     // 4 Pixel loop.
1121     align      4
1122  xloop4:
1123     movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
1124     movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
1125     pextrw     eax, xmm2, 5           // get x2 integer.
1126     pextrw     edx, xmm2, 7           // get x3 integer.
1127     paddd      xmm2, xmm3             // x += dx
1128     punpckldq  xmm0, xmm1             // x0 x1
1129
1130     movd       xmm1, [esi + eax * 4]  // 1 source x2 pixels
1131     movd       xmm4, [esi + edx * 4]  // 1 source x3 pixels
1132     pextrw     eax, xmm2, 1           // get x0 integer. next iteration.
1133     pextrw     edx, xmm2, 3           // get x1 integer. next iteration.
1134     punpckldq  xmm1, xmm4             // x2 x3
1135     punpcklqdq xmm0, xmm1             // x0 x1 x2 x3
1136     sub        ecx, 4                 // 4 pixels
1137     movdqu     [edi], xmm0            // unaligned store: no dst alignment needed
1138     lea        edi, [edi + 16]
1139     jge        xloop4
1140     // Here ecx is (pixels left) - 4, in -4..-1, so its low two bits equal the number of pixels left.
1141     align      4
1142  xloop49:
1143     test       ecx, 2
1144     je         xloop29
1145
1146     // 2 Pixels.
1147     movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
1148     movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
1149     pextrw     eax, xmm2, 5           // get x2 integer.
1150     punpckldq  xmm0, xmm1             // x0 x1
1151
1152     movq       qword ptr [edi], xmm0
1153     lea        edi, [edi + 8]
1154
1155  xloop29:
1156     test       ecx, 1
1157     je         xloop99
1158
1159     // 1 Pixels.
1160     movd       xmm0, [esi + eax * 4]  // 1 source pixel: x2 after the 2 pixel step, else x0
1161     movd       dword ptr [edi], xmm0
1162     align      4
1163  xloop99:
1164
1165     pop        esi
1166     pop        edi
1167     ret
1168   }
1169 }
1170
1171 // Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
1172 // TODO(fbarchard): Port to Neon
1173
1174 // pshufb mask that interleaves the bytes of two adjacent 4-byte pixels into pairs for pmaddubsw
1175 static uvec8 kShuffleColARGB = {
1176   0, 4, 1, 5, 2, 6, 3, 7,        // bbggrraa for the 1st output pixel
1177   8, 12, 9, 13, 10, 14, 11, 15,  // bbggrraa for the 2nd output pixel
1178 };
1179
1180 // pshufb mask that copies source byte 0 into result bytes 0..7 and source byte 4 into result bytes 8..15
1181 static uvec8 kShuffleFractions = {
1182   0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4
1183 };
1184 // Bilinear column filter for 4-byte pixels: blends the source pixel at the upper 16 bits of x with its right neighbour.
1185 __declspec(naked) __declspec(align(16))
1186 void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
1187                                int dst_width, int x, int dx) {
1188   __asm {
1189     push       esi
1190     push       edi
1191     mov        edi, [esp + 8 + 4]    // dst_argb
1192     mov        esi, [esp + 8 + 8]    // src_argb
1193     mov        ecx, [esp + 8 + 12]   // dst_width
1194     movd       xmm2, [esp + 8 + 16]  // x
1195     movd       xmm3, [esp + 8 + 20]  // dx
1196     movdqa     xmm4, kShuffleColARGB    // byte interleave mask for the pixel pair
1197     movdqa     xmm5, kShuffleFractions  // fraction broadcast mask
1198     pcmpeqb    xmm6, xmm6           // generate 0x007f for inverting fraction.
1199     psrlw      xmm6, 9
1200     pextrw     eax, xmm2, 1         // get x0 integer. preroll
1201     sub        ecx, 2
1202     jl         xloop29              // fewer than 2 pixels: skip the 2 pixel loop
1203
1204     movdqa     xmm0, xmm2           // x1 = x0 + dx
1205     paddd      xmm0, xmm3
1206     punpckldq  xmm2, xmm0           // x0 x1
1207     punpckldq  xmm3, xmm3           // dx dx
1208     paddd      xmm3, xmm3           // dx * 2, dx * 2
1209     pextrw     edx, xmm2, 3         // get x1 integer. preroll
1210
1211     // 2 Pixel loop.
1212     align      4
1213   xloop2:
1214     movdqa     xmm1, xmm2           // x0, x1 fractions.
1215     paddd      xmm2, xmm3           // x += dx
1216     movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
1217     psrlw      xmm1, 9              // 7 bit fractions.
1218     movhps     xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
1219     pshufb     xmm1, xmm5           // 0000000011111111
1220     pshufb     xmm0, xmm4           // arrange pixels into pairs
1221     pxor       xmm1, xmm6           // 0..7f and 7f..0
1222     pmaddubsw  xmm0, xmm1           // argb_argb 16 bit, 2 pixels.
1223     pextrw     eax, xmm2, 1         // get x0 integer. next iteration.
1224     pextrw     edx, xmm2, 3         // get x1 integer. next iteration.
1225     psrlw      xmm0, 7              // argb 8.7 fixed point to low 8 bits.
1226     packuswb   xmm0, xmm0           // argb_argb 8 bits, 2 pixels.
1227     movq       qword ptr [edi], xmm0
1228     lea        edi, [edi + 8]
1229     sub        ecx, 2               // 2 pixels
1230     jge        xloop2
1231
1232     align      4
1233  xloop29:
1234     // ecx is (pixels left) - 2 here; adding 1 leaves it negative when no pixel remains.
1235     add        ecx, 2 - 1
1236     jl         xloop99
1237
1238     // 1 pixel remainder
1239     psrlw      xmm2, 9              // 7 bit fractions.
1240     movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
1241     pshufb     xmm2, xmm5           // 00000000
1242     pshufb     xmm0, xmm4           // arrange pixels into pairs
1243     pxor       xmm2, xmm6           // 0..7f and 7f..0
1244     pmaddubsw  xmm0, xmm2           // argb 16 bit, 1 pixel.
1245     psrlw      xmm0, 7
1246     packuswb   xmm0, xmm0           // argb 8 bits, 1 pixel.
1247     movd       [edi], xmm0          // store 4 bytes: 1 pixel
1248
1249     align      4
1250  xloop99:
1251
1252     pop        edi
1253     pop        esi
1254     ret
1255   }
1256 }
1257
1258 // Reads 4 pixels, duplicates them and writes 8 pixels. x and dx are not read.
1259 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
1260 __declspec(naked) __declspec(align(16))
1261 void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
1262                            int dst_width, int x, int dx) {
1263   __asm {
1264     mov        edx, [esp + 4]    // dst_argb
1265     mov        eax, [esp + 8]    // src_argb
1266     mov        ecx, [esp + 12]   // dst_width
1267     // The loop body runs before dst_width is tested; every pass stores a full 8 pixels.
1268     align      4
1269   wloop:
1270     movdqa     xmm0, [eax]       // 4 source pixels
1271     lea        eax,  [eax + 16]
1272     movdqa     xmm1, xmm0
1273     punpckldq  xmm0, xmm0        // pixels 0 0 1 1
1274     punpckhdq  xmm1, xmm1        // pixels 2 2 3 3
1275     sub        ecx, 8            // 8 output pixels; sets the flags tested by jg
1276     movdqa     [edx], xmm0
1277     movdqa     [edx + 16], xmm1
1278     lea        edx, [edx + 32]
1279     jg         wloop
1280
1281     ret
1282   }
1283 }
1284
1285 // Divide num by div and return as 16.16 fixed point result: (num << 16) / div using a 64-bit dividend.
1286 __declspec(naked) __declspec(align(16))
1287 int FixedDiv_X86(int num, int div) {
1288   __asm {
1289     mov        eax, [esp + 4]    // num
1290     cdq                          // extend num to 64 bits
1291     shld       edx, eax, 16      // 32.16
1292     shl        eax, 16           // edx:eax now holds num << 16
1293     idiv       dword ptr [esp + 8]  // signed divide by div; quotient is returned in eax. No check for div == 0.
1294     ret
1295   }
1296 }
1297
1298 // Returns ((num << 16) - 0x00010001) / (div - 1), with the dividend formed in 64 bits.
1299 __declspec(naked) __declspec(align(16))
1300 int FixedDiv1_X86(int num, int div) {
1301   __asm {
1302     mov        eax, [esp + 4]    // num
1303     mov        ecx, [esp + 8]    // denom
1304     cdq                          // extend num to 64 bits
1305     shld       edx, eax, 16      // 32.16
1306     shl        eax, 16           // edx:eax now holds num << 16
1307     sub        eax, 0x00010001   // 64-bit subtract, low half
1308     sbb        edx, 0            // 64-bit subtract, borrow into the high half
1309     sub        ecx, 1            // divisor is div - 1; no check for div == 1
1310     idiv       ecx               // signed divide; quotient is returned in eax
1311     ret
1312   }
1313 }
1314
1315 #endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
1316
1317 #ifdef __cplusplus
1318 }  // extern "C"
1319 }  // namespace libyuv
1320 #endif