/*
 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */
#include "third_party/libyuv/include/libyuv/row.h"

// This module is for Visual C x86.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

#ifdef HAS_ARGBTOYROW_SSSE3
// Constants for ARGB.
static const vec8 kARGBToY = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};

static const vec8 kARGBToYJ = {
  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
};

static const vec8 kARGBToU = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

static const vec8 kARGBToUJ = {
  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
};

static const vec8 kARGBToV = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0
};

static const vec8 kARGBToVJ = {
  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
};
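// Scalar reference for the vectors above (illustrative sketch, not part of
// the original file; function names are hypothetical and the uint8 typedef
// from the file's headers is assumed). ARGB is stored B,G,R,A in memory, so
// the SIMD pmaddubsw/phaddw path computes this same fixed-point BT.601 math:
//   Y = ((33 * R + 65 * G + 13 * B) >> 7) + 16
//   U = ((112 * B - 74 * G - 38 * R) >> 8) + 128
//   V = ((112 * R - 94 * G - 18 * B) >> 8) + 128
// Note the >> 8 on a possibly negative value assumes an arithmetic shift,
// matching psraw in the SIMD code.
static __inline uint8 RGBToYRef(uint8 r, uint8 g, uint8 b) {
  return (uint8)(((33 * r + 65 * g + 13 * b) >> 7) + 16);
}
static __inline uint8 RGBToURef(uint8 r, uint8 g, uint8 b) {
  return (uint8)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
}
static __inline uint8 RGBToVRef(uint8 r, uint8 g, uint8 b) {
  return (uint8)(((112 * r - 94 * g - 18 * b) >> 8) + 128);
}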
// vpermd for vphaddw + vpackuswb vpermd.
static const lvec32 kPermdARGBToY_AVX = {
  0, 4, 1, 5, 2, 6, 3, 7
};

// vpshufb for vphaddw + vpackuswb packed to shorts.
static const lvec8 kShufARGBToUV_AVX = {
  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
};
// Constants for BGRA.
static const vec8 kBGRAToY = {
  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};

static const vec8 kBGRAToU = {
  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};

static const vec8 kBGRAToV = {
  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};

// Constants for ABGR.
static const vec8 kABGRToY = {
  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};

static const vec8 kABGRToU = {
  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};

static const vec8 kABGRToV = {
  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};

// Constants for RGBA.
static const vec8 kRGBAToY = {
  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
};

static const vec8 kRGBAToU = {
  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
};

static const vec8 kRGBAToV = {
  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
};

static const uvec8 kAddY16 = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};

static const vec16 kAddYJ64 = {
  64, 64, 64, 64, 64, 64, 64, 64
};

static const uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

static const uvec16 kAddUVJ128 = {
  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
};
// Shuffle table for converting RGB24 to ARGB.
static const uvec8 kShuffleMaskRGB24ToARGB = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};

// Shuffle table for converting RAW to ARGB.
static const uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};
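// Scalar model of how pshufb consumes these tables (illustrative sketch,
// not part of the original file; the function name is hypothetical). Each
// output byte selects a source byte by its low-nibble index, and a mask
// byte with the high bit set (128u) produces zero. With the tables above,
// the bytes shuffled into the alpha slots are then forced to 0xff by the
// por against the 0xff000000 mask in the conversion functions below.
static void Pshufb16Ref(const uint8* src, const uint8* mask, uint8* dst) {
  int i;
  for (i = 0; i < 16; ++i) {
    dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 15];
  }
}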
// Shuffle table for converting ARGB to RGB24.
static const uvec8 kShuffleMaskARGBToRGB24 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RAW.
static const uvec8 kShuffleMaskARGBToRAW = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4.
static const uvec8 kShuffleMaskARGBToRGB24_0 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
};

// Shuffle table for converting ARGBToRAW for I422ToRAW. First 8 + next 4.
static const uvec8 kShuffleMaskARGBToRAW_0 = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
};
// Duplicates gray value 3 times and fills in alpha opaque.
__declspec(naked) __declspec(align(16))
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
    mov eax, [esp + 4]   // src_y
    mov edx, [esp + 8]   // dst_argb
    mov ecx, [esp + 12]  // pix
    pcmpeqb xmm5, xmm5   // generate mask 0xff000000
    movq xmm0, qword ptr [eax]
    movdqa [edx + 16], xmm1
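// Scalar equivalent of the expansion above (illustrative sketch, not part
// of the original file; the function name is hypothetical). Each gray value
// is replicated into B, G and R, and alpha is forced opaque, matching the
// 0xff000000 mask generated into xmm5.
static void I400ToARGBRowRef(const uint8* src_y, uint8* dst_argb, int pix) {
  int x;
  for (x = 0; x < pix; ++x) {
    uint8 y = src_y[x];
    dst_argb[4 * x + 0] = y;     // B
    dst_argb[4 * x + 1] = y;     // G
    dst_argb[4 * x + 2] = y;     // R
    dst_argb[4 * x + 3] = 0xff;  // A
  }
}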
__declspec(naked) __declspec(align(16))
void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
                                  int pix) {
    mov eax, [esp + 4]   // src_y
    mov edx, [esp + 8]   // dst_argb
    mov ecx, [esp + 12]  // pix
    pcmpeqb xmm5, xmm5   // generate mask 0xff000000
    movq xmm0, qword ptr [eax]
    movdqu [edx + 16], xmm1

__declspec(naked) __declspec(align(16))
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
    mov eax, [esp + 4]   // src_rgb24
    mov edx, [esp + 8]   // dst_argb
    mov ecx, [esp + 12]  // pix
    pcmpeqb xmm5, xmm5   // generate mask 0xff000000
    movdqa xmm4, kShuffleMaskRGB24ToARGB
    movdqu xmm1, [eax + 16]
    movdqu xmm3, [eax + 32]
    palignr xmm2, xmm1, 8   // xmm2 = { xmm3[0:3] xmm1[8:15] }
    palignr xmm1, xmm0, 12  // xmm1 = { xmm3[0:7] xmm0[12:15] }
    movdqa [edx + 32], xmm2
    palignr xmm3, xmm3, 4   // xmm3 = { xmm3[4:15] }
    movdqa [edx + 16], xmm1
    movdqa [edx + 48], xmm3

__declspec(naked) __declspec(align(16))
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
                        int pix) {
    mov eax, [esp + 4]   // src_raw
    mov edx, [esp + 8]   // dst_argb
    mov ecx, [esp + 12]  // pix
    pcmpeqb xmm5, xmm5   // generate mask 0xff000000
    movdqa xmm4, kShuffleMaskRAWToARGB
    movdqu xmm1, [eax + 16]
    movdqu xmm3, [eax + 32]
    palignr xmm2, xmm1, 8   // xmm2 = { xmm3[0:3] xmm1[8:15] }
    palignr xmm1, xmm0, 12  // xmm1 = { xmm3[0:7] xmm0[12:15] }
    movdqa [edx + 32], xmm2
    palignr xmm3, xmm3, 4   // xmm3 = { xmm3[4:15] }
    movdqa [edx + 16], xmm1
    movdqa [edx + 48], xmm3

// pmul method to replicate bits.
// Math to replicate bits:
// (v << 8) | (v << 3)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
__declspec(naked) __declspec(align(16))
void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
                          int pix) {
    mov eax, 0x01080108  // generate multiplier to repeat 5 bits
    mov eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
    pcmpeqb xmm3, xmm3   // generate mask 0xf800f800 for Red
    pcmpeqb xmm4, xmm4   // generate mask 0x07e007e0 for Green
    pcmpeqb xmm7, xmm7   // generate mask 0xff00ff00 for Alpha
    mov eax, [esp + 4]   // src_rgb565
    mov edx, [esp + 8]   // dst_argb
    mov ecx, [esp + 12]  // pix
    movdqu xmm0, [eax]   // fetch 8 pixels of bgr565
    pand xmm1, xmm3      // R in upper 5 bits
    psllw xmm2, 11       // B in upper 5 bits
    pmulhuw xmm1, xmm5   // * (256 + 8)
    pmulhuw xmm2, xmm5   // * (256 + 8)
    pand xmm0, xmm4      // G in middle 6 bits
    pmulhuw xmm0, xmm6   // << 5 * (256 + 4)
    movdqa [eax * 2 + edx], xmm1       // store 4 pixels of ARGB
    movdqa [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
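// Scalar form of the bit replication described above (illustrative sketch,
// not part of the original file; the function name is hypothetical and the
// uint16 typedef from the file's headers is assumed). (v << 3) | (v >> 2)
// replicates 5 bits to 8, and (v << 2) | (v >> 4) replicates 6 bits, which
// is what the pmulhuw multipliers 0x0108 and 0x2080 achieve on whole
// registers of pixels.
static void RGB565ToARGBPixelRef(uint16 rgb565, uint8* dst_argb) {
  uint8 b = (uint8)(rgb565 & 0x1f);
  uint8 g = (uint8)((rgb565 >> 5) & 0x3f);
  uint8 r = (uint8)(rgb565 >> 11);
  dst_argb[0] = (uint8)((b << 3) | (b >> 2));  // B
  dst_argb[1] = (uint8)((g << 2) | (g >> 4));  // G
  dst_argb[2] = (uint8)((r << 3) | (r >> 2));  // R
  dst_argb[3] = 0xff;                          // A
}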
__declspec(naked) __declspec(align(16))
void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
                            int pix) {
    mov eax, 0x01080108  // generate multiplier to repeat 5 bits
    mov eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
    pcmpeqb xmm3, xmm3   // generate mask 0xf800f800 for Red
    movdqa xmm4, xmm3    // generate mask 0x03e003e0 for Green
    pcmpeqb xmm7, xmm7   // generate mask 0xff00ff00 for Alpha
    mov eax, [esp + 4]   // src_argb1555
    mov edx, [esp + 8]   // dst_argb
    mov ecx, [esp + 12]  // pix
    movdqu xmm0, [eax]   // fetch 8 pixels of 1555
    psllw xmm1, 1        // R in upper 5 bits
    psllw xmm2, 11       // B in upper 5 bits
    pmulhuw xmm2, xmm5   // * (256 + 8)
    pmulhuw xmm1, xmm5   // * (256 + 8)
    pand xmm0, xmm4      // G in middle 5 bits
    pmulhuw xmm0, xmm6   // << 6 * (256 + 8)
    movdqa [eax * 2 + edx], xmm1       // store 4 pixels of ARGB
    movdqa [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB

__declspec(naked) __declspec(align(16))
void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
                            int pix) {
    mov eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
    movdqa xmm5, xmm4    // 0xf0f0f0f0 for high nibbles
    mov eax, [esp + 4]   // src_argb4444
    mov edx, [esp + 8]   // dst_argb
    mov ecx, [esp + 12]  // pix
    movdqu xmm0, [eax]   // fetch 8 pixels of bgra4444
    pand xmm0, xmm4      // mask low nibbles
    pand xmm2, xmm5      // mask high nibbles
    movdqa [eax * 2 + edx], xmm0       // store 4 pixels of ARGB
    movdqa [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB

__declspec(naked) __declspec(align(16))
void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
    mov eax, [esp + 4]   // src_argb
    mov edx, [esp + 8]   // dst_rgb
    mov ecx, [esp + 12]  // pix
    movdqa xmm6, kShuffleMaskARGBToRGB24
    movdqu xmm0, [eax]   // fetch 16 pixels of argb
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pshufb xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
    movdqa xmm4, xmm1    // 4 bytes from 1 for 0
    psrldq xmm1, 4       // 8 bytes from 1
    pslldq xmm4, 12      // 4 bytes from 1 for 0
    movdqa xmm5, xmm2    // 8 bytes from 2 for 1
    por xmm0, xmm4       // 4 bytes from 1 for 0
    pslldq xmm5, 8       // 8 bytes from 2 for 1
    movdqu [edx], xmm0   // store 0
    por xmm1, xmm5       // 8 bytes from 2 for 1
    psrldq xmm2, 8       // 4 bytes from 2
    pslldq xmm3, 4       // 12 bytes from 3 for 2
    por xmm2, xmm3       // 12 bytes from 3 for 2
    movdqu [edx + 16], xmm1  // store 1
    movdqu [edx + 32], xmm2  // store 2
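// Scalar equivalent of the repack above (illustrative sketch, not part of
// the original file; the function name is hypothetical). Alpha is dropped
// and the remaining B,G,R bytes are packed contiguously: 16 pixels (64
// bytes) in become 48 bytes out per loop iteration of the SIMD version.
static void ARGBToRGB24RowRef(const uint8* src_argb, uint8* dst_rgb, int pix) {
  int x;
  for (x = 0; x < pix; ++x) {
    dst_rgb[3 * x + 0] = src_argb[4 * x + 0];  // B
    dst_rgb[3 * x + 1] = src_argb[4 * x + 1];  // G
    dst_rgb[3 * x + 2] = src_argb[4 * x + 2];  // R
  }
}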
__declspec(naked) __declspec(align(16))
void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
    mov eax, [esp + 4]   // src_argb
    mov edx, [esp + 8]   // dst_rgb
    mov ecx, [esp + 12]  // pix
    movdqa xmm6, kShuffleMaskARGBToRAW
    movdqu xmm0, [eax]   // fetch 16 pixels of argb
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pshufb xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
    movdqa xmm4, xmm1    // 4 bytes from 1 for 0
    psrldq xmm1, 4       // 8 bytes from 1
    pslldq xmm4, 12      // 4 bytes from 1 for 0
    movdqa xmm5, xmm2    // 8 bytes from 2 for 1
    por xmm0, xmm4       // 4 bytes from 1 for 0
    pslldq xmm5, 8       // 8 bytes from 2 for 1
    movdqu [edx], xmm0   // store 0
    por xmm1, xmm5       // 8 bytes from 2 for 1
    psrldq xmm2, 8       // 4 bytes from 2
    pslldq xmm3, 4       // 12 bytes from 3 for 2
    por xmm2, xmm3       // 12 bytes from 3 for 2
    movdqu [edx + 16], xmm1  // store 1
    movdqu [edx + 32], xmm2  // store 2

__declspec(naked) __declspec(align(16))
void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
    mov eax, [esp + 4]   // src_argb
    mov edx, [esp + 8]   // dst_rgb
    mov ecx, [esp + 12]  // pix
    pcmpeqb xmm3, xmm3   // generate mask 0x0000001f
    pcmpeqb xmm4, xmm4   // generate mask 0x000007e0
    pcmpeqb xmm5, xmm5   // generate mask 0xfffff800
    movdqa xmm0, [eax]   // fetch 4 pixels of argb
    movdqa xmm1, xmm0    // B
    movdqa xmm2, xmm0    // G
    por xmm0, xmm1       // BGR
    movq qword ptr [edx], xmm0  // store 4 pixels of RGB565
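// Scalar form of the packing above (illustrative sketch, not part of the
// original file; the function name is hypothetical). Each channel is
// truncated to 5/6/5 bits and merged, as the shift-and-mask sequence does
// for four pixels at a time.
static uint16 ARGBToRGB565PixelRef(const uint8* src_argb) {
  return (uint16)((src_argb[0] >> 3) |         // B
                  ((src_argb[1] >> 2) << 5) |  // G
                  ((src_argb[2] >> 3) << 11)); // R
}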
// TODO(fbarchard): Improve sign extension/packing.
__declspec(naked) __declspec(align(16))
void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
    mov eax, [esp + 4]   // src_argb
    mov edx, [esp + 8]   // dst_rgb
    mov ecx, [esp + 12]  // pix
    pcmpeqb xmm4, xmm4   // generate mask 0x0000001f
    movdqa xmm5, xmm4    // generate mask 0x000003e0
    movdqa xmm6, xmm4    // generate mask 0x00007c00
    pcmpeqb xmm7, xmm7   // generate mask 0xffff8000
    movdqa xmm0, [eax]   // fetch 4 pixels of argb
    movdqa xmm1, xmm0    // B
    movdqa xmm2, xmm0    // G
    movdqa xmm3, xmm0    // R
    por xmm0, xmm2       // BGRA
    movq qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
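// Scalar form of ARGB1555 packing (illustrative sketch, not part of the
// original file; the function name is hypothetical): 5 bits per color
// channel plus a 1-bit alpha in the top bit.
static uint16 ARGBToARGB1555PixelRef(const uint8* src_argb) {
  return (uint16)((src_argb[0] >> 3) |          // B
                  ((src_argb[1] >> 3) << 5) |   // G
                  ((src_argb[2] >> 3) << 10) |  // R
                  ((src_argb[3] >> 7) << 15));  // A
}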
__declspec(naked) __declspec(align(16))
void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
    mov eax, [esp + 4]   // src_argb
    mov edx, [esp + 8]   // dst_rgb
    mov ecx, [esp + 12]  // pix
    pcmpeqb xmm4, xmm4   // generate mask 0xf000f000
    movdqa xmm3, xmm4    // generate mask 0x00f000f0
    movdqa xmm0, [eax]   // fetch 4 pixels of argb
    pand xmm0, xmm3      // low nibble
    pand xmm1, xmm4      // high nibble
    movq qword ptr [edx], xmm0  // store 4 pixels of ARGB4444
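// Scalar form of ARGB4444 packing (illustrative sketch, not part of the
// original file; the function name is hypothetical): keep the high nibble
// of each channel, as the nibble masks above do for whole registers.
static uint16 ARGBToARGB4444PixelRef(const uint8* src_argb) {
  return (uint16)((src_argb[0] >> 4) |          // B
                  ((src_argb[1] >> 4) << 4) |   // G
                  ((src_argb[2] >> 4) << 8) |   // R
                  ((src_argb[3] >> 4) << 12));  // A
}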
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
__declspec(naked) __declspec(align(16))
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
    mov eax, [esp + 4]   /* src_argb */
    mov edx, [esp + 8]   /* dst_y */
    mov ecx, [esp + 12]  /* pix */
    movdqa xmm4, kARGBToY
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]

// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
__declspec(naked) __declspec(align(16))
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
    mov eax, [esp + 4]   /* src_argb */
    mov edx, [esp + 8]   /* dst_y */
    mov ecx, [esp + 12]  /* pix */
    movdqa xmm4, kARGBToYJ
    movdqa xmm5, kAddYJ64
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]
    paddw xmm0, xmm5  // Add .5 for rounding.
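// Scalar form of the JPEG-range luma above (illustrative sketch, not part
// of the original file; the function name is hypothetical). Full-range
// coefficients from kARGBToYJ, +64 from kAddYJ64 for rounding before the
// shift, and no +16 offset afterwards:
static __inline uint8 RGBToYJRef(uint8 r, uint8 g, uint8 b) {
  return (uint8)((38 * r + 75 * g + 15 * b + 64) >> 7);
}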
#ifdef HAS_ARGBTOYROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
__declspec(naked) __declspec(align(32))
void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
    mov eax, [esp + 4]   /* src_argb */
    mov edx, [esp + 8]   /* dst_y */
    mov ecx, [esp + 12]  /* pix */
    vbroadcastf128 ymm4, kARGBToY
    vbroadcastf128 ymm5, kAddY16
    vmovdqa ymm6, kPermdARGBToY_AVX
    vmovdqu ymm1, [eax + 32]
    vmovdqu ymm2, [eax + 64]
    vmovdqu ymm3, [eax + 96]
    vpmaddubsw ymm0, ymm0, ymm4
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    vphaddw ymm0, ymm0, ymm1  // mutates.
    vphaddw ymm2, ymm2, ymm3
    vpackuswb ymm0, ymm0, ymm2  // mutates.
    vpermd ymm0, ymm6, ymm0     // For vphaddw + vpackuswb mutation.
    vpaddb ymm0, ymm0, ymm5
#endif  // HAS_ARGBTOYROW_AVX2
#ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
__declspec(naked) __declspec(align(32))
void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
    mov eax, [esp + 4]   /* src_argb */
    mov edx, [esp + 8]   /* dst_y */
    mov ecx, [esp + 12]  /* pix */
    vbroadcastf128 ymm4, kARGBToYJ
    vbroadcastf128 ymm5, kAddYJ64
    vmovdqa ymm6, kPermdARGBToY_AVX
    vmovdqu ymm1, [eax + 32]
    vmovdqu ymm2, [eax + 64]
    vmovdqu ymm3, [eax + 96]
    vpmaddubsw ymm0, ymm0, ymm4
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    vphaddw ymm0, ymm0, ymm1  // mutates.
    vphaddw ymm2, ymm2, ymm3
    vpaddw ymm0, ymm0, ymm5   // Add .5 for rounding.
    vpaddw ymm2, ymm2, ymm5
    vpackuswb ymm0, ymm0, ymm2  // mutates.
    vpermd ymm0, ymm6, ymm0     // For vphaddw + vpackuswb mutation.
#endif  // HAS_ARGBTOYJROW_AVX2
__declspec(naked) __declspec(align(16))
void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
    mov eax, [esp + 4]   /* src_argb */
    mov edx, [esp + 8]   /* dst_y */
    mov ecx, [esp + 12]  /* pix */
    movdqa xmm4, kARGBToY
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]

__declspec(naked) __declspec(align(16))
void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
    mov eax, [esp + 4]   /* src_argb */
    mov edx, [esp + 8]   /* dst_y */
    mov ecx, [esp + 12]  /* pix */
    movdqa xmm4, kARGBToYJ
    movdqa xmm5, kAddYJ64
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]

__declspec(naked) __declspec(align(16))
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
    mov eax, [esp + 4]   /* src_argb */
    mov edx, [esp + 8]   /* dst_y */
    mov ecx, [esp + 12]  /* pix */
    movdqa xmm4, kBGRAToY
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]

__declspec(naked) __declspec(align(16))
void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
    mov eax, [esp + 4]   /* src_argb */
    mov edx, [esp + 8]   /* dst_y */
    mov ecx, [esp + 12]  /* pix */
    movdqa xmm4, kBGRAToY
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]

__declspec(naked) __declspec(align(16))
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
    mov eax, [esp + 4]   /* src_argb */
    mov edx, [esp + 8]   /* dst_y */
    mov ecx, [esp + 12]  /* pix */
    movdqa xmm4, kABGRToY
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]

__declspec(naked) __declspec(align(16))
void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
    mov eax, [esp + 4]   /* src_argb */
    mov edx, [esp + 8]   /* dst_y */
    mov ecx, [esp + 12]  /* pix */
    movdqa xmm4, kABGRToY
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]

__declspec(naked) __declspec(align(16))
void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
    mov eax, [esp + 4]   /* src_argb */
    mov edx, [esp + 8]   /* dst_y */
    mov ecx, [esp + 12]  /* pix */
    movdqa xmm4, kRGBAToY
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]

__declspec(naked) __declspec(align(16))
void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
    mov eax, [esp + 4]   /* src_argb */
    mov edx, [esp + 8]   /* dst_y */
    mov ecx, [esp + 12]  /* pix */
    movdqa xmm5, kAddY16
    movdqa xmm4, kRGBAToY
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
__declspec(naked) __declspec(align(16))
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
    mov eax, [esp + 8 + 4]   // src_argb
    mov esi, [esp + 8 + 8]   // src_stride_argb
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // pix
    movdqa xmm7, kARGBToU
    movdqa xmm6, kARGBToV
    movdqa xmm5, kAddUV128
    sub edi, edx  // stride from u to v

    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]
    pavgb xmm0, [eax + esi]
    pavgb xmm1, [eax + esi + 16]
    pavgb xmm2, [eax + esi + 32]
    pavgb xmm3, [eax + esi + 48]
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    pmaddubsw xmm0, xmm7  // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6  // V
    pmaddubsw xmm3, xmm6
    paddb xmm0, xmm5      // -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps qword ptr [edx], xmm0        // U
    movhps qword ptr [edx + edi], xmm0  // V
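// Scalar outline of the subsample feeding the U/V math above (illustrative
// sketch, not part of the original file; the function name is hypothetical).
// Each 2x2 block of ARGB is averaged (two rows via pavgb, then two columns
// via the shufps/pavgb pair) and the kARGBToU/kARGBToV dot products are
// applied to the averaged pixel. Note the SIMD path rounds each pavgb
// separately, so its result can differ from this single rounded average by
// one code value.
static void ARGBToUVPixelRef(const uint8* argb_row0, const uint8* argb_row1,
                             uint8* dst_u, uint8* dst_v) {
  int b = (argb_row0[0] + argb_row0[4] + argb_row1[0] + argb_row1[4] + 2) >> 2;
  int g = (argb_row0[1] + argb_row0[5] + argb_row1[1] + argb_row1[5] + 2) >> 2;
  int r = (argb_row0[2] + argb_row0[6] + argb_row1[2] + argb_row1[6] + 2) >> 2;
  *dst_u = (uint8)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
  *dst_v = (uint8)(((112 * r - 94 * g - 18 * b) >> 8) + 128);
}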
__declspec(naked) __declspec(align(16))
void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
    mov eax, [esp + 8 + 4]   // src_argb
    mov esi, [esp + 8 + 8]   // src_stride_argb
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // pix
    movdqa xmm7, kARGBToUJ
    movdqa xmm6, kARGBToVJ
    movdqa xmm5, kAddUVJ128
    sub edi, edx  // stride from u to v

    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]
    pavgb xmm0, [eax + esi]
    pavgb xmm1, [eax + esi + 16]
    pavgb xmm2, [eax + esi + 32]
    pavgb xmm3, [eax + esi + 48]
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    pmaddubsw xmm0, xmm7  // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6  // V
    pmaddubsw xmm3, xmm6
    paddw xmm0, xmm5      // +.5 rounding -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps qword ptr [edx], xmm0        // U
    movhps qword ptr [edx + edi], xmm0  // V

#ifdef HAS_ARGBTOUVROW_AVX2
__declspec(naked) __declspec(align(32))
void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int width) {
    mov eax, [esp + 8 + 4]   // src_argb
    mov esi, [esp + 8 + 8]   // src_stride_argb
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // pix
    vbroadcastf128 ymm5, kAddUV128
    vbroadcastf128 ymm6, kARGBToV
    vbroadcastf128 ymm7, kARGBToU
    sub edi, edx  // stride from u to v

    /* step 1 - subsample 32x2 argb pixels to 16x1 */
    vmovdqu ymm1, [eax + 32]
    vmovdqu ymm2, [eax + 64]
    vmovdqu ymm3, [eax + 96]
    vpavgb ymm0, ymm0, [eax + esi]
    vpavgb ymm1, ymm1, [eax + esi + 32]
    vpavgb ymm2, ymm2, [eax + esi + 64]
    vpavgb ymm3, ymm3, [eax + esi + 96]
    lea eax, [eax + 128]
    vshufps ymm4, ymm0, ymm1, 0x88
    vshufps ymm0, ymm0, ymm1, 0xdd
    vpavgb ymm0, ymm0, ymm4  // mutated by vshufps
    vshufps ymm4, ymm2, ymm3, 0x88
    vshufps ymm2, ymm2, ymm3, 0xdd
    vpavgb ymm2, ymm2, ymm4  // mutated by vshufps

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 32 different pixels, it's 16 pixels of U and 16 of V
    vpmaddubsw ymm1, ymm0, ymm7  // U
    vpmaddubsw ymm3, ymm2, ymm7
    vpmaddubsw ymm0, ymm0, ymm6  // V
    vpmaddubsw ymm2, ymm2, ymm6
    vphaddw ymm1, ymm1, ymm3  // mutates
    vphaddw ymm0, ymm0, ymm2
    vpsraw ymm1, ymm1, 8
    vpsraw ymm0, ymm0, 8
    vpacksswb ymm0, ymm1, ymm0  // mutates
    vpermq ymm0, ymm0, 0xd8     // For vpacksswb
    vpshufb ymm0, ymm0, kShufARGBToUV_AVX  // For vshufps + vphaddw
    vpaddb ymm0, ymm0, ymm5     // -> unsigned

    // step 3 - store 16 U and 16 V values
    vextractf128 [edx], ymm0, 0        // U
    vextractf128 [edx + edi], ymm0, 1  // V
#endif  // HAS_ARGBTOUVROW_AVX2
__declspec(naked) __declspec(align(16))
void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
    mov eax, [esp + 8 + 4]   // src_argb
    mov esi, [esp + 8 + 8]   // src_stride_argb
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // pix
    movdqa xmm7, kARGBToU
    movdqa xmm6, kARGBToV
    movdqa xmm5, kAddUV128
    sub edi, edx  // stride from u to v

    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    movdqu xmm4, [eax + esi]
    movdqu xmm4, [eax + esi + 16]
    movdqu xmm4, [eax + esi + 32]
    movdqu xmm4, [eax + esi + 48]
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    pmaddubsw xmm0, xmm7  // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6  // V
    pmaddubsw xmm3, xmm6
    paddb xmm0, xmm5      // -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps qword ptr [edx], xmm0        // U
    movhps qword ptr [edx + edi], xmm0  // V

__declspec(naked) __declspec(align(16))
void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                  uint8* dst_u, uint8* dst_v, int width) {
    mov eax, [esp + 8 + 4]   // src_argb
    mov esi, [esp + 8 + 8]   // src_stride_argb
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // pix
    movdqa xmm7, kARGBToUJ
    movdqa xmm6, kARGBToVJ
    movdqa xmm5, kAddUVJ128
    sub edi, edx  // stride from u to v

    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    movdqu xmm4, [eax + esi]
    movdqu xmm4, [eax + esi + 16]
    movdqu xmm4, [eax + esi + 32]
    movdqu xmm4, [eax + esi + 48]
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    pmaddubsw xmm0, xmm7  // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6  // V
    pmaddubsw xmm3, xmm6
    paddw xmm0, xmm5      // +.5 rounding -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps qword ptr [edx], xmm0        // U
    movhps qword ptr [edx + edi], xmm0  // V
__declspec(naked) __declspec(align(16))
void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
                          uint8* dst_u, uint8* dst_v, int width) {
    mov eax, [esp + 4 + 4]   // src_argb
    mov edx, [esp + 4 + 8]   // dst_u
    mov edi, [esp + 4 + 12]  // dst_v
    mov ecx, [esp + 4 + 16]  // pix
    movdqa xmm7, kARGBToU
    movdqa xmm6, kARGBToV
    movdqa xmm5, kAddUV128
    sub edi, edx  // stride from u to v

    /* convert to U and V */
    movdqa xmm0, [eax]  // U
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]
    pmaddubsw xmm0, xmm7
    pmaddubsw xmm1, xmm7
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm3, xmm7
    movdqa xmm0, [eax]  // V
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]
    pmaddubsw xmm0, xmm6
    pmaddubsw xmm1, xmm6
    pmaddubsw xmm2, xmm6
    pmaddubsw xmm3, xmm6
    movdqa [edx + edi], xmm0

__declspec(naked) __declspec(align(16))
void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0,
                                    uint8* dst_u, uint8* dst_v, int width) {
    mov eax, [esp + 4 + 4]   // src_argb
    mov edx, [esp + 4 + 8]   // dst_u
    mov edi, [esp + 4 + 12]  // dst_v
    mov ecx, [esp + 4 + 16]  // pix
    movdqa xmm7, kARGBToU
    movdqa xmm6, kARGBToV
    movdqa xmm5, kAddUV128
    sub edi, edx  // stride from u to v

    /* convert to U and V */
    movdqu xmm0, [eax]  // U
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm7
    pmaddubsw xmm1, xmm7
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm3, xmm7
    movdqu xmm0, [eax]  // V
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm6
    pmaddubsw xmm1, xmm6
    pmaddubsw xmm2, xmm6
    pmaddubsw xmm3, xmm6
    movdqu [edx + edi], xmm0
__declspec(naked) __declspec(align(16))
void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
                          uint8* dst_u, uint8* dst_v, int width) {
    mov eax, [esp + 4 + 4]   // src_argb
    mov edx, [esp + 4 + 8]   // dst_u
    mov edi, [esp + 4 + 12]  // dst_v
    mov ecx, [esp + 4 + 16]  // pix
    movdqa xmm7, kARGBToU
    movdqa xmm6, kARGBToV
    movdqa xmm5, kAddUV128
    sub edi, edx  // stride from u to v

    /* step 1 - subsample 16x1 argb pixels to 8x1 */
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    pmaddubsw xmm0, xmm7  // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6  // V
    pmaddubsw xmm3, xmm6
    paddb xmm0, xmm5      // -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps qword ptr [edx], xmm0        // U
    movhps qword ptr [edx + edi], xmm0  // V

__declspec(naked) __declspec(align(16))
void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
                                    uint8* dst_u, uint8* dst_v, int width) {
    mov eax, [esp + 4 + 4]   // src_argb
    mov edx, [esp + 4 + 8]   // dst_u
    mov edi, [esp + 4 + 12]  // dst_v
    mov ecx, [esp + 4 + 16]  // pix
    movdqa xmm7, kARGBToU
    movdqa xmm6, kARGBToV
    movdqa xmm5, kAddUV128
    sub edi, edx  // stride from u to v

    /* step 1 - subsample 16x1 argb pixels to 8x1 */
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    pmaddubsw xmm0, xmm7  // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6  // V
    pmaddubsw xmm3, xmm6
    paddb xmm0, xmm5      // -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps qword ptr [edx], xmm0        // U
    movhps qword ptr [edx + edi], xmm0  // V
__declspec(naked) __declspec(align(16))
void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
    mov eax, [esp + 8 + 4]   // src_argb
    mov esi, [esp + 8 + 8]   // src_stride_argb
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // pix
    movdqa xmm7, kBGRAToU
    movdqa xmm6, kBGRAToV
    movdqa xmm5, kAddUV128
    sub edi, edx  // stride from u to v

    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]
    pavgb xmm0, [eax + esi]
    pavgb xmm1, [eax + esi + 16]
    pavgb xmm2, [eax + esi + 32]
    pavgb xmm3, [eax + esi + 48]
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    pmaddubsw xmm0, xmm7  // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6  // V
    pmaddubsw xmm3, xmm6
    paddb xmm0, xmm5      // -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps qword ptr [edx], xmm0        // U
    movhps qword ptr [edx + edi], xmm0  // V

__declspec(naked) __declspec(align(16))
void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
    mov eax, [esp + 8 + 4]   // src_argb
    mov esi, [esp + 8 + 8]   // src_stride_argb
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // pix
    movdqa xmm7, kBGRAToU
    movdqa xmm6, kBGRAToV
    movdqa xmm5, kAddUV128
    sub edi, edx  // stride from u to v

    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    movdqu xmm4, [eax + esi]
    movdqu xmm4, [eax + esi + 16]
    movdqu xmm4, [eax + esi + 32]
    movdqu xmm4, [eax + esi + 48]
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    pmaddubsw xmm0, xmm7  // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6  // V
    pmaddubsw xmm3, xmm6
    paddb xmm0, xmm5      // -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps qword ptr [edx], xmm0        // U
    movhps qword ptr [edx + edi], xmm0  // V

__declspec(naked) __declspec(align(16))
void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
    mov eax, [esp + 8 + 4]   // src_argb
    mov esi, [esp + 8 + 8]   // src_stride_argb
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // pix
    movdqa xmm7, kABGRToU
    movdqa xmm6, kABGRToV
    movdqa xmm5, kAddUV128
    sub edi, edx  // stride from u to v

    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]
    pavgb xmm0, [eax + esi]
    pavgb xmm1, [eax + esi + 16]
    pavgb xmm2, [eax + esi + 32]
    pavgb xmm3, [eax + esi + 48]
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    pmaddubsw xmm0, xmm7  // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6  // V
    pmaddubsw xmm3, xmm6
    paddb xmm0, xmm5      // -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps qword ptr [edx], xmm0        // U
    movhps qword ptr [edx + edi], xmm0  // V

__declspec(naked) __declspec(align(16))
void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
    mov eax, [esp + 8 + 4]   // src_argb
    mov esi, [esp + 8 + 8]   // src_stride_argb
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // pix
    movdqa xmm7, kABGRToU
    movdqa xmm6, kABGRToV
    movdqa xmm5, kAddUV128
    sub edi, edx  // stride from u to v

    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    movdqu xmm4, [eax + esi]
    movdqu xmm4, [eax + esi + 16]
    movdqu xmm4, [eax + esi + 32]
    movdqu xmm4, [eax + esi + 48]
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    pmaddubsw xmm0, xmm7  // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6  // V
    pmaddubsw xmm3, xmm6
    paddb xmm0, xmm5      // -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps qword ptr [edx], xmm0        // U
    movhps qword ptr [edx + edi], xmm0  // V

__declspec(naked) __declspec(align(16))
void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
    mov eax, [esp + 8 + 4]   // src_argb
    mov esi, [esp + 8 + 8]   // src_stride_argb
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // pix
    movdqa xmm7, kRGBAToU
    movdqa xmm6, kRGBAToV
    movdqa xmm5, kAddUV128
    sub edi, edx  // stride from u to v

    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]
    pavgb xmm0, [eax + esi]
    pavgb xmm1, [eax + esi + 16]
    pavgb xmm2, [eax + esi + 32]
    pavgb xmm3, [eax + esi + 48]
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    pmaddubsw xmm0, xmm7  // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6  // V
    pmaddubsw xmm3, xmm6
    paddb xmm0, xmm5      // -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps qword ptr [edx], xmm0        // U
    movhps qword ptr [edx + edi], xmm0  // V

__declspec(naked) __declspec(align(16))
void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
    mov eax, [esp + 8 + 4]   // src_argb
    mov esi, [esp + 8 + 8]   // src_stride_argb
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // pix
    movdqa xmm7, kRGBAToU
    movdqa xmm6, kRGBAToV
    movdqa xmm5, kAddUV128
    sub edi, edx  // stride from u to v

    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    movdqu xmm4, [eax + esi]
    movdqu xmm4, [eax + esi + 16]
    movdqu xmm4, [eax + esi + 32]
    movdqu xmm4, [eax + esi + 48]
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    pmaddubsw xmm0, xmm7  // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6  // V
    pmaddubsw xmm3, xmm6
    paddb xmm0, xmm5      // -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps qword ptr [edx], xmm0        // U
    movhps qword ptr [edx + edi], xmm0  // V
#endif  // HAS_ARGBTOYROW_SSSE3
#define YG 74 /* (int8)(1.164 * 64 + 0.5) */

#define UB 127 /* min(127, (int8)(2.018 * 64)) */
#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
#define VR 102 /* (int8)(1.596 * 64 + 0.5) */

#define BB UB * 128 + VB * 128
#define BG UG * 128 + VG * 128
#define BR UR * 128 + VR * 128
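// Scalar form of the fixed-point YUV->RGB math used below (illustrative
// sketch, not part of the original file; the function names are
// hypothetical). pmaddubsw treats U and V as unsigned bytes, so the
// BB/BG/BR biases subtract the coefficient * 128 terms to recenter UV
// around zero, and the final psraw by 6 undoes the 64x coefficient scale.
static __inline uint8 ClampRef(int v) {
  return (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
}
static void YuvPixelRef(uint8 y, uint8 u, uint8 v,
                        uint8* b, uint8* g, uint8* r) {
  int y1 = ((int)y - 16) * YG;  // YG = 74, roughly 1.164 * 64.
  *b = ClampRef((y1 + UB * ((int)u - 128)) >> 6);
  *g = ClampRef((y1 + UG * ((int)u - 128) + VG * ((int)v - 128)) >> 6);
  *r = ClampRef((y1 + VR * ((int)v - 128)) >> 6);
}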
#ifdef HAS_I422TOARGBROW_AVX2

static const lvec8 kUVToB_AVX = {
  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB,
  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
};
static const lvec8 kUVToR_AVX = {
  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR,
  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
};
static const lvec8 kUVToG_AVX = {
  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
};
static const lvec16 kYToRgb_AVX = {
  YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG
};
static const lvec16 kYSub16_AVX = {
  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
};
static const lvec16 kUVBiasB_AVX = {
  BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB
};
static const lvec16 kUVBiasG_AVX = {
  BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG
};
static const lvec16 kUVBiasR_AVX = {
  BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR
};

// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
__declspec(naked) __declspec(align(16))
void I422ToARGBRow_AVX2(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* dst_argb,
                        int width) {
    mov eax, [esp + 8 + 4]   // Y
    mov esi, [esp + 8 + 8]   // U
    mov edi, [esp + 8 + 12]  // V
    mov edx, [esp + 8 + 16]  // argb
    mov ecx, [esp + 8 + 20]  // width
    vpcmpeqb ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
    vpxor ymm4, ymm4, ymm4

    vmovq xmm0, qword ptr [esi]        // U
    vmovq xmm1, qword ptr [esi + edi]  // V
    vpunpcklbw ymm0, ymm0, ymm1        // UV
    vpermq ymm0, ymm0, 0xd8
    vpunpcklwd ymm0, ymm0, ymm0        // UVUV
    vpmaddubsw ymm2, ymm0, kUVToB_AVX  // scale B UV
    vpmaddubsw ymm1, ymm0, kUVToG_AVX  // scale G UV
    vpmaddubsw ymm0, ymm0, kUVToR_AVX  // scale R UV
    vpsubw ymm2, ymm2, kUVBiasB_AVX    // unbias back to signed
    vpsubw ymm1, ymm1, kUVBiasG_AVX
    vpsubw ymm0, ymm0, kUVBiasR_AVX

    // Step 2: Find Y contribution to 16 R,G,B values
    vmovdqu xmm3, [eax]  // NOLINT
    vpermq ymm3, ymm3, 0xd8
    vpunpcklbw ymm3, ymm3, ymm4
    vpsubsw ymm3, ymm3, kYSub16_AVX
    vpmullw ymm3, ymm3, kYToRgb_AVX
    vpaddsw ymm2, ymm2, ymm3  // B += Y
    vpaddsw ymm1, ymm1, ymm3  // G += Y
    vpaddsw ymm0, ymm0, ymm3  // R += Y
    vpsraw ymm2, ymm2, 6
    vpsraw ymm1, ymm1, 6
    vpsraw ymm0, ymm0, 6
    vpackuswb ymm2, ymm2, ymm2  // B
    vpackuswb ymm1, ymm1, ymm1  // G
    vpackuswb ymm0, ymm0, ymm0  // R

    // Step 3: Weave into ARGB
    vpunpcklbw ymm2, ymm2, ymm1  // BG
    vpermq ymm2, ymm2, 0xd8
    vpunpcklbw ymm0, ymm0, ymm5  // RA
    vpermq ymm0, ymm0, 0xd8
    vpunpcklwd ymm1, ymm2, ymm0  // BGRA first 8 pixels
    vpunpckhwd ymm2, ymm2, ymm0  // BGRA next 8 pixels
    vmovdqu [edx + 32], ymm2
#endif  // HAS_I422TOARGBROW_AVX2

#ifdef HAS_I422TOARGBROW_SSSE3

static const vec8 kUVToB = {
  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
};
static const vec8 kUVToR = {
  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
};
static const vec8 kUVToG = {
  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
};
static const vec8 kVUToB = {
  VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB
};
static const vec8 kVUToR = {
  VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR
};
static const vec8 kVUToG = {
  VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG
};
static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
// TODO(fbarchard): Read that does half size on Y and treats 420 as 444.

// Read 8 UV from 444.
#define READYUV444 __asm {                                        \
    __asm movq xmm0, qword ptr [esi] /* U */ /* NOLINT */         \
    __asm movq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */   \
    __asm lea esi, [esi + 8]                                      \
    __asm punpcklbw xmm0, xmm1 /* UV */                           \
  }

// Read 4 UV from 422, upsample to 8 UV.
#define READYUV422 __asm {                                        \
    __asm movd xmm0, [esi] /* U */                                \
    __asm movd xmm1, [esi + edi] /* V */                          \
    __asm lea esi, [esi + 4]                                      \
    __asm punpcklbw xmm0, xmm1 /* UV */                           \
    __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */              \
  }

// Read 2 UV from 411, upsample to 8 UV.
#define READYUV411 __asm {                                        \
    __asm movzx ebx, word ptr [esi] /* U */ /* NOLINT */          \
    __asm movd xmm0, ebx                                          \
    __asm movzx ebx, word ptr [esi + edi] /* V */ /* NOLINT */    \
    __asm movd xmm1, ebx                                          \
    __asm lea esi, [esi + 2]                                      \
    __asm punpcklbw xmm0, xmm1 /* UV */                           \
    __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */              \
    __asm punpckldq xmm0, xmm0 /* UVUV (upsample) */              \
  }

// Read 4 UV from NV12, upsample to 8 UV.
#define READNV12 __asm {                                          \
    __asm movq xmm0, qword ptr [esi] /* UV */ /* NOLINT */        \
    __asm lea esi, [esi + 8]                                      \
    __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */              \
  }

// Convert 8 pixels: 8 UV and 8 Y.
#define YUVTORGB __asm {                                          \
    /* Step 1: Find 4 UV contributions to 8 R,G,B values */       \
    __asm movdqa xmm1, xmm0                                       \
    __asm movdqa xmm2, xmm0                                       \
    __asm pmaddubsw xmm0, kUVToB /* scale B UV */                 \
    __asm pmaddubsw xmm1, kUVToG /* scale G UV */                 \
    __asm pmaddubsw xmm2, kUVToR /* scale R UV */                 \
    __asm psubw xmm0, kUVBiasB /* unbias back to signed */        \
    __asm psubw xmm1, kUVBiasG                                    \
    __asm psubw xmm2, kUVBiasR                                    \
    /* Step 2: Find Y contribution to 8 R,G,B values */           \
    __asm movq xmm3, qword ptr [eax] /* NOLINT */                 \
    __asm lea eax, [eax + 8]                                      \
    __asm punpcklbw xmm3, xmm4                                    \
    __asm psubsw xmm3, kYSub16                                    \
    __asm pmullw xmm3, kYToRgb                                    \
    __asm paddsw xmm0, xmm3 /* B += Y */                          \
    __asm paddsw xmm1, xmm3 /* G += Y */                          \
    __asm paddsw xmm2, xmm3 /* R += Y */                          \
    __asm psraw xmm0, 6                                           \
    __asm psraw xmm1, 6                                           \
    __asm psraw xmm2, 6                                           \
    __asm packuswb xmm0, xmm0 /* B */                             \
    __asm packuswb xmm1, xmm1 /* G */                             \
    __asm packuswb xmm2, xmm2 /* R */                             \
  }

// Convert 8 pixels: 8 VU and 8 Y.
#define YVUTORGB __asm {                                          \
    /* Step 1: Find 4 UV contributions to 8 R,G,B values */       \
    __asm movdqa xmm1, xmm0                                       \
    __asm movdqa xmm2, xmm0                                       \
    __asm pmaddubsw xmm0, kVUToB /* scale B UV */                 \
    __asm pmaddubsw xmm1, kVUToG /* scale G UV */                 \
    __asm pmaddubsw xmm2, kVUToR /* scale R UV */                 \
    __asm psubw xmm0, kUVBiasB /* unbias back to signed */        \
    __asm psubw xmm1, kUVBiasG                                    \
    __asm psubw xmm2, kUVBiasR                                    \
    /* Step 2: Find Y contribution to 8 R,G,B values */           \
    __asm movq xmm3, qword ptr [eax] /* NOLINT */                 \
    __asm lea eax, [eax + 8]                                      \
    __asm punpcklbw xmm3, xmm4                                    \
    __asm psubsw xmm3, kYSub16                                    \
    __asm pmullw xmm3, kYToRgb                                    \
    __asm paddsw xmm0, xmm3 /* B += Y */                          \
    __asm paddsw xmm1, xmm3 /* G += Y */                          \
    __asm paddsw xmm2, xmm3 /* R += Y */                          \
    __asm psraw xmm0, 6                                           \
    __asm psraw xmm1, 6                                           \
    __asm psraw xmm2, 6                                           \
    __asm packuswb xmm0, xmm0 /* B */                             \
    __asm packuswb xmm1, xmm1 /* G */                             \
    __asm packuswb xmm2, xmm2 /* R */                             \
  }
// 8 pixels, dest aligned 16.
// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void I444ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* dst_argb,
                         int width) {
    mov eax, [esp + 8 + 4]   // Y
    mov esi, [esp + 8 + 8]   // U
    mov edi, [esp + 8 + 12]  // V
    mov edx, [esp + 8 + 16]  // argb
    mov ecx, [esp + 8 + 20]  // width
    pcmpeqb xmm5, xmm5       // generate 0xffffffff for alpha

    // Step 3: Weave into ARGB
    punpcklbw xmm0, xmm1  // BG
    punpcklbw xmm2, xmm5  // RA
    punpcklwd xmm0, xmm2  // BGRA first 4 pixels
    punpckhwd xmm1, xmm2  // BGRA next 4 pixels
    movdqa [edx + 16], xmm1

// 8 pixels, dest aligned 16.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
__declspec(naked) __declspec(align(16))
void I422ToRGB24Row_SSSE3(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
                          uint8* dst_rgb24,
                          int width) {
    mov eax, [esp + 8 + 4]   // Y
    mov esi, [esp + 8 + 8]   // U
    mov edi, [esp + 8 + 12]  // V
    mov edx, [esp + 8 + 16]  // rgb24
    mov ecx, [esp + 8 + 20]  // width
    movdqa xmm5, kShuffleMaskARGBToRGB24_0
    movdqa xmm6, kShuffleMaskARGBToRGB24

    // Step 3: Weave into RRGB
    punpcklbw xmm0, xmm1    // BG
    punpcklbw xmm2, xmm2    // RR
    punpcklwd xmm0, xmm2    // BGRR first 4 pixels
    punpckhwd xmm1, xmm2    // BGRR next 4 pixels
    pshufb xmm0, xmm5       // Pack into first 8 and last 4 bytes.
    pshufb xmm1, xmm6       // Pack into first 12 bytes.
    palignr xmm1, xmm0, 12  // last 4 bytes of xmm0 + 12 from xmm1
    movq qword ptr [edx], xmm0  // First 8 bytes
    movdqu [edx + 8], xmm1      // Last 16 bytes = 24 bytes, 8 RGB pixels.

// 8 pixels, dest aligned 16.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes).
__declspec(naked) __declspec(align(16))
void I422ToRAWRow_SSSE3(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* dst_raw,
                        int width) {
    mov eax, [esp + 8 + 4]   // Y
    mov esi, [esp + 8 + 8]   // U
    mov edi, [esp + 8 + 12]  // V
    mov edx, [esp + 8 + 16]  // raw
    mov ecx, [esp + 8 + 20]  // width
    movdqa xmm5, kShuffleMaskARGBToRAW_0
    movdqa xmm6, kShuffleMaskARGBToRAW

    // Step 3: Weave into RRGB
    punpcklbw xmm0, xmm1    // BG
    punpcklbw xmm2, xmm2    // RR
    punpcklwd xmm0, xmm2    // BGRR first 4 pixels
    punpckhwd xmm1, xmm2    // BGRR next 4 pixels
    pshufb xmm0, xmm5       // Pack into first 8 and last 4 bytes.
    pshufb xmm1, xmm6       // Pack into first 12 bytes.
    palignr xmm1, xmm0, 12  // last 4 bytes of xmm0 + 12 from xmm1
    movq qword ptr [edx], xmm0  // First 8 bytes
    movdqu [edx + 8], xmm1      // Last 16 bytes = 24 bytes, 8 RGB pixels.

// 8 pixels, dest unaligned.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
__declspec(naked) __declspec(align(16))
void I422ToRGB565Row_SSSE3(const uint8* y_buf,
                           const uint8* u_buf,
                           const uint8* v_buf,
                           uint8* dst_rgb565,
                           int width) {
    mov eax, [esp + 8 + 4]   // Y
    mov esi, [esp + 8 + 8]   // U
    mov edi, [esp + 8 + 12]  // V
    mov edx, [esp + 8 + 16]  // rgb565
    mov ecx, [esp + 8 + 20]  // width
    pcmpeqb xmm5, xmm5  // generate mask 0x0000001f
    pcmpeqb xmm6, xmm6  // generate mask 0x000007e0
    pcmpeqb xmm7, xmm7  // generate mask 0xfffff800

    // Step 3: Weave into RRGB
    punpcklbw xmm0, xmm1  // BG
    punpcklbw xmm2, xmm2  // RR
    punpcklwd xmm0, xmm2  // BGRR first 4 pixels
    punpckhwd xmm1, xmm2  // BGRR next 4 pixels

    // Step 3b: RRGB -> RGB565
    movdqa xmm3, xmm0  // B first 4 pixels of argb
    movdqa xmm2, xmm0  // G
    pand xmm3, xmm5    // B
    pand xmm2, xmm6    // G
    pand xmm0, xmm7    // R
    por xmm3, xmm2     // BG
    por xmm0, xmm3     // BGR
    movdqa xmm3, xmm1  // B next 4 pixels of argb
    movdqa xmm2, xmm1  // G
    pand xmm3, xmm5    // B
    pand xmm2, xmm6    // G
    pand xmm1, xmm7    // R
    por xmm3, xmm2     // BG
    por xmm1, xmm3     // BGR
    movdqu [edx], xmm0  // store 8 pixels of RGB565

// 8 pixels, dest aligned 16.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void I422ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* dst_argb,
                         int width) {
    mov eax, [esp + 8 + 4]   // Y
    mov esi, [esp + 8 + 8]   // U
    mov edi, [esp + 8 + 12]  // V
    mov edx, [esp + 8 + 16]  // argb
    mov ecx, [esp + 8 + 20]  // width
    pcmpeqb xmm5, xmm5       // generate 0xffffffff for alpha

    // Step 3: Weave into ARGB
    punpcklbw xmm0, xmm1  // BG
    punpcklbw xmm2, xmm5  // RA
    punpcklwd xmm0, xmm2  // BGRA first 4 pixels
    punpckhwd xmm1, xmm2  // BGRA next 4 pixels
    movdqa [edx + 16], xmm1

// 8 pixels, dest aligned 16.
// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// Similar to I420 but duplicate UV once more.
__declspec(naked) __declspec(align(16))
void I411ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* dst_argb,
                         int width) {
    mov eax, [esp + 12 + 4]   // Y
    mov esi, [esp + 12 + 8]   // U
    mov edi, [esp + 12 + 12]  // V
    mov edx, [esp + 12 + 16]  // argb
    mov ecx, [esp + 12 + 20]  // width
    pcmpeqb xmm5, xmm5        // generate 0xffffffff for alpha
    READYUV411                // modifies EBX

    // Step 3: Weave into ARGB
    punpcklbw xmm0, xmm1  // BG
    punpcklbw xmm2, xmm5  // RA
    punpcklwd xmm0, xmm2  // BGRA first 4 pixels
    punpckhwd xmm1, xmm2  // BGRA next 4 pixels
    movdqa [edx + 16], xmm1

// 8 pixels, dest aligned 16.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void NV12ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* uv_buf,
                         uint8* dst_argb,
                         int width) {
    mov eax, [esp + 4 + 4]   // Y
    mov esi, [esp + 4 + 8]   // UV
    mov edx, [esp + 4 + 12]  // argb
    mov ecx, [esp + 4 + 16]  // width
    pcmpeqb xmm5, xmm5       // generate 0xffffffff for alpha

    // Step 3: Weave into ARGB
    punpcklbw xmm0, xmm1  // BG
    punpcklbw xmm2, xmm5  // RA
    punpcklwd xmm0, xmm2  // BGRA first 4 pixels
    punpckhwd xmm1, xmm2  // BGRA next 4 pixels
    movdqa [edx + 16], xmm1

// 8 pixels, dest aligned 16.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void NV21ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* uv_buf,
                         uint8* dst_argb,
                         int width) {
    mov eax, [esp + 4 + 4]   // Y
    mov esi, [esp + 4 + 8]   // VU
    mov edx, [esp + 4 + 12]  // argb
    mov ecx, [esp + 4 + 16]  // width
    pcmpeqb xmm5, xmm5       // generate 0xffffffff for alpha

    // Step 3: Weave into ARGB
    punpcklbw xmm0, xmm1  // BG
    punpcklbw xmm2, xmm5  // RA
    punpcklwd xmm0, xmm2  // BGRA first 4 pixels
    punpckhwd xmm1, xmm2  // BGRA next 4 pixels
    movdqa [edx + 16], xmm1

// 8 pixels, unaligned.
// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
                                   uint8* dst_argb,
                                   int width) {
    mov eax, [esp + 8 + 4]   // Y
    mov esi, [esp + 8 + 8]   // U
    mov edi, [esp + 8 + 12]  // V
    mov edx, [esp + 8 + 16]  // argb
    mov ecx, [esp + 8 + 20]  // width
    pcmpeqb xmm5, xmm5       // generate 0xffffffff for alpha

    // Step 3: Weave into ARGB
    punpcklbw xmm0, xmm1  // BG
    punpcklbw xmm2, xmm5  // RA
    punpcklwd xmm0, xmm2  // BGRA first 4 pixels
    punpckhwd xmm1, xmm2  // BGRA next 4 pixels
    movdqu [edx + 16], xmm1

// 8 pixels, unaligned.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
                                   uint8* dst_argb,
                                   int width) {
    mov eax, [esp + 8 + 4]   // Y
    mov esi, [esp + 8 + 8]   // U
    mov edi, [esp + 8 + 12]  // V
    mov edx, [esp + 8 + 16]  // argb
    mov ecx, [esp + 8 + 20]  // width
    pcmpeqb xmm5, xmm5       // generate 0xffffffff for alpha

    // Step 3: Weave into ARGB
    punpcklbw xmm0, xmm1  // BG
    punpcklbw xmm2, xmm5  // RA
    punpcklwd xmm0, xmm2  // BGRA first 4 pixels
    punpckhwd xmm1, xmm2  // BGRA next 4 pixels
    movdqu [edx + 16], xmm1

// 8 pixels, unaligned.
// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// Similar to I420 but duplicate UV once more.
__declspec(naked) __declspec(align(16))
void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
                                   uint8* dst_argb,
                                   int width) {
    mov eax, [esp + 12 + 4]   // Y
    mov esi, [esp + 12 + 8]   // U
    mov edi, [esp + 12 + 12]  // V
    mov edx, [esp + 12 + 16]  // argb
    mov ecx, [esp + 12 + 20]  // width
    pcmpeqb xmm5, xmm5        // generate 0xffffffff for alpha
    READYUV411                // modifies EBX

    // Step 3: Weave into ARGB
    punpcklbw xmm0, xmm1  // BG
    punpcklbw xmm2, xmm5  // RA
    punpcklwd xmm0, xmm2  // BGRA first 4 pixels
    punpckhwd xmm1, xmm2  // BGRA next 4 pixels
    movdqu [edx + 16], xmm1
// 8 pixels, unaligned.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* uv_buf,
                                   uint8* dst_argb,
                                   int width) {
    mov eax, [esp + 4 + 4]   // Y
    mov esi, [esp + 4 + 8]   // UV
    mov edx, [esp + 4 + 12]  // argb
    mov ecx, [esp + 4 + 16]  // width
    pcmpeqb xmm5, xmm5       // generate 0xffffffff for alpha

    // Step 3: Weave into ARGB
    punpcklbw xmm0, xmm1  // BG
    punpcklbw xmm2, xmm5  // RA
    punpcklwd xmm0, xmm2  // BGRA first 4 pixels
    punpckhwd xmm1, xmm2  // BGRA next 4 pixels
    movdqu [edx + 16], xmm1

// 8 pixels, unaligned.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* uv_buf,
                                   uint8* dst_argb,
                                   int width) {
    mov eax, [esp + 4 + 4]   // Y
    mov esi, [esp + 4 + 8]   // VU
    mov edx, [esp + 4 + 12]  // argb
    mov ecx, [esp + 4 + 16]  // width
    pcmpeqb xmm5, xmm5       // generate 0xffffffff for alpha

    // Step 3: Weave into ARGB
    punpcklbw xmm0, xmm1  // BG
    punpcklbw xmm2, xmm5  // RA
    punpcklwd xmm0, xmm2  // BGRA first 4 pixels
    punpckhwd xmm1, xmm2  // BGRA next 4 pixels
    movdqu [edx + 16], xmm1
__declspec(naked) __declspec(align(16))
void I422ToBGRARow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* dst_bgra,
                         int width) {
    mov eax, [esp + 8 + 4]   // Y
    mov esi, [esp + 8 + 8]   // U
    mov edi, [esp + 8 + 12]  // V
    mov edx, [esp + 8 + 16]  // bgra
    mov ecx, [esp + 8 + 20]  // width

    // Step 3: Weave into BGRA
    pcmpeqb xmm5, xmm5    // generate 0xffffffff for alpha
    punpcklbw xmm1, xmm0  // GB
    punpcklbw xmm5, xmm2  // AR
    punpcklwd xmm5, xmm1  // BGRA first 4 pixels
    punpckhwd xmm0, xmm1  // BGRA next 4 pixels
    movdqa [edx + 16], xmm0

__declspec(naked) __declspec(align(16))
void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
                                   uint8* dst_bgra,
                                   int width) {
    mov eax, [esp + 8 + 4]   // Y
    mov esi, [esp + 8 + 8]   // U
    mov edi, [esp + 8 + 12]  // V
    mov edx, [esp + 8 + 16]  // bgra
    mov ecx, [esp + 8 + 20]  // width

    // Step 3: Weave into BGRA
    pcmpeqb xmm5, xmm5    // generate 0xffffffff for alpha
    punpcklbw xmm1, xmm0  // GB
    punpcklbw xmm5, xmm2  // AR
    punpcklwd xmm5, xmm1  // BGRA first 4 pixels
    punpckhwd xmm0, xmm1  // BGRA next 4 pixels
    movdqu [edx + 16], xmm0
2942 __declspec(naked) __declspec(align(16))
2943 void I422ToABGRRow_SSSE3(const uint8* y_buf,
2951 mov eax, [esp + 8 + 4] // Y
2952 mov esi, [esp + 8 + 8] // U
2953 mov edi, [esp + 8 + 12] // V
2954 mov edx, [esp + 8 + 16] // abgr
2955 mov ecx, [esp + 8 + 20] // width
2957 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2965 // Step 3: Weave into ARGB
2966 punpcklbw xmm2, xmm1 // RG
2967 punpcklbw xmm0, xmm5 // BA
2969 punpcklwd xmm2, xmm0 // RGBA first 4 pixels
2970 punpckhwd xmm1, xmm0 // RGBA next 4 pixels
2972 movdqa [edx + 16], xmm1
2983 __declspec(naked) __declspec(align(16))
2984 void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
2992 mov eax, [esp + 8 + 4] // Y
2993 mov esi, [esp + 8 + 8] // U
2994 mov edi, [esp + 8 + 12] // V
2995 mov edx, [esp + 8 + 16] // abgr
2996 mov ecx, [esp + 8 + 20] // width
2998 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
3006 // Step 3: Weave into ABGR
3007 punpcklbw xmm2, xmm1 // RG
3008 punpcklbw xmm0, xmm5 // BA
3010 punpcklwd xmm2, xmm0 // RGBA first 4 pixels
3011 punpckhwd xmm1, xmm0 // RGBA next 4 pixels
3013 movdqu [edx + 16], xmm1
3024 __declspec(naked) __declspec(align(16))
3025 void I422ToRGBARow_SSSE3(const uint8* y_buf,
3033 mov eax, [esp + 8 + 4] // Y
3034 mov esi, [esp + 8 + 8] // U
3035 mov edi, [esp + 8 + 12] // V
3036 mov edx, [esp + 8 + 16] // rgba
3037 mov ecx, [esp + 8 + 20] // width
3046 // Step 3: Weave into RGBA
3047 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
3048 punpcklbw xmm1, xmm2 // GR
3049 punpcklbw xmm5, xmm0 // AB
3051 punpcklwd xmm5, xmm1 // RGBA first 4 pixels
3052 punpckhwd xmm0, xmm1 // RGBA next 4 pixels
3054 movdqa [edx + 16], xmm0
3065 __declspec(naked) __declspec(align(16))
3066 void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
3074 mov eax, [esp + 8 + 4] // Y
3075 mov esi, [esp + 8 + 8] // U
3076 mov edi, [esp + 8 + 12] // V
3077 mov edx, [esp + 8 + 16] // rgba
3078 mov ecx, [esp + 8 + 20] // width
3087 // Step 3: Weave into RGBA
3088 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
3089 punpcklbw xmm1, xmm2 // GR
3090 punpcklbw xmm5, xmm0 // AB
3092 punpcklwd xmm5, xmm1 // RGBA first 4 pixels
3093 punpckhwd xmm0, xmm1 // RGBA next 4 pixels
3095 movdqu [edx + 16], xmm0
3106 #endif // HAS_I422TOARGBROW_SSSE3
3108 #ifdef HAS_YTOARGBROW_SSE2
3109 __declspec(naked) __declspec(align(16))
3110 void YToARGBRow_SSE2(const uint8* y_buf,
3115 pcmpeqb xmm4, xmm4 // generate mask 0xff000000
3119 pshufd xmm3, xmm3, 0
3120 mov eax, 0x004a004a // 74
3123 mov eax, [esp + 4] // Y
3124 mov edx, [esp + 8] // rgb
3125 mov ecx, [esp + 12] // width
3129 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
3130 movq xmm0, qword ptr [eax]
3132 punpcklbw xmm0, xmm5 // 0.Y
3136 packuswb xmm0, xmm0 // G
3138 // Step 2: Weave into ARGB
3139 punpcklbw xmm0, xmm0 // GG
3141 punpcklwd xmm0, xmm0 // BGRA first 4 pixels
3142 punpckhwd xmm1, xmm1 // BGRA next 4 pixels
3146 movdqa [edx + 16], xmm1
3154 #endif // HAS_YTOARGBROW_SSE2
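// Illustrative scalar sketch of the row above (a hypothetical helper added
// for exposition, not part of the original file; the same applies to every
// _Sketch function below). Assumes the constants the assembly loads: bias 16
// and 6 bit fixed point scale 74/64 ~= 1.164.
#if 0  // For exposition only; excluded from the build.
static void YToARGBRow_Sketch(const uint8* y_buf, uint8* rgb_buf, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    int y = y_buf[x] < 16 ? 0 : y_buf[x] - 16;  // psubusw clamps at zero.
    int g = (y * 74) >> 6;                      // (y - 16) * 1.164
    if (g > 255) g = 255;                       // packuswb saturates.
    rgb_buf[0] = (uint8)g;                      // B
    rgb_buf[1] = (uint8)g;                      // G
    rgb_buf[2] = (uint8)g;                      // R
    rgb_buf[3] = 255;                           // A forced to opaque.
    rgb_buf += 4;
  }
}
#endif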
3156 #ifdef HAS_MIRRORROW_SSSE3
3157 // Shuffle table for reversing the bytes.
3158 static const uvec8 kShuffleMirror = {
3159 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
3162 __declspec(naked) __declspec(align(16))
3163 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
3165 mov eax, [esp + 4] // src
3166 mov edx, [esp + 8] // dst
3167 mov ecx, [esp + 12] // width
3168 movdqa xmm5, kShuffleMirror
3173 movdqa xmm0, [eax + ecx]
3182 #endif // HAS_MIRRORROW_SSSE3
3184 #ifdef HAS_MIRRORROW_AVX2
3185 // Shuffle table for reversing the bytes.
3186 static const ulvec8 kShuffleMirror_AVX2 = {
3187 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u,
3188 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
3191 __declspec(naked) __declspec(align(16))
3192 void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
3194 mov eax, [esp + 4] // src
3195 mov edx, [esp + 8] // dst
3196 mov ecx, [esp + 12] // width
3197 vmovdqa ymm5, kShuffleMirror_AVX2
3202 vmovdqu ymm0, [eax + ecx]
3203 vpshufb ymm0, ymm0, ymm5
3204 vpermq ymm0, ymm0, 0x4e // swap high and low halves
3213 #endif // HAS_MIRRORROW_AVX2
3215 #ifdef HAS_MIRRORROW_SSE2
3216 // SSE2 version has movdqu so it can be used on unaligned buffers when SSSE3
3217 // version can not.
3218 __declspec(naked) __declspec(align(16))
3219 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
3221 mov eax, [esp + 4] // src
3222 mov edx, [esp + 8] // dst
3223 mov ecx, [esp + 12] // width
3228 movdqu xmm0, [eax + ecx]
3229 movdqa xmm1, xmm0 // swap bytes
3233 pshuflw xmm0, xmm0, 0x1b // swap words
3234 pshufhw xmm0, xmm0, 0x1b
3235 pshufd xmm0, xmm0, 0x4e // swap qwords
3243 #endif // HAS_MIRRORROW_SSE2
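// Scalar sketch of what the mirror rows compute (hypothetical, for
// exposition): the row is simply reversed byte by byte.
#if 0
static void MirrorRow_Sketch(const uint8* src, uint8* dst, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst[x] = src[width - 1 - x];
  }
}
#endif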
3245 #ifdef HAS_MIRRORROW_UV_SSSE3
3246 // Shuffle table for reversing the bytes of UV channels.
3247 static const uvec8 kShuffleMirrorUV = {
3248 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
3251 __declspec(naked) __declspec(align(16))
3252 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
3256 mov eax, [esp + 4 + 4] // src
3257 mov edx, [esp + 4 + 8] // dst_u
3258 mov edi, [esp + 4 + 12] // dst_v
3259 mov ecx, [esp + 4 + 16] // width
3260 movdqa xmm1, kShuffleMirrorUV
3261 lea eax, [eax + ecx * 2 - 16]
3270 movlpd qword ptr [edx], xmm0
3271 movhpd qword ptr [edx + edi], xmm0
3279 #endif // HAS_MIRRORROW_UV_SSSE3
3281 #ifdef HAS_ARGBMIRRORROW_SSSE3
3282 // Shuffle table for reversing the bytes.
3283 static const uvec8 kARGBShuffleMirror = {
3284 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
3287 __declspec(naked) __declspec(align(16))
3288 void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
3290 mov eax, [esp + 4] // src
3291 mov edx, [esp + 8] // dst
3292 mov ecx, [esp + 12] // width
3293 lea eax, [eax - 16 + ecx * 4] // last 4 pixels.
3294 movdqa xmm5, kARGBShuffleMirror
3308 #endif // HAS_ARGBMIRRORROW_SSSE3
3310 #ifdef HAS_ARGBMIRRORROW_AVX2
3311 // Shuffle table for reversing the bytes.
3312 static const ulvec32 kARGBShuffleMirror_AVX2 = {
3313 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
3316 __declspec(naked) __declspec(align(16))
3317 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
3319 mov eax, [esp + 4] // src
3320 mov edx, [esp + 8] // dst
3321 mov ecx, [esp + 12] // width
3323 vmovdqa ymm5, kARGBShuffleMirror_AVX2
3327 vpermd ymm0, ymm5, [eax + ecx * 4] // permute dword order
3336 #endif // HAS_ARGBMIRRORROW_AVX2
3338 #ifdef HAS_SPLITUVROW_SSE2
3339 __declspec(naked) __declspec(align(16))
3340 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
3343 mov eax, [esp + 4 + 4] // src_uv
3344 mov edx, [esp + 4 + 8] // dst_u
3345 mov edi, [esp + 4 + 12] // dst_v
3346 mov ecx, [esp + 4 + 16] // pix
3347 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
3354 movdqa xmm1, [eax + 16]
3358 pand xmm0, xmm5 // even bytes
3361 psrlw xmm2, 8 // odd bytes
3365 movdqa [edx + edi], xmm2
3375 __declspec(naked) __declspec(align(16))
3376 void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
3380 mov eax, [esp + 4 + 4] // src_uv
3381 mov edx, [esp + 4 + 8] // dst_u
3382 mov edi, [esp + 4 + 12] // dst_v
3383 mov ecx, [esp + 4 + 16] // pix
3384 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
3391 movdqu xmm1, [eax + 16]
3395 pand xmm0, xmm5 // even bytes
3398 psrlw xmm2, 8 // odd bytes
3402 movdqu [edx + edi], xmm2
3411 #endif // HAS_SPLITUVROW_SSE2
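// Scalar sketch of the SplitUV rows above (hypothetical, for exposition):
// even bytes of the interleaved UV plane go to U and odd bytes to V, which
// is what the 0x00ff00ff mask and the 8 bit shift select.
#if 0
static void SplitUVRow_Sketch(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                              int pix) {
  int x;
  for (x = 0; x < pix; ++x) {
    dst_u[x] = src_uv[0];  // even byte = U
    dst_v[x] = src_uv[1];  // odd byte = V
    src_uv += 2;
  }
}
#endif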
3413 #ifdef HAS_SPLITUVROW_AVX2
3414 __declspec(naked) __declspec(align(16))
3415 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
3418 mov eax, [esp + 4 + 4] // src_uv
3419 mov edx, [esp + 4 + 8] // dst_u
3420 mov edi, [esp + 4 + 12] // dst_v
3421 mov ecx, [esp + 4 + 16] // pix
3422 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3423 vpsrlw ymm5, ymm5, 8
3429 vmovdqu ymm1, [eax + 32]
3431 vpsrlw ymm2, ymm0, 8 // odd bytes
3432 vpsrlw ymm3, ymm1, 8
3433 vpand ymm0, ymm0, ymm5 // even bytes
3434 vpand ymm1, ymm1, ymm5
3435 vpackuswb ymm0, ymm0, ymm1
3436 vpackuswb ymm2, ymm2, ymm3
3437 vpermq ymm0, ymm0, 0xd8
3438 vpermq ymm2, ymm2, 0xd8
3440 vmovdqu [edx + edi], ymm2
3450 #endif // HAS_SPLITUVROW_AVX2
3452 #ifdef HAS_MERGEUVROW_SSE2
3453 __declspec(naked) __declspec(align(16))
3454 void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
3458 mov eax, [esp + 4 + 4] // src_u
3459 mov edx, [esp + 4 + 8] // src_v
3460 mov edi, [esp + 4 + 12] // dst_uv
3461 mov ecx, [esp + 4 + 16] // width
3466 movdqa xmm0, [eax] // read 16 U's
3467 movdqa xmm1, [eax + edx] // and 16 V's
3470 punpcklbw xmm0, xmm1 // first 8 UV pairs
3471 punpckhbw xmm2, xmm1 // next 8 UV pairs
3473 movdqa [edi + 16], xmm2
3483 __declspec(naked) __declspec(align(16))
3484 void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
3485 uint8* dst_uv, int width) {
3488 mov eax, [esp + 4 + 4] // src_u
3489 mov edx, [esp + 4 + 8] // src_v
3490 mov edi, [esp + 4 + 12] // dst_uv
3491 mov ecx, [esp + 4 + 16] // width
3496 movdqu xmm0, [eax] // read 16 U's
3497 movdqu xmm1, [eax + edx] // and 16 V's
3500 punpcklbw xmm0, xmm1 // first 8 UV pairs
3501 punpckhbw xmm2, xmm1 // next 8 UV pairs
3503 movdqu [edi + 16], xmm2
3512 #endif // HAS_MERGEUVROW_SSE2
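// Scalar sketch of MergeUV (hypothetical, for exposition): the inverse of
// SplitUV, interleaving one U and one V byte per output pair.
#if 0
static void MergeUVRow_Sketch(const uint8* src_u, const uint8* src_v,
                              uint8* dst_uv, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_uv[0] = src_u[x];
    dst_uv[1] = src_v[x];
    dst_uv += 2;
  }
}
#endif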
3514 #ifdef HAS_MERGEUVROW_AVX2
3515 __declspec(naked) __declspec(align(16))
3516 void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
3520 mov eax, [esp + 4 + 4] // src_u
3521 mov edx, [esp + 4 + 8] // src_v
3522 mov edi, [esp + 4 + 12] // dst_uv
3523 mov ecx, [esp + 4 + 16] // width
3528 vmovdqu ymm0, [eax] // read 32 U's
3529 vmovdqu ymm1, [eax + edx] // and 32 V's
3531 vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2
3532 vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3
3533 vperm2i128 ymm1, ymm2, ymm0, 0x20 // low 128 of ymm2 and low 128 of ymm0
3534 vperm2i128 ymm2, ymm2, ymm0, 0x31 // high 128 of ymm2 and high 128 of ymm0
3536 vmovdqu [edi + 32], ymm2
3546 #endif // HAS_MERGEUVROW_AVX2
3548 #ifdef HAS_COPYROW_SSE2
3549 // CopyRow copies 'count' bytes using 16 byte loads/stores, 32 bytes at a time.
3550 __declspec(naked) __declspec(align(16))
3551 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
3553 mov eax, [esp + 4] // src
3554 mov edx, [esp + 8] // dst
3555 mov ecx, [esp + 12] // count
3560 movdqa xmm1, [eax + 16]
3563 movdqa [edx + 16], xmm1
3570 #endif // HAS_COPYROW_SSE2
3572 // Unaligned. Any 'count' (multiple of 1 byte).
3573 __declspec(naked) __declspec(align(16))
3574 void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
3578 mov esi, [esp + 4] // src
3579 mov edi, [esp + 8] // dst
3580 mov ecx, [esp + 12] // count
3588 #ifdef HAS_COPYROW_X86
3589 __declspec(naked) __declspec(align(16))
3590 void CopyRow_X86(const uint8* src, uint8* dst, int count) {
3594 mov esi, [esp + 4] // src
3595 mov edi, [esp + 8] // dst
3596 mov ecx, [esp + 12] // count
3604 #endif // HAS_COPYROW_X86
3606 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
3608 __declspec(naked) __declspec(align(16))
3609 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
3611 mov eax, [esp + 4] // src
3612 mov edx, [esp + 8] // dst
3613 mov ecx, [esp + 12] // count
3614 pcmpeqb xmm0, xmm0 // generate mask 0xff000000
3616 pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
3622 movdqa xmm3, [eax + 16]
3625 movdqa xmm5, [edx + 16]
3633 movdqa [edx + 16], xmm3
3641 #endif // HAS_ARGBCOPYALPHAROW_SSE2
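// Scalar sketch of ARGBCopyAlphaRow (hypothetical, for exposition): only the
// alpha byte of each pixel is replaced, which is what the 0xff000000 and
// 0x00ffffff mask pair accomplishes above.
#if 0
static void ARGBCopyAlphaRow_Sketch(const uint8* src, uint8* dst, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst[x * 4 + 3] = src[x * 4 + 3];  // BGR bytes of dst are left intact.
  }
}
#endif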
3643 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
3645 __declspec(naked) __declspec(align(16))
3646 void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
3648 mov eax, [esp + 4] // src
3649 mov edx, [esp + 8] // dst
3650 mov ecx, [esp + 12] // count
3651 vpcmpeqb ymm0, ymm0, ymm0
3652 vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
3657 vmovdqu ymm2, [eax + 32]
3659 vpblendvb ymm1, ymm1, [edx], ymm0
3660 vpblendvb ymm2, ymm2, [edx + 32], ymm0
3662 vmovdqu [edx + 32], ymm2
3671 #endif // HAS_ARGBCOPYALPHAROW_AVX2
3673 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
3675 __declspec(naked) __declspec(align(16))
3676 void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
3678 mov eax, [esp + 4] // src
3679 mov edx, [esp + 8] // dst
3680 mov ecx, [esp + 12] // count
3681 pcmpeqb xmm0, xmm0 // generate mask 0xff000000
3683 pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
3688 movq xmm2, qword ptr [eax] // 8 Y's
3690 punpcklbw xmm2, xmm2
3691 punpckhwd xmm3, xmm2
3692 punpcklwd xmm2, xmm2
3694 movdqa xmm5, [edx + 16]
3702 movdqa [edx + 16], xmm3
3710 #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
3712 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
3714 __declspec(naked) __declspec(align(16))
3715 void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
3717 mov eax, [esp + 4] // src
3718 mov edx, [esp + 8] // dst
3719 mov ecx, [esp + 12] // count
3720 vpcmpeqb ymm0, ymm0, ymm0
3721 vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
3725 vpmovzxbd ymm1, qword ptr [eax]
3726 vpmovzxbd ymm2, qword ptr [eax + 8]
3728 vpslld ymm1, ymm1, 24
3729 vpslld ymm2, ymm2, 24
3730 vpblendvb ymm1, ymm1, [edx], ymm0
3731 vpblendvb ymm2, ymm2, [edx + 32], ymm0
3733 vmovdqu [edx + 32], ymm2
3742 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
3744 #ifdef HAS_SETROW_X86
3745 // SetRow8 writes 'count' bytes using a 32 bit value repeated.
3746 __declspec(naked) __declspec(align(16))
3747 void SetRow_X86(uint8* dst, uint32 v32, int count) {
3750 mov edi, [esp + 4] // dst
3751 mov eax, [esp + 8] // v32
3752 mov ecx, [esp + 12] // count
3760 // ARGBSetRows writes 'width' 32 bit values per row, repeated for 'height' rows.
3761 __declspec(naked) __declspec(align(16))
3762 void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
3763 int dst_stride, int height) {
3768 mov edi, [esp + 12 + 4] // dst
3769 mov eax, [esp + 12 + 8] // v32
3770 mov ebp, [esp + 12 + 12] // width
3771 mov edx, [esp + 12 + 16] // dst_stride
3772 mov esi, [esp + 12 + 20] // height
3774 sub edx, ecx // stride - width * 4
3790 #endif // HAS_SETROW_X86
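// Scalar sketch of ARGBSetRows_X86 (hypothetical, for exposition), showing
// why the assembly computes 'stride - width * 4': after storing 'width'
// dwords, the pointer must skip past the row padding.
#if 0
static void ARGBSetRows_Sketch(uint8* dst, uint32 v32, int width,
                               int dst_stride, int height) {
  int x, y;
  for (y = 0; y < height; ++y) {
    uint32* d = (uint32*)dst;
    for (x = 0; x < width; ++x) {
      d[x] = v32;            // rep stosd equivalent.
    }
    dst += dst_stride;       // width * 4 bytes of pixels plus padding.
  }
}
#endif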
3792 #ifdef HAS_YUY2TOYROW_AVX2
3793 __declspec(naked) __declspec(align(16))
3794 void YUY2ToYRow_AVX2(const uint8* src_yuy2,
3795 uint8* dst_y, int pix) {
3797 mov eax, [esp + 4] // src_yuy2
3798 mov edx, [esp + 8] // dst_y
3799 mov ecx, [esp + 12] // pix
3800 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3801 vpsrlw ymm5, ymm5, 8
3806 vmovdqu ymm1, [eax + 32]
3808 vpand ymm0, ymm0, ymm5 // even bytes are Y
3809 vpand ymm1, ymm1, ymm5
3810 vpackuswb ymm0, ymm0, ymm1 // mutates.
3811 vpermq ymm0, ymm0, 0xd8
3821 __declspec(naked) __declspec(align(16))
3822 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
3823 uint8* dst_u, uint8* dst_v, int pix) {
3827 mov eax, [esp + 8 + 4] // src_yuy2
3828 mov esi, [esp + 8 + 8] // stride_yuy2
3829 mov edx, [esp + 8 + 12] // dst_u
3830 mov edi, [esp + 8 + 16] // dst_v
3831 mov ecx, [esp + 8 + 20] // pix
3832 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3833 vpsrlw ymm5, ymm5, 8
3839 vmovdqu ymm1, [eax + 32]
3840 vpavgb ymm0, ymm0, [eax + esi]
3841 vpavgb ymm1, ymm1, [eax + esi + 32]
3843 vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
3844 vpsrlw ymm1, ymm1, 8
3845 vpackuswb ymm0, ymm0, ymm1 // mutates.
3846 vpermq ymm0, ymm0, 0xd8
3847 vpand ymm1, ymm0, ymm5 // U
3848 vpsrlw ymm0, ymm0, 8 // V
3849 vpackuswb ymm1, ymm1, ymm1 // mutates.
3850 vpackuswb ymm0, ymm0, ymm0 // mutates.
3851 vpermq ymm1, ymm1, 0xd8
3852 vpermq ymm0, ymm0, 0xd8
3853 vextractf128 [edx], ymm1, 0 // U
3854 vextractf128 [edx + edi], ymm0, 0 // V
3866 __declspec(naked) __declspec(align(16))
3867 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
3868 uint8* dst_u, uint8* dst_v, int pix) {
3871 mov eax, [esp + 4 + 4] // src_yuy2
3872 mov edx, [esp + 4 + 8] // dst_u
3873 mov edi, [esp + 4 + 12] // dst_v
3874 mov ecx, [esp + 4 + 16] // pix
3875 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3876 vpsrlw ymm5, ymm5, 8
3882 vmovdqu ymm1, [eax + 32]
3884 vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
3885 vpsrlw ymm1, ymm1, 8
3886 vpackuswb ymm0, ymm0, ymm1 // mutates.
3887 vpermq ymm0, ymm0, 0xd8
3888 vpand ymm1, ymm0, ymm5 // U
3889 vpsrlw ymm0, ymm0, 8 // V
3890 vpackuswb ymm1, ymm1, ymm1 // mutates.
3891 vpackuswb ymm0, ymm0, ymm0 // mutates.
3892 vpermq ymm1, ymm1, 0xd8
3893 vpermq ymm0, ymm0, 0xd8
3894 vextractf128 [edx], ymm1, 0 // U
3895 vextractf128 [edx + edi], ymm0, 0 // V
3906 __declspec(naked) __declspec(align(16))
3907 void UYVYToYRow_AVX2(const uint8* src_uyvy,
3908 uint8* dst_y, int pix) {
3910 mov eax, [esp + 4] // src_uyvy
3911 mov edx, [esp + 8] // dst_y
3912 mov ecx, [esp + 12] // pix
3917 vmovdqu ymm1, [eax + 32]
3919 vpsrlw ymm0, ymm0, 8 // odd bytes are Y
3920 vpsrlw ymm1, ymm1, 8
3921 vpackuswb ymm0, ymm0, ymm1 // mutates.
3922 vpermq ymm0, ymm0, 0xd8
3932 __declspec(naked) __declspec(align(16))
3933 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
3934 uint8* dst_u, uint8* dst_v, int pix) {
3938 mov eax, [esp + 8 + 4] // src_uyvy
3939 mov esi, [esp + 8 + 8] // stride_uyvy
3940 mov edx, [esp + 8 + 12] // dst_u
3941 mov edi, [esp + 8 + 16] // dst_v
3942 mov ecx, [esp + 8 + 20] // pix
3943 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3944 vpsrlw ymm5, ymm5, 8
3950 vmovdqu ymm1, [eax + 32]
3951 vpavgb ymm0, ymm0, [eax + esi]
3952 vpavgb ymm1, ymm1, [eax + esi + 32]
3954 vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
3955 vpand ymm1, ymm1, ymm5
3956 vpackuswb ymm0, ymm0, ymm1 // mutates.
3957 vpermq ymm0, ymm0, 0xd8
3958 vpand ymm1, ymm0, ymm5 // U
3959 vpsrlw ymm0, ymm0, 8 // V
3960 vpackuswb ymm1, ymm1, ymm1 // mutates.
3961 vpackuswb ymm0, ymm0, ymm0 // mutates.
3962 vpermq ymm1, ymm1, 0xd8
3963 vpermq ymm0, ymm0, 0xd8
3964 vextractf128 [edx], ymm1, 0 // U
3965 vextractf128 [edx + edi], ymm0, 0 // V
3977 __declspec(naked) __declspec(align(16))
3978 void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
3979 uint8* dst_u, uint8* dst_v, int pix) {
3982 mov eax, [esp + 4 + 4] // src_uyvy
3983 mov edx, [esp + 4 + 8] // dst_u
3984 mov edi, [esp + 4 + 12] // dst_v
3985 mov ecx, [esp + 4 + 16] // pix
3986 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3987 vpsrlw ymm5, ymm5, 8
3993 vmovdqu ymm1, [eax + 32]
3995 vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
3996 vpand ymm1, ymm1, ymm5
3997 vpackuswb ymm0, ymm0, ymm1 // mutates.
3998 vpermq ymm0, ymm0, 0xd8
3999 vpand ymm1, ymm0, ymm5 // U
4000 vpsrlw ymm0, ymm0, 8 // V
4001 vpackuswb ymm1, ymm1, ymm1 // mutates.
4002 vpackuswb ymm0, ymm0, ymm0 // mutates.
4003 vpermq ymm1, ymm1, 0xd8
4004 vpermq ymm0, ymm0, 0xd8
4005 vextractf128 [edx], ymm1, 0 // U
4006 vextractf128 [edx + edi], ymm0, 0 // V
4016 #endif // HAS_YUY2TOYROW_AVX2
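// Scalar sketch of the packed 4:2:2 handling above (hypothetical, for
// exposition). YUY2 is Y0 U Y1 V per 2 pixels; UYVY is U Y0 V Y1. The UV
// variants average chroma with the next row (pavgb rounds up); the UV422
// variants take chroma from a single row.
#if 0
static void YUY2ToUVRow_Sketch(const uint8* src_yuy2, int stride_yuy2,
                               uint8* dst_u, uint8* dst_v, int pix) {
  int x;
  for (x = 0; x < pix; x += 2) {
    dst_u[0] = (uint8)((src_yuy2[1] + src_yuy2[stride_yuy2 + 1] + 1) >> 1);
    dst_v[0] = (uint8)((src_yuy2[3] + src_yuy2[stride_yuy2 + 3] + 1) >> 1);
    src_yuy2 += 4;
    dst_u += 1;
    dst_v += 1;
  }
}
#endif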
4018 #ifdef HAS_YUY2TOYROW_SSE2
4019 __declspec(naked) __declspec(align(16))
4020 void YUY2ToYRow_SSE2(const uint8* src_yuy2,
4021 uint8* dst_y, int pix) {
4023 mov eax, [esp + 4] // src_yuy2
4024 mov edx, [esp + 8] // dst_y
4025 mov ecx, [esp + 12] // pix
4026 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
4032 movdqa xmm1, [eax + 16]
4034 pand xmm0, xmm5 // even bytes are Y
4045 __declspec(naked) __declspec(align(16))
4046 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
4047 uint8* dst_u, uint8* dst_v, int pix) {
4051 mov eax, [esp + 8 + 4] // src_yuy2
4052 mov esi, [esp + 8 + 8] // stride_yuy2
4053 mov edx, [esp + 8 + 12] // dst_u
4054 mov edi, [esp + 8 + 16] // dst_v
4055 mov ecx, [esp + 8 + 20] // pix
4056 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
4063 movdqa xmm1, [eax + 16]
4064 movdqa xmm2, [eax + esi]
4065 movdqa xmm3, [eax + esi + 16]
4069 psrlw xmm0, 8 // YUYV -> UVUV
4073 pand xmm0, xmm5 // U
4077 movq qword ptr [edx], xmm0
4078 movq qword ptr [edx + edi], xmm1
4089 __declspec(naked) __declspec(align(16))
4090 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
4091 uint8* dst_u, uint8* dst_v, int pix) {
4094 mov eax, [esp + 4 + 4] // src_yuy2
4095 mov edx, [esp + 4 + 8] // dst_u
4096 mov edi, [esp + 4 + 12] // dst_v
4097 mov ecx, [esp + 4 + 16] // pix
4098 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
4105 movdqa xmm1, [eax + 16]
4107 psrlw xmm0, 8 // YUYV -> UVUV
4111 pand xmm0, xmm5 // U
4115 movq qword ptr [edx], xmm0
4116 movq qword ptr [edx + edi], xmm1
4126 __declspec(naked) __declspec(align(16))
4127 void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
4128 uint8* dst_y, int pix) {
4130 mov eax, [esp + 4] // src_yuy2
4131 mov edx, [esp + 8] // dst_y
4132 mov ecx, [esp + 12] // pix
4133 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
4139 movdqu xmm1, [eax + 16]
4141 pand xmm0, xmm5 // even bytes are Y
4152 __declspec(naked) __declspec(align(16))
4153 void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
4154 uint8* dst_u, uint8* dst_v, int pix) {
4158 mov eax, [esp + 8 + 4] // src_yuy2
4159 mov esi, [esp + 8 + 8] // stride_yuy2
4160 mov edx, [esp + 8 + 12] // dst_u
4161 mov edi, [esp + 8 + 16] // dst_v
4162 mov ecx, [esp + 8 + 20] // pix
4163 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
4170 movdqu xmm1, [eax + 16]
4171 movdqu xmm2, [eax + esi]
4172 movdqu xmm3, [eax + esi + 16]
4176 psrlw xmm0, 8 // YUYV -> UVUV
4180 pand xmm0, xmm5 // U
4184 movq qword ptr [edx], xmm0
4185 movq qword ptr [edx + edi], xmm1
4196 __declspec(naked) __declspec(align(16))
4197 void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
4198 uint8* dst_u, uint8* dst_v, int pix) {
4201 mov eax, [esp + 4 + 4] // src_yuy2
4202 mov edx, [esp + 4 + 8] // dst_u
4203 mov edi, [esp + 4 + 12] // dst_v
4204 mov ecx, [esp + 4 + 16] // pix
4205 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
4212 movdqu xmm1, [eax + 16]
4214 psrlw xmm0, 8 // YUYV -> UVUV
4218 pand xmm0, xmm5 // U
4222 movq qword ptr [edx], xmm0
4223 movq qword ptr [edx + edi], xmm1
4233 __declspec(naked) __declspec(align(16))
4234 void UYVYToYRow_SSE2(const uint8* src_uyvy,
4235 uint8* dst_y, int pix) {
4237 mov eax, [esp + 4] // src_uyvy
4238 mov edx, [esp + 8] // dst_y
4239 mov ecx, [esp + 12] // pix
4244 movdqa xmm1, [eax + 16]
4246 psrlw xmm0, 8 // odd bytes are Y
4257 __declspec(naked) __declspec(align(16))
4258 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
4259 uint8* dst_u, uint8* dst_v, int pix) {
4263 mov eax, [esp + 8 + 4] // src_uyvy
4264 mov esi, [esp + 8 + 8] // stride_uyvy
4265 mov edx, [esp + 8 + 12] // dst_u
4266 mov edi, [esp + 8 + 16] // dst_v
4267 mov ecx, [esp + 8 + 20] // pix
4268 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
4275 movdqa xmm1, [eax + 16]
4276 movdqa xmm2, [eax + esi]
4277 movdqa xmm3, [eax + esi + 16]
4281 pand xmm0, xmm5 // UYVY -> UVUV
4285 pand xmm0, xmm5 // U
4289 movq qword ptr [edx], xmm0
4290 movq qword ptr [edx + edi], xmm1
4301 __declspec(naked) __declspec(align(16))
4302 void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
4303 uint8* dst_u, uint8* dst_v, int pix) {
4306 mov eax, [esp + 4 + 4] // src_uyvy
4307 mov edx, [esp + 4 + 8] // dst_u
4308 mov edi, [esp + 4 + 12] // dst_v
4309 mov ecx, [esp + 4 + 16] // pix
4310 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
4317 movdqa xmm1, [eax + 16]
4319 pand xmm0, xmm5 // UYVY -> UVUV
4323 pand xmm0, xmm5 // U
4327 movq qword ptr [edx], xmm0
4328 movq qword ptr [edx + edi], xmm1
4338 __declspec(naked) __declspec(align(16))
4339 void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
4340 uint8* dst_y, int pix) {
4342 mov eax, [esp + 4] // src_uyvy
4343 mov edx, [esp + 8] // dst_y
4344 mov ecx, [esp + 12] // pix
4349 movdqu xmm1, [eax + 16]
4351 psrlw xmm0, 8 // odd bytes are Y
4362 __declspec(naked) __declspec(align(16))
4363 void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
4364 uint8* dst_u, uint8* dst_v, int pix) {
4368 mov eax, [esp + 8 + 4] // src_uyvy
4369 mov esi, [esp + 8 + 8] // stride_uyvy
4370 mov edx, [esp + 8 + 12] // dst_u
4371 mov edi, [esp + 8 + 16] // dst_v
4372 mov ecx, [esp + 8 + 20] // pix
4373 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
4380 movdqu xmm1, [eax + 16]
4381 movdqu xmm2, [eax + esi]
4382 movdqu xmm3, [eax + esi + 16]
4386 pand xmm0, xmm5 // UYVY -> UVUV
4390 pand xmm0, xmm5 // U
4394 movq qword ptr [edx], xmm0
4395 movq qword ptr [edx + edi], xmm1
4406 __declspec(naked) __declspec(align(16))
4407 void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
4408 uint8* dst_u, uint8* dst_v, int pix) {
4411 mov eax, [esp + 4 + 4] // src_uyvy
4412 mov edx, [esp + 4 + 8] // dst_u
4413 mov edi, [esp + 4 + 12] // dst_v
4414 mov ecx, [esp + 4 + 16] // pix
4415 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
4422 movdqu xmm1, [eax + 16]
4424 pand xmm0, xmm5 // UYVY -> UVUV
4428 pand xmm0, xmm5 // U
4432 movq qword ptr [edx], xmm0
4433 movq qword ptr [edx + edi], xmm1
4442 #endif // HAS_YUY2TOYROW_SSE2
4444 #ifdef HAS_ARGBBLENDROW_SSE2
4445 // Blend 8 pixels at a time.
4446 __declspec(naked) __declspec(align(16))
4447 void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4448 uint8* dst_argb, int width) {
4451 mov eax, [esp + 4 + 4] // src_argb0
4452 mov esi, [esp + 4 + 8] // src_argb1
4453 mov edx, [esp + 4 + 12] // dst_argb
4454 mov ecx, [esp + 4 + 16] // width
4455 pcmpeqb xmm7, xmm7 // generate constant 1
4457 pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
4459 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
4461 pcmpeqb xmm4, xmm4 // generate mask 0xff000000
4465 je convertloop1 // only 1 pixel?
4468 // 1 pixel loop until destination pointer is aligned.
4470 test edx, 15 // aligned?
4474 movdqa xmm0, xmm3 // src argb
4475 pxor xmm3, xmm4 // ~alpha
4476 movd xmm2, [esi] // _r_b
4477 psrlw xmm3, 8 // alpha
4478 pshufhw xmm3, xmm3, 0F5h // 8 alpha words
4479 pshuflw xmm3, xmm3, 0F5h
4480 pand xmm2, xmm6 // _r_b
4481 paddw xmm3, xmm7 // 256 - alpha
4482 pmullw xmm2, xmm3 // _r_b * alpha
4483 movd xmm1, [esi] // _a_g
4485 psrlw xmm1, 8 // _a_g
4486 por xmm0, xmm4 // set alpha to 255
4487 pmullw xmm1, xmm3 // _a_g * alpha
4488 psrlw xmm2, 8 // _r_b convert to 8 bits again
4489 paddusb xmm0, xmm2 // + src argb
4490 pand xmm1, xmm5 // a_g_ convert to 8 bits again
4491 paddusb xmm0, xmm1 // + src argb
4503 movdqu xmm3, [eax] // src argb
4505 movdqa xmm0, xmm3 // src argb
4506 pxor xmm3, xmm4 // ~alpha
4507 movdqu xmm2, [esi] // _r_b
4508 psrlw xmm3, 8 // alpha
4509 pshufhw xmm3, xmm3, 0F5h // 8 alpha words
4510 pshuflw xmm3, xmm3, 0F5h
4511 pand xmm2, xmm6 // _r_b
4512 paddw xmm3, xmm7 // 256 - alpha
4513 pmullw xmm2, xmm3 // _r_b * alpha
4514 movdqu xmm1, [esi] // _a_g
4516 psrlw xmm1, 8 // _a_g
4517 por xmm0, xmm4 // set alpha to 255
4518 pmullw xmm1, xmm3 // _a_g * alpha
4519 psrlw xmm2, 8 // _r_b convert to 8 bits again
4520 paddusb xmm0, xmm2 // + src argb
4521 pand xmm1, xmm5 // a_g_ convert to 8 bits again
4522 paddusb xmm0, xmm1 // + src argb
4534 movd xmm3, [eax] // src argb
4536 movdqa xmm0, xmm3 // src argb
4537 pxor xmm3, xmm4 // ~alpha
4538 movd xmm2, [esi] // _r_b
4539 psrlw xmm3, 8 // alpha
4540 pshufhw xmm3, xmm3, 0F5h // 8 alpha words
4541 pshuflw xmm3, xmm3, 0F5h
4542 pand xmm2, xmm6 // _r_b
4543 paddw xmm3, xmm7 // 256 - alpha
4544 pmullw xmm2, xmm3 // _r_b * alpha
4545 movd xmm1, [esi] // _a_g
4547 psrlw xmm1, 8 // _a_g
4548 por xmm0, xmm4 // set alpha to 255
4549 pmullw xmm1, xmm3 // _a_g * alpha
4550 psrlw xmm2, 8 // _r_b convert to 8 bits again
4551 paddusb xmm0, xmm2 // + src argb
4552 pand xmm1, xmm5 // a_g_ convert to 8 bits again
4553 paddusb xmm0, xmm1 // + src argb
4564 #endif // HAS_ARGBBLENDROW_SSE2
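// Scalar sketch of the blend above (hypothetical, for exposition). The
// foreground is treated as premultiplied, so each channel computes
// dst = src + (dst * (256 - src_alpha)) >> 8, with saturation standing in
// for paddusb and the result alpha forced to opaque.
#if 0
static void ARGBBlendRow_Sketch(const uint8* src_argb0, const uint8* src_argb1,
                                uint8* dst_argb, int width) {
  int x, c;
  for (x = 0; x < width; ++x) {
    int ia = 256 - src_argb0[3];  // 255 - alpha, plus the constant 1 above.
    for (c = 0; c < 3; ++c) {     // blend B, G, R.
      int v = src_argb0[c] + ((src_argb1[c] * ia) >> 8);
      dst_argb[c] = (uint8)(v > 255 ? 255 : v);
    }
    dst_argb[3] = 255;            // set alpha to 255.
    src_argb0 += 4;
    src_argb1 += 4;
    dst_argb += 4;
  }
}
#endif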
4566 #ifdef HAS_ARGBBLENDROW_SSSE3
4567 // Shuffle table for isolating alpha.
4568 static const uvec8 kShuffleAlpha = {
4569 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
4570 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
4572 // Same as SSE2, but replaces:
4573 // psrlw xmm3, 8 // alpha
4574 // pshufhw xmm3, xmm3, 0F5h // 8 alpha words
4575 // pshuflw xmm3, xmm3, 0F5h
4576 // with this single shuffle:
4577 // pshufb xmm3, kShuffleAlpha // alpha
4578 // Blend 8 pixels at a time.
4580 __declspec(naked) __declspec(align(16))
4581 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
4582 uint8* dst_argb, int width) {
4585 mov eax, [esp + 4 + 4] // src_argb0
4586 mov esi, [esp + 4 + 8] // src_argb1
4587 mov edx, [esp + 4 + 12] // dst_argb
4588 mov ecx, [esp + 4 + 16] // width
4589 pcmpeqb xmm7, xmm7 // generate constant 0x0001
4591 pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
4593 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
4595 pcmpeqb xmm4, xmm4 // generate mask 0xff000000
4599 je convertloop1 // only 1 pixel?
4602 // 1 pixel loop until destination pointer is aligned.
4604 test edx, 15 // aligned?
4608 movdqa xmm0, xmm3 // src argb
4609 pxor xmm3, xmm4 // ~alpha
4610 movd xmm2, [esi] // _r_b
4611 pshufb xmm3, kShuffleAlpha // alpha
4612 pand xmm2, xmm6 // _r_b
4613 paddw xmm3, xmm7 // 256 - alpha
4614 pmullw xmm2, xmm3 // _r_b * alpha
4615 movd xmm1, [esi] // _a_g
4617 psrlw xmm1, 8 // _a_g
4618 por xmm0, xmm4 // set alpha to 255
4619 pmullw xmm1, xmm3 // _a_g * alpha
4620 psrlw xmm2, 8 // _r_b convert to 8 bits again
4621 paddusb xmm0, xmm2 // + src argb
4622 pand xmm1, xmm5 // a_g_ convert to 8 bits again
4623 paddusb xmm0, xmm1 // + src argb
4633 test eax, 15 // unaligned?
4635 test esi, 15 // unaligned?
4640 movdqa xmm3, [eax] // src argb
4642 movdqa xmm0, xmm3 // src argb
4643 pxor xmm3, xmm4 // ~alpha
4644 movdqa xmm2, [esi] // _r_b
4645 pshufb xmm3, kShuffleAlpha // alpha
4646 pand xmm2, xmm6 // _r_b
4647 paddw xmm3, xmm7 // 256 - alpha
4648 pmullw xmm2, xmm3 // _r_b * alpha
4649 movdqa xmm1, [esi] // _a_g
4651 psrlw xmm1, 8 // _a_g
4652 por xmm0, xmm4 // set alpha to 255
4653 pmullw xmm1, xmm3 // _a_g * alpha
4654 psrlw xmm2, 8 // _r_b convert to 8 bits again
4655 paddusb xmm0, xmm2 // + src argb
4656 pand xmm1, xmm5 // a_g_ convert to 8 bits again
4657 paddusb xmm0, xmm1 // + src argb
4664 // 4 pixel unaligned loop.
4666 movdqu xmm3, [eax] // src argb
4668 movdqa xmm0, xmm3 // src argb
4669 pxor xmm3, xmm4 // ~alpha
4670 movdqu xmm2, [esi] // _r_b
4671 pshufb xmm3, kShuffleAlpha // alpha
4672 pand xmm2, xmm6 // _r_b
4673 paddw xmm3, xmm7 // 256 - alpha
4674 pmullw xmm2, xmm3 // _r_b * alpha
4675 movdqu xmm1, [esi] // _a_g
4677 psrlw xmm1, 8 // _a_g
4678 por xmm0, xmm4 // set alpha to 255
4679 pmullw xmm1, xmm3 // _a_g * alpha
4680 psrlw xmm2, 8 // _r_b convert to 8 bits again
4681 paddusb xmm0, xmm2 // + src argb
4682 pand xmm1, xmm5 // a_g_ convert to 8 bits again
4683 paddusb xmm0, xmm1 // + src argb
4695 movd xmm3, [eax] // src argb
4697 movdqa xmm0, xmm3 // src argb
4698 pxor xmm3, xmm4 // ~alpha
4699 movd xmm2, [esi] // _r_b
4700 pshufb xmm3, kShuffleAlpha // alpha
4701 pand xmm2, xmm6 // _r_b
4702 paddw xmm3, xmm7 // 256 - alpha
4703 pmullw xmm2, xmm3 // _r_b * alpha
4704 movd xmm1, [esi] // _a_g
4706 psrlw xmm1, 8 // _a_g
4707 por xmm0, xmm4 // set alpha to 255
4708 pmullw xmm1, xmm3 // _a_g * alpha
4709 psrlw xmm2, 8 // _r_b convert to 8 bits again
4710 paddusb xmm0, xmm2 // + src argb
4711 pand xmm1, xmm5 // a_g_ convert to 8 bits again
4712 paddusb xmm0, xmm1 // + src argb
4723 #endif // HAS_ARGBBLENDROW_SSSE3
4725 #ifdef HAS_ARGBATTENUATEROW_SSE2
4726 // Attenuate 4 pixels at a time.
4727 // Aligned to 16 bytes.
4728 __declspec(naked) __declspec(align(16))
4729 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
4731 mov eax, [esp + 4] // src_argb0
4732 mov edx, [esp + 8] // dst_argb
4733 mov ecx, [esp + 12] // width
4734 pcmpeqb xmm4, xmm4 // generate mask 0xff000000
4736 pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff
4741 movdqa xmm0, [eax] // read 4 pixels
4742 punpcklbw xmm0, xmm0 // first 2
4743 pshufhw xmm2, xmm0, 0FFh // 8 alpha words
4744 pshuflw xmm2, xmm2, 0FFh
4745 pmulhuw xmm0, xmm2 // rgb * a
4746 movdqa xmm1, [eax] // read 4 pixels
4747 punpckhbw xmm1, xmm1 // next 2 pixels
4748 pshufhw xmm2, xmm1, 0FFh // 8 alpha words
4749 pshuflw xmm2, xmm2, 0FFh
4750 pmulhuw xmm1, xmm2 // rgb * a
4751 movdqa xmm2, [eax] // alphas
4757 pand xmm0, xmm5 // keep original alphas
4767 #endif // HAS_ARGBATTENUATEROW_SSE2
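// Scalar sketch of attenuation (hypothetical, for exposition). Unpacking a
// byte against itself turns each byte c into the word c * 257, so the
// pmulhuw plus the 8 bit shift above compute (c * 257 * a * 257) >> 24,
// a close approximation of c * a / 255.
#if 0
static void ARGBAttenuateRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                    int width) {
  int x, c;
  for (x = 0; x < width; ++x) {
    int a = src_argb[3];
    for (c = 0; c < 3; ++c) {
      uint32 v = (uint32)(src_argb[c] * 257) * (uint32)(a * 257);
      dst_argb[c] = (uint8)(v >> 24);
    }
    dst_argb[3] = (uint8)a;  // original alpha is or'ed back in.
    src_argb += 4;
    dst_argb += 4;
  }
}
#endif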
4769 #ifdef HAS_ARGBATTENUATEROW_SSSE3
4770 // Shuffle table duplicating alpha.
4771 static const uvec8 kShuffleAlpha0 = {
4772 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
4774 static const uvec8 kShuffleAlpha1 = {
4775 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
4776 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
4778 __declspec(naked) __declspec(align(16))
4779 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
4781 mov eax, [esp + 4] // src_argb0
4782 mov edx, [esp + 8] // dst_argb
4783 mov ecx, [esp + 12] // width
4784 pcmpeqb xmm3, xmm3 // generate mask 0xff000000
4786 movdqa xmm4, kShuffleAlpha0
4787 movdqa xmm5, kShuffleAlpha1
4791 movdqu xmm0, [eax] // read 4 pixels
4792 pshufb xmm0, xmm4 // isolate first 2 alphas
4793 movdqu xmm1, [eax] // read 4 pixels
4794 punpcklbw xmm1, xmm1 // first 2 pixel rgbs
4795 pmulhuw xmm0, xmm1 // rgb * a
4796 movdqu xmm1, [eax] // read 4 pixels
4797 pshufb xmm1, xmm5 // isolate next 2 alphas
4798 movdqu xmm2, [eax] // read 4 pixels
4799 punpckhbw xmm2, xmm2 // next 2 pixel rgbs
4800 pmulhuw xmm1, xmm2 // rgb * a
4801 movdqu xmm2, [eax] // mask original alpha
4807 por xmm0, xmm2 // copy original alpha
4816 #endif // HAS_ARGBATTENUATEROW_SSSE3
4818 #ifdef HAS_ARGBATTENUATEROW_AVX2
4819 // Shuffle table duplicating alpha.
4820 static const ulvec8 kShuffleAlpha_AVX2 = {
4821 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
4822 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
4823 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
4824 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
4826 __declspec(naked) __declspec(align(16))
4827 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
4829 mov eax, [esp + 4] // src_argb0
4830 mov edx, [esp + 8] // dst_argb
4831 mov ecx, [esp + 12] // width
4833 vmovdqa ymm4, kShuffleAlpha_AVX2
4834 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
4835 vpslld ymm5, ymm5, 24
4839 vmovdqu ymm6, [eax] // read 8 pixels.
4840 vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
4841 vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
4842 vpshufb ymm2, ymm0, ymm4 // low 4 alphas
4843 vpshufb ymm3, ymm1, ymm4 // high 4 alphas
4844 vpmulhuw ymm0, ymm0, ymm2 // rgb * a
4845 vpmulhuw ymm1, ymm1, ymm3 // rgb * a
4846 vpand ymm6, ymm6, ymm5 // isolate alpha
4847 vpsrlw ymm0, ymm0, 8
4848 vpsrlw ymm1, ymm1, 8
4849 vpackuswb ymm0, ymm0, ymm1 // unmutated.
4850 vpor ymm0, ymm0, ymm6 // copy original alpha
4852 vmovdqu [eax + edx], ymm0
4860 #endif // HAS_ARGBATTENUATEROW_AVX2
4862 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
4863 // Unattenuate 4 pixels at a time.
4864 // Aligned to 16 bytes.
4865 __declspec(naked) __declspec(align(16))
4866 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
4871 mov eax, [esp + 8 + 4] // src_argb0
4872 mov edx, [esp + 8 + 8] // dst_argb
4873 mov ecx, [esp + 8 + 12] // width
4877 movdqu xmm0, [eax] // read 4 pixels
4878 movzx esi, byte ptr [eax + 3] // first alpha
4879 movzx edi, byte ptr [eax + 7] // second alpha
4880 punpcklbw xmm0, xmm0 // first 2
4881 movd xmm2, dword ptr fixed_invtbl8[esi * 4]
4882 movd xmm3, dword ptr fixed_invtbl8[edi * 4]
4883 pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a
4884 pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
4886 pmulhuw xmm0, xmm2 // rgb * a
4888 movdqu xmm1, [eax] // read 4 pixels
4889 movzx esi, byte ptr [eax + 11] // third alpha
4890 movzx edi, byte ptr [eax + 15] // fourth alpha
4891 punpckhbw xmm1, xmm1 // next 2
4892 movd xmm2, dword ptr fixed_invtbl8[esi * 4]
4893 movd xmm3, dword ptr fixed_invtbl8[edi * 4]
4894 pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words
4895 pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
4897 pmulhuw xmm1, xmm2 // rgb * a
4910 #endif // HAS_ARGBUNATTENUATEROW_SSE2
4912 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
4913 // Shuffle table duplicating alpha.
4914 static const ulvec8 kUnattenShuffleAlpha_AVX2 = {
4915 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u,
4916 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u,
4918 // TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
4919 // USE_GATHER is not on by default, due to being a slow instruction.
4921 __declspec(naked) __declspec(align(16))
4922 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
4925 mov eax, [esp + 4] // src_argb0
4926 mov edx, [esp + 8] // dst_argb
4927 mov ecx, [esp + 12] // width
4929 vmovdqa ymm4, kUnattenShuffleAlpha_AVX2
4933 vmovdqu ymm6, [eax] // read 8 pixels.
4934 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather.
4935 vpsrld ymm2, ymm6, 24 // alpha in low 8 bits.
4936 vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
4937 vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
4938 vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a
4939 vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
4940 vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
4941 vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a
4942 vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas
4943 vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
4944 vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
4945 vpackuswb ymm0, ymm0, ymm1 // unmutated.
4947 vmovdqu [eax + edx], ymm0
4956 __declspec(naked) __declspec(align(16))
4957 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
4961 mov eax, [esp + 4] // src_argb0
4962 mov edx, [esp + 8] // dst_argb
4963 mov ecx, [esp + 12] // width
4965 vmovdqa ymm5, kUnattenShuffleAlpha_AVX2
4973 movzx esi, byte ptr [eax + 3] // alpha0
4974 movzx edi, byte ptr [eax + 7] // alpha1
4975 vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a0]
4976 vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a1]
4977 movzx esi, byte ptr [eax + 11] // alpha2
4978 movzx edi, byte ptr [eax + 15] // alpha3
4979 vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0]
4980 vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a2]
4981 vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a3]
4982 movzx esi, byte ptr [eax + 19] // alpha4
4983 movzx edi, byte ptr [eax + 23] // alpha5
4984 vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2]
4985 vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a4]
4986 vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a5]
4987 movzx esi, byte ptr [eax + 27] // alpha6
4988 movzx edi, byte ptr [eax + 31] // alpha7
4989 vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4]
4990 vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a6]
4991 vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a7]
4992 vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6]
4993 vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0]
4994 vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4]
4995 vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
4998 vmovdqu ymm6, [eax] // read 8 pixels.
4999 vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
5000 vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
5001 vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
5002 vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
5003 vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a
5004 vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas
5005 vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
5006 vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
5007 vpackuswb ymm0, ymm0, ymm1 // unmutated.
5009 vmovdqu [eax + edx], ymm0
5019 #endif // USE_GATHER
5020 #endif // HAS_ARGBATTENUATEROW_AVX2
5022 #ifdef HAS_ARGBGRAYROW_SSSE3
5023 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
5024 __declspec(naked) __declspec(align(16))
5025 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
5027 mov eax, [esp + 4] /* src_argb */
5028 mov edx, [esp + 8] /* dst_argb */
5029 mov ecx, [esp + 12] /* width */
5030 movdqa xmm4, kARGBToYJ
5031 movdqa xmm5, kAddYJ64
5035 movdqa xmm0, [eax] // G
5036 movdqa xmm1, [eax + 16]
5037 pmaddubsw xmm0, xmm4
5038 pmaddubsw xmm1, xmm4
5040 paddw xmm0, xmm5 // Add .5 for rounding.
5042 packuswb xmm0, xmm0 // 8 G bytes
5043 movdqa xmm2, [eax] // A
5044 movdqa xmm3, [eax + 16]
5049 packuswb xmm2, xmm2 // 8 A bytes
5050 movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA
5051 punpcklbw xmm0, xmm0 // 8 GG words
5052 punpcklbw xmm3, xmm2 // 8 GA words
5054 punpcklwd xmm0, xmm3 // GGGA first 4
5055 punpckhwd xmm1, xmm3 // GGGA next 4
5058 movdqa [edx + 16], xmm1
5064 #endif // HAS_ARGBGRAYROW_SSSE3
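// Scalar sketch of the gray conversion (hypothetical, for exposition),
// using the kARGBToYJ weights and kAddYJ64 rounding referenced above:
// Y = (B * 15 + G * 75 + R * 38 + 64) >> 7, replicated to B, G and R.
#if 0
static void ARGBGrayRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                               int width) {
  int x;
  for (x = 0; x < width; ++x) {
    int y = (src_argb[0] * 15 + src_argb[1] * 75 + src_argb[2] * 38 + 64) >> 7;
    dst_argb[0] = (uint8)y;     // B
    dst_argb[1] = (uint8)y;     // G
    dst_argb[2] = (uint8)y;     // R
    dst_argb[3] = src_argb[3];  // alpha passes through.
    src_argb += 4;
    dst_argb += 4;
  }
}
#endif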
5066 #ifdef HAS_ARGBSEPIAROW_SSSE3
5067 // b = (r * 35 + g * 68 + b * 17) >> 7
5068 // g = (r * 45 + g * 88 + b * 22) >> 7
5069 // r = (r * 50 + g * 98 + b * 24) >> 7
5070 // Constant for ARGB color to sepia tone.
5071 static const vec8 kARGBToSepiaB = {
5072 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
5075 static const vec8 kARGBToSepiaG = {
5076 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
5079 static const vec8 kARGBToSepiaR = {
5080 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
5083 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
5084 __declspec(naked) __declspec(align(16))
5085 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
5087 mov eax, [esp + 4] /* dst_argb */
5088 mov ecx, [esp + 8] /* width */
5089 movdqa xmm2, kARGBToSepiaB
5090 movdqa xmm3, kARGBToSepiaG
5091 movdqa xmm4, kARGBToSepiaR
5095 movdqa xmm0, [eax] // B
5096 movdqa xmm6, [eax + 16]
5097 pmaddubsw xmm0, xmm2
5098 pmaddubsw xmm6, xmm2
5101 packuswb xmm0, xmm0 // 8 B values
5102 movdqa xmm5, [eax] // G
5103 movdqa xmm1, [eax + 16]
5104 pmaddubsw xmm5, xmm3
5105 pmaddubsw xmm1, xmm3
5108 packuswb xmm5, xmm5 // 8 G values
5109 punpcklbw xmm0, xmm5 // 8 BG values
5110 movdqa xmm5, [eax] // R
5111 movdqa xmm1, [eax + 16]
5112 pmaddubsw xmm5, xmm4
5113 pmaddubsw xmm1, xmm4
5116 packuswb xmm5, xmm5 // 8 R values
5117 movdqa xmm6, [eax] // A
5118 movdqa xmm1, [eax + 16]
5122 packuswb xmm6, xmm6 // 8 A values
5123 punpcklbw xmm5, xmm6 // 8 RA values
5124 movdqa xmm1, xmm0 // Weave BG, RA together
5125 punpcklwd xmm0, xmm5 // BGRA first 4
5126 punpckhwd xmm1, xmm5 // BGRA next 4
5129 movdqa [eax + 16], xmm1
5135 #endif // HAS_ARGBSEPIAROW_SSSE3
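// Scalar sketch of the sepia transform (hypothetical, for exposition),
// directly applying the three weight rows listed above; packuswb provides
// the saturation the explicit clamps stand in for.
#if 0
static void ARGBSepiaRow_Sketch(uint8* dst_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    int b = dst_argb[0];
    int g = dst_argb[1];
    int r = dst_argb[2];
    int sb = (b * 17 + g * 68 + r * 35) >> 7;
    int sg = (b * 22 + g * 88 + r * 45) >> 7;
    int sr = (b * 24 + g * 98 + r * 50) >> 7;
    dst_argb[0] = (uint8)(sb > 255 ? 255 : sb);
    dst_argb[1] = (uint8)(sg > 255 ? 255 : sg);
    dst_argb[2] = (uint8)(sr > 255 ? 255 : sr);  // alpha byte untouched.
    dst_argb += 4;
  }
}
#endif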
5137 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
5138 // Transform 8 ARGB pixels (32 bytes) with color matrix.
5139 // Same as Sepia except matrix is provided.
5140 // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
5141 // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
5142 __declspec(naked) __declspec(align(16))
5143 void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
5144 const int8* matrix_argb, int width) {
5146 mov eax, [esp + 4] /* src_argb */
5147 mov edx, [esp + 8] /* dst_argb */
5148 mov ecx, [esp + 12] /* matrix_argb */
5150 pshufd xmm2, xmm5, 0x00
5151 pshufd xmm3, xmm5, 0x55
5152 pshufd xmm4, xmm5, 0xaa
5153 pshufd xmm5, xmm5, 0xff
5154 mov ecx, [esp + 16] /* width */
5158 movdqa xmm0, [eax] // B
5159 movdqa xmm7, [eax + 16]
5160 pmaddubsw xmm0, xmm2
5161 pmaddubsw xmm7, xmm2
5162 movdqa xmm6, [eax] // G
5163 movdqa xmm1, [eax + 16]
5164 pmaddubsw xmm6, xmm3
5165 pmaddubsw xmm1, xmm3
5166 phaddsw xmm0, xmm7 // B
5167 phaddsw xmm6, xmm1 // G
5170 packuswb xmm0, xmm0 // 8 B values
5171 packuswb xmm6, xmm6 // 8 G values
5172 punpcklbw xmm0, xmm6 // 8 BG values
5173 movdqa xmm1, [eax] // R
5174 movdqa xmm7, [eax + 16]
5175 pmaddubsw xmm1, xmm4
5176 pmaddubsw xmm7, xmm4
5177 phaddsw xmm1, xmm7 // R
5178 movdqa xmm6, [eax] // A
5179 movdqa xmm7, [eax + 16]
5180 pmaddubsw xmm6, xmm5
5181 pmaddubsw xmm7, xmm5
5182 phaddsw xmm6, xmm7 // A
5185 packuswb xmm1, xmm1 // 8 R values
5186 packuswb xmm6, xmm6 // 8 A values
5187 punpcklbw xmm1, xmm6 // 8 RA values
5188 movdqa xmm6, xmm0 // Weave BG, RA together
5189 punpcklwd xmm0, xmm1 // BGRA first 4
5190 punpckhwd xmm6, xmm1 // BGRA next 4
5193 movdqa [edx + 16], xmm6
5200 #endif // HAS_ARGBCOLORMATRIXROW_SSSE3
5202 #ifdef HAS_ARGBQUANTIZEROW_SSE2
5203 // Quantize 4 ARGB pixels (16 bytes).
5204 // Aligned to 16 bytes.
5205 __declspec(naked) __declspec(align(16))
5206 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
5207 int interval_offset, int width) {
5209 mov eax, [esp + 4] /* dst_argb */
5210 movd xmm2, [esp + 8] /* scale */
5211 movd xmm3, [esp + 12] /* interval_size */
5212 movd xmm4, [esp + 16] /* interval_offset */
5213 mov ecx, [esp + 20] /* width */
5214 pshuflw xmm2, xmm2, 040h
5215 pshufd xmm2, xmm2, 044h
5216 pshuflw xmm3, xmm3, 040h
5217 pshufd xmm3, xmm3, 044h
5218 pshuflw xmm4, xmm4, 040h
5219 pshufd xmm4, xmm4, 044h
5220 pxor xmm5, xmm5 // constant 0
5221 pcmpeqb xmm6, xmm6 // generate mask 0xff000000
5226 movdqa xmm0, [eax] // read 4 pixels
5227 punpcklbw xmm0, xmm5 // first 2 pixels
5228 pmulhuw xmm0, xmm2 // pixel * scale >> 16
5229 movdqa xmm1, [eax] // read 4 pixels
5230 punpckhbw xmm1, xmm5 // next 2 pixels
5232 pmullw xmm0, xmm3 // * interval_size
5233 movdqa xmm7, [eax] // read 4 pixels
5235 pand xmm7, xmm6 // mask alpha
5236 paddw xmm0, xmm4 // + interval_offset
5247 #endif // HAS_ARGBQUANTIZEROW_SSE2
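// Scalar sketch of quantization (hypothetical, for exposition): each color
// channel is snapped to a bucket via
// v = (v * scale >> 16) * interval_size + interval_offset, alpha preserved.
#if 0
static void ARGBQuantizeRow_Sketch(uint8* dst_argb, int scale,
                                   int interval_size, int interval_offset,
                                   int width) {
  int x, c;
  for (x = 0; x < width; ++x) {
    for (c = 0; c < 3; ++c) {
      int v = dst_argb[c];
      dst_argb[c] = (uint8)(((v * scale) >> 16) * interval_size +
                            interval_offset);
    }
    dst_argb += 4;  // alpha (byte 3) is masked off and restored above.
  }
}
#endif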
5249 #ifdef HAS_ARGBSHADEROW_SSE2
5250 // Shade 4 pixels at a time by specified value.
5251 // Aligned to 16 bytes.
5252 __declspec(naked) __declspec(align(16))
5253 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
5256 mov eax, [esp + 4] // src_argb
5257 mov edx, [esp + 8] // dst_argb
5258 mov ecx, [esp + 12] // width
5259 movd xmm2, [esp + 16] // value
5260 punpcklbw xmm2, xmm2
5261 punpcklqdq xmm2, xmm2
5265 movdqa xmm0, [eax] // read 4 pixels
5268 punpcklbw xmm0, xmm0 // first 2
5269 punpckhbw xmm1, xmm1 // next 2
5270 pmulhuw xmm0, xmm2 // argb * value
5271 pmulhuw xmm1, xmm2 // argb * value
5283 #endif // HAS_ARGBSHADEROW_SSE2
5285 #ifdef HAS_ARGBMULTIPLYROW_SSE2
5286 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
5287 __declspec(naked) __declspec(align(16))
5288 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
5289 uint8* dst_argb, int width) {
5292 mov eax, [esp + 4 + 4] // src_argb0
5293 mov esi, [esp + 4 + 8] // src_argb1
5294 mov edx, [esp + 4 + 12] // dst_argb
5295 mov ecx, [esp + 4 + 16] // width
5296 pxor xmm5, xmm5 // constant 0
5300 movdqu xmm0, [eax] // read 4 pixels from src_argb0
5301 movdqu xmm2, [esi] // read 4 pixels from src_argb1
5304 punpcklbw xmm0, xmm0 // first 2
5305 punpckhbw xmm1, xmm1 // next 2
5306 punpcklbw xmm2, xmm5 // first 2
5307 punpckhbw xmm3, xmm5 // next 2
5308 pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2
5309 pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2
5322 #endif // HAS_ARGBMULTIPLYROW_SSE2
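// Scalar sketch of the multiply (hypothetical, for exposition). The pmulhuw
// trick again: punpcklbw of a register with itself turns each byte c0 into
// the word c0 * 257, so (c0 * 257 * c1) >> 16 approximates c0 * c1 / 255
// per channel.
#if 0
static void ARGBMultiplyRow_Sketch(const uint8* src_argb0,
                                   const uint8* src_argb1,
                                   uint8* dst_argb, int width) {
  int i;
  for (i = 0; i < width * 4; ++i) {  // all 4 channels, alpha included.
    dst_argb[i] = (uint8)((src_argb0[i] * 257 * src_argb1[i]) >> 16);
  }
}
#endif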
5324 #ifdef HAS_ARGBADDROW_SSE2
5325 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
5326 // TODO(fbarchard): Port this to posix, neon and other math functions.
5327 __declspec(naked) __declspec(align(16))
5328 void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
5329 uint8* dst_argb, int width) {
5332 mov eax, [esp + 4 + 4] // src_argb0
5333 mov esi, [esp + 4 + 8] // src_argb1
5334 mov edx, [esp + 4 + 12] // dst_argb
5335 mov ecx, [esp + 4 + 16] // width
5342 movdqu xmm0, [eax] // read 4 pixels from src_argb0
5344 movdqu xmm1, [esi] // read 4 pixels from src_argb1
5346 paddusb xmm0, xmm1 // src_argb0 + src_argb1
5357 movd xmm0, [eax] // read 1 pixel from src_argb0
5359 movd xmm1, [esi] // read 1 pixel from src_argb1
5361 paddusb xmm0, xmm1 // src_argb0 + src_argb1
5372 #endif // HAS_ARGBADDROW_SSE2
5374 #ifdef HAS_ARGBSUBTRACTROW_SSE2
5375 // Subtract one row of ARGB pixels from another, 4 pixels at a time.
5376 __declspec(naked) __declspec(align(16))
5377 void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
5378 uint8* dst_argb, int width) {
5381 mov eax, [esp + 4 + 4] // src_argb0
5382 mov esi, [esp + 4 + 8] // src_argb1
5383 mov edx, [esp + 4 + 12] // dst_argb
5384 mov ecx, [esp + 4 + 16] // width
5388 movdqu xmm0, [eax] // read 4 pixels from src_argb0
5390 movdqu xmm1, [esi] // read 4 pixels from src_argb1
5392 psubusb xmm0, xmm1 // src_argb0 - src_argb1
5402 #endif // HAS_ARGBSUBTRACTROW_SSE2
5404 #ifdef HAS_ARGBMULTIPLYROW_AVX2
5405 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
5406 __declspec(naked) __declspec(align(16))
5407 void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
5408 uint8* dst_argb, int width) {
5411 mov eax, [esp + 4 + 4] // src_argb0
5412 mov esi, [esp + 4 + 8] // src_argb1
5413 mov edx, [esp + 4 + 12] // dst_argb
5414 mov ecx, [esp + 4 + 16] // width
5415 vpxor ymm5, ymm5, ymm5 // constant 0
5419 vmovdqu ymm1, [eax] // read 8 pixels from src_argb0
5421 vmovdqu ymm3, [esi] // read 8 pixels from src_argb1
5423 vpunpcklbw ymm0, ymm1, ymm1 // low 4
5424 vpunpckhbw ymm1, ymm1, ymm1 // high 4
5425 vpunpcklbw ymm2, ymm3, ymm5 // low 4
5426 vpunpckhbw ymm3, ymm3, ymm5 // high 4
5427 vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4
5428 vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4
5429 vpackuswb ymm0, ymm0, ymm1
5440 #endif // HAS_ARGBMULTIPLYROW_AVX2
5442 #ifdef HAS_ARGBADDROW_AVX2
5443 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
5444 __declspec(naked) __declspec(align(16))
5445 void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
5446 uint8* dst_argb, int width) {
5449 mov eax, [esp + 4 + 4] // src_argb0
5450 mov esi, [esp + 4 + 8] // src_argb1
5451 mov edx, [esp + 4 + 12] // dst_argb
5452 mov ecx, [esp + 4 + 16] // width
5456 vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
5458 vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1
5470 #endif // HAS_ARGBADDROW_AVX2
5472 #ifdef HAS_ARGBSUBTRACTROW_AVX2
5473 // Subtract one row of ARGB pixels from another, 8 pixels at a time.
5474 __declspec(naked) __declspec(align(16))
5475 void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
5476 uint8* dst_argb, int width) {
5479 mov eax, [esp + 4 + 4] // src_argb0
5480 mov esi, [esp + 4 + 8] // src_argb1
5481 mov edx, [esp + 4 + 12] // dst_argb
5482 mov ecx, [esp + 4 + 16] // width
5486 vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
5488 vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1
5500 #endif // HAS_ARGBSUBTRACTROW_AVX2
5502 #ifdef HAS_SOBELXROW_SSE2
5503 // SobelX as a matrix is
5504 // -1  0  1
5505 // -2  0  2
5506 // -1  0  1
5507 __declspec(naked) __declspec(align(16))
5508 void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
5509 const uint8* src_y2, uint8* dst_sobelx, int width) {
5513 mov eax, [esp + 8 + 4] // src_y0
5514 mov esi, [esp + 8 + 8] // src_y1
5515 mov edi, [esp + 8 + 12] // src_y2
5516 mov edx, [esp + 8 + 16] // dst_sobelx
5517 mov ecx, [esp + 8 + 20] // width
5521 pxor xmm5, xmm5 // constant 0
5525 movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
5526 movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
5527 punpcklbw xmm0, xmm5
5528 punpcklbw xmm1, xmm5
5530 movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
5531 movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
5532 punpcklbw xmm1, xmm5
5533 punpcklbw xmm2, xmm5
5535 movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0]
5536 movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2]
5537 punpcklbw xmm2, xmm5
5538 punpcklbw xmm3, xmm5
5543 pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
5548 movq qword ptr [eax + edx], xmm0
5557 #endif // HAS_SOBELXROW_SSE2
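// Scalar sketch of SobelX (hypothetical, for exposition): horizontal
// differences of the three rows, center row weighted twice, absolute value,
// then saturated to a byte.
#if 0
static void SobelXRow_Sketch(const uint8* src_y0, const uint8* src_y1,
                             const uint8* src_y2, uint8* dst_sobelx,
                             int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int sobel = (src_y0[i] - src_y0[i + 2]) +
                (src_y1[i] - src_y1[i + 2]) * 2 +
                (src_y2[i] - src_y2[i + 2]);
    if (sobel < 0) sobel = -sobel;  // abs = max(x, -x), as in the asm.
    dst_sobelx[i] = (uint8)(sobel > 255 ? 255 : sobel);
  }
}
#endif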
5559 #ifdef HAS_SOBELYROW_SSE2
5560 // SobelY as a matrix is
5561 // -1 -2 -1
5562 //  0  0  0
5563 //  1  2  1
5564 __declspec(naked) __declspec(align(16))
5565 void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
5566 uint8* dst_sobely, int width) {
5569 mov eax, [esp + 4 + 4] // src_y0
5570 mov esi, [esp + 4 + 8] // src_y1
5571 mov edx, [esp + 4 + 12] // dst_sobely
5572 mov ecx, [esp + 4 + 16] // width
5575 pxor xmm5, xmm5 // constant 0
5579 movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
5580 movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
5581 punpcklbw xmm0, xmm5
5582 punpcklbw xmm1, xmm5
5584 movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1]
5585 movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1]
5586 punpcklbw xmm1, xmm5
5587 punpcklbw xmm2, xmm5
5589 movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
5590 movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
5591 punpcklbw xmm2, xmm5
5592 punpcklbw xmm3, xmm5
5597 pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
5602 movq qword ptr [eax + edx], xmm0
5610 #endif // HAS_SOBELYROW_SSE2
5612 #ifdef HAS_SOBELROW_SSE2
5613 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
5614 // A = 255
5615 // R = Sobel
5616 // G = Sobel
5617 // B = Sobel
5618 __declspec(naked) __declspec(align(16))
5619 void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
5620 uint8* dst_argb, int width) {
5623 mov eax, [esp + 4 + 4] // src_sobelx
5624 mov esi, [esp + 4 + 8] // src_sobely
5625 mov edx, [esp + 4 + 12] // dst_argb
5626 mov ecx, [esp + 4 + 16] // width
5628 pcmpeqb xmm5, xmm5 // alpha 255
5629 pslld xmm5, 24 // 0xff000000
5633 movdqa xmm0, [eax] // read 16 pixels src_sobelx
5634 movdqa xmm1, [eax + esi] // read 16 pixels src_sobely
5636 paddusb xmm0, xmm1 // sobel = sobelx + sobely
5637 movdqa xmm2, xmm0 // GG
5638 punpcklbw xmm2, xmm0 // First 8
5639 punpckhbw xmm0, xmm0 // Next 8
5640 movdqa xmm1, xmm2 // GGGG
5641 punpcklwd xmm1, xmm2 // First 4
5642 punpckhwd xmm2, xmm2 // Next 4
5643 por xmm1, xmm5 // GGGA
5645 movdqa xmm3, xmm0 // GGGG
5646 punpcklwd xmm3, xmm0 // Next 4
5647 punpckhwd xmm0, xmm0 // Last 4
5648 por xmm3, xmm5 // GGGA
5652 movdqa [edx + 16], xmm2
5653 movdqa [edx + 32], xmm3
5654 movdqa [edx + 48], xmm0
5662 #endif // HAS_SOBELROW_SSE2
5664 #ifdef HAS_SOBELTOPLANEROW_SSE2
5665 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
5666 __declspec(naked) __declspec(align(16))
5667 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
5668 uint8* dst_y, int width) {
5671 mov eax, [esp + 4 + 4] // src_sobelx
5672 mov esi, [esp + 4 + 8] // src_sobely
5673 mov edx, [esp + 4 + 12] // dst_y
5674 mov ecx, [esp + 4 + 16] // width
5679 movdqa xmm0, [eax] // read 16 pixels src_sobelx
5680 movdqa xmm1, [eax + esi] // read 16 pixels src_sobely
5682 paddusb xmm0, xmm1 // sobel = sobelx + sobely
5692 #endif // HAS_SOBELTOPLANEROW_SSE2
5694 #ifdef HAS_SOBELXYROW_SSE2
5695 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
5696 // A = 255
5697 // R = Sobel X
5698 // G = Sobel
5699 // B = Sobel Y
5700 __declspec(naked) __declspec(align(16))
5701 void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
5702 uint8* dst_argb, int width) {
5705 mov eax, [esp + 4 + 4] // src_sobelx
5706 mov esi, [esp + 4 + 8] // src_sobely
5707 mov edx, [esp + 4 + 12] // dst_argb
5708 mov ecx, [esp + 4 + 16] // width
5710 pcmpeqb xmm5, xmm5 // alpha 255
5714 movdqa xmm0, [eax] // read 16 pixels src_sobelx
5715 movdqa xmm1, [eax + esi] // read 16 pixels src_sobely
5718 paddusb xmm2, xmm1 // sobel = sobelx + sobely
5719 movdqa xmm3, xmm0 // XA
5720 punpcklbw xmm3, xmm5
5721 punpckhbw xmm0, xmm5
5722 movdqa xmm4, xmm1 // YS
5723 punpcklbw xmm4, xmm2
5724 punpckhbw xmm1, xmm2
5725 movdqa xmm6, xmm4 // YSXA
5726 punpcklwd xmm6, xmm3 // First 4
5727 punpckhwd xmm4, xmm3 // Next 4
5728 movdqa xmm7, xmm1 // YSXA
5729 punpcklwd xmm7, xmm0 // Next 4
5730 punpckhwd xmm1, xmm0 // Last 4
5733 movdqa [edx + 16], xmm4
5734 movdqa [edx + 32], xmm7
5735 movdqa [edx + 48], xmm1
5743 #endif // HAS_SOBELXYROW_SSE2
5745 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
5746 // Consider float CumulativeSum.
5747 // Consider calling CumulativeSum one row at a time as needed.
5748 // Consider circular CumulativeSum buffer of radius * 2 + 1 height.
5749 // Convert cumulative sum for an area to an average for 1 pixel.
5750 // topleft is pointer to top left of CumulativeSum buffer for area.
5751 // botleft is pointer to bottom left of CumulativeSum buffer.
5752 // width is offset from left to right of area in CumulativeSum buffer measured
5753 // in number of ints.
5754 // area is the number of pixels in the area being averaged.
5755 // dst points to pixel to store result to.
5756 // count is number of averaged pixels to produce.
5757 // Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte
5758 // aligned.
5759 void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
5760 int width, int area, uint8* dst,
5763 mov eax, topleft // eax topleft
5764 mov esi, botleft // esi botleft
5770 rcpss xmm4, xmm5 // 1.0f / area
5771 pshufd xmm4, xmm4, 0
5775 cmp area, 128 // 128 pixels will not overflow 15 bits.
5778 pshufd xmm5, xmm5, 0 // area
5779 pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0
5782 addps xmm5, xmm6 // (65536.0 + area - 1)
5783 mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area
5784 cvtps2dq xmm5, xmm5 // 0.16 fixed point
5785 packssdw xmm5, xmm5 // 16 bit shorts
5787 // 4 pixel loop small blocks.
5792 movdqa xmm1, [eax + 16]
5793 movdqa xmm2, [eax + 32]
5794 movdqa xmm3, [eax + 48]
5797 psubd xmm0, [eax + edx * 4]
5798 psubd xmm1, [eax + edx * 4 + 16]
5799 psubd xmm2, [eax + edx * 4 + 32]
5800 psubd xmm3, [eax + edx * 4 + 48]
5805 psubd xmm1, [esi + 16]
5806 psubd xmm2, [esi + 32]
5807 psubd xmm3, [esi + 48]
5810 paddd xmm0, [esi + edx * 4]
5811 paddd xmm1, [esi + edx * 4 + 16]
5812 paddd xmm2, [esi + edx * 4 + 32]
5813 paddd xmm3, [esi + edx * 4 + 48]
5816 packssdw xmm0, xmm1 // pack 4 pixels into 2 registers
5835 movdqa xmm1, [eax + 16]
5836 movdqa xmm2, [eax + 32]
5837 movdqa xmm3, [eax + 48]
5840 psubd xmm0, [eax + edx * 4]
5841 psubd xmm1, [eax + edx * 4 + 16]
5842 psubd xmm2, [eax + edx * 4 + 32]
5843 psubd xmm3, [eax + edx * 4 + 48]
5848 psubd xmm1, [esi + 16]
5849 psubd xmm2, [esi + 32]
5850 psubd xmm3, [esi + 48]
5853 paddd xmm0, [esi + edx * 4]
5854 paddd xmm1, [esi + edx * 4 + 16]
5855 paddd xmm2, [esi + edx * 4 + 32]
5856 paddd xmm3, [esi + edx * 4 + 48]
5859 cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area
5887 psubd xmm0, [eax + edx * 4]
5890 paddd xmm0, [esi + edx * 4]
5897 movd dword ptr [edi], xmm0
5904 #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
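// Scalar sketch of the integral image average (hypothetical, for
// exposition); the small-block fixed point path above computes the same
// value. Each pixel spans 4 int32 lanes, one per channel:
// sum = topleft - topright - botleft + botright, average = sum / area.
#if 0
static void CumulativeSumToAverageRow_Sketch(const int32* topleft,
                                             const int32* botleft,
                                             int width, int area, uint8* dst,
                                             int count) {
  float ooa = 1.0f / area;
  int i, c;
  for (i = 0; i < count; ++i) {
    for (c = 0; c < 4; ++c) {
      int32 sum = topleft[c] - topleft[width + c] -
                  botleft[c] + botleft[width + c];
      dst[c] = (uint8)(sum * ooa);
    }
    topleft += 4;
    botleft += 4;
    dst += 4;
  }
}
#endif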
5906 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
5907 // Creates a table of cumulative sums where each value is a sum of all values
5908 // above and to the left of the value.
5909 void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
5910 const int32* previous_cumsum, int width) {
5914 mov esi, previous_cumsum
5927 movdqu xmm2, [eax] // 4 argb pixels 16 bytes.
5931 punpcklbw xmm2, xmm1
5933 punpcklwd xmm2, xmm1
5934 punpckhwd xmm3, xmm1
5936 punpckhbw xmm4, xmm1
5938 punpcklwd xmm4, xmm1
5939 punpckhwd xmm5, xmm1
5942 movdqa xmm2, [esi] // previous row above.
5946 movdqa xmm3, [esi + 16]
5950 movdqa xmm4, [esi + 32]
5954 movdqa xmm5, [esi + 48]
5959 movdqa [edx + 16], xmm3
5960 movdqa [edx + 32], xmm4
5961 movdqa [edx + 48], xmm5
5974 movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes.
5976 punpcklbw xmm2, xmm1
5977 punpcklwd xmm2, xmm1
5990 #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
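// A scalar sketch (illustrative only) of the prefix-sum row above: a
// running sum across the row plus the entry directly above yields the 2-D
// inclusive cumulative sum consumed by CumulativeSumToAverageRow.
static void ComputeCumulativeSumRow_Sketch(const uint8* row, int32* cumsum,
                                           const int32* previous_cumsum,
                                           int width) {
  int32 row_sum[4] = {0, 0, 0, 0};
  for (int x = 0; x < width; ++x) {
    for (int c = 0; c < 4; ++c) {
      row_sum[c] += row[x * 4 + c];
      cumsum[x * 4 + c] = row_sum[c] + previous_cumsum[x * 4 + c];
    }
  }
}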
5992 #ifdef HAS_ARGBAFFINEROW_SSE2
5993 // Copy ARGB pixels from a source image, stepping along (u, v) by (du, dv),
5993 // to a row of the destination.
5994 __declspec(naked) __declspec(align(16))
5996 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
5997 uint8* dst_argb, const float* uv_dudv, int width) {
6001 mov eax, [esp + 12] // src_argb
6002 mov esi, [esp + 16] // stride
6003 mov edx, [esp + 20] // dst_argb
6004 mov ecx, [esp + 24] // pointer to uv_dudv
6005 movq xmm2, qword ptr [ecx] // uv
6006 movq xmm7, qword ptr [ecx + 8] // dudv
6007 mov ecx, [esp + 28] // width
6008 shl esi, 16 // stride to upper 16 bits; pixel size 4 goes in lower 16
6014 // setup for 4 pixel loop
6015 pshufd xmm7, xmm7, 0x44 // dup dudv
6016 pshufd xmm5, xmm5, 0 // dup 4, stride
6017 movdqa xmm0, xmm2 // x0, y0, x1, y1
6021 addps xmm4, xmm4 // dudv *= 2
6022 movdqa xmm3, xmm2 // x2, y2, x3, y3
6024 addps xmm4, xmm4 // dudv *= 4
6029 cvttps2dq xmm0, xmm2 // x, y float to int first 2
6030 cvttps2dq xmm1, xmm3 // x, y float to int next 2
6031 packssdw xmm0, xmm1 // x, y as 8 shorts
6032 pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride.
6034 pshufd xmm0, xmm0, 0x39 // shift right
6036 pshufd xmm0, xmm0, 0x39 // shift right
6037 movd xmm1, [eax + esi] // read pixel 0
6038 movd xmm6, [eax + edi] // read pixel 1
6039 punpckldq xmm1, xmm6 // combine pixel 0 and 1
6040 addps xmm2, xmm4 // x, y += dx, dy first 2
6041 movq qword ptr [edx], xmm1
6043 pshufd xmm0, xmm0, 0x39 // shift right
6045 movd xmm6, [eax + esi] // read pixel 2
6046 movd xmm0, [eax + edi] // read pixel 3
6047 punpckldq xmm6, xmm0 // combine pixel 2 and 3
6048 addps xmm3, xmm4 // x, y += dx, dy next 2
6050 movq qword ptr [edx + 8], xmm6
6061 cvttps2dq xmm0, xmm2 // x, y float to int
6062 packssdw xmm0, xmm0 // x, y as shorts
6063 pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride
6064 addps xmm2, xmm7 // x, y += dx, dy
6066 movd xmm0, [eax + esi] // copy a pixel
6077 #endif // HAS_ARGBAFFINEROW_SSE2
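// A scalar sketch (illustrative only) of the affine fetch above: (u, v)
// starts at uv_dudv[0..1] and advances by (du, dv) = uv_dudv[2..3] per
// output pixel; the SSE2 version truncates 4 coordinates at once and
// turns them into byte offsets with pmaddwd (x * 4 + y * stride).
static void ARGBAffineRow_Sketch(const uint8* src_argb, int src_argb_stride,
                                 uint8* dst_argb, const float* uv_dudv,
                                 int width) {
  float u = uv_dudv[0];
  float v = uv_dudv[1];
  for (int i = 0; i < width; ++i) {
    int x = (int)u;  // truncate like cvttps2dq.
    int y = (int)v;
    *(uint32*)(dst_argb + i * 4) =
        *(const uint32*)(src_argb + y * src_argb_stride + x * 4);
    u += uv_dudv[2];
    v += uv_dudv[3];
  }
}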
6079 #ifdef HAS_INTERPOLATEROW_AVX2
6080 // Bilinear filter 32x2 -> 32x1
6081 __declspec(naked) __declspec(align(16))
6082 void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
6083 ptrdiff_t src_stride, int dst_width,
6084 int source_y_fraction) {
6088 mov edi, [esp + 8 + 4] // dst_ptr
6089 mov esi, [esp + 8 + 8] // src_ptr
6090 mov edx, [esp + 8 + 12] // src_stride
6091 mov ecx, [esp + 8 + 16] // dst_width
6092 mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
6094 // Dispatch to specialized filters if applicable.
6096 je xloop100 // 0 / 128. Blend 100 / 0.
6099 je xloop75 // 32 / 128 is 0.25. Blend 75 / 25.
6101 je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.
6103 je xloop25 // 96 / 128 is 0.75. Blend 25 / 75.
6105 vmovd xmm0, eax // high fraction 0..127
6108 vmovd xmm5, eax // low fraction 128..1
6109 vpunpcklbw xmm5, xmm5, xmm0
6110 vpunpcklwd xmm5, xmm5, xmm5
6111 vpxor ymm0, ymm0, ymm0
6112 vpermd ymm5, ymm0, ymm5
6117 vmovdqu ymm2, [esi + edx]
6118 vpunpckhbw ymm1, ymm0, ymm2 // mutates
6119 vpunpcklbw ymm0, ymm0, ymm2 // mutates
6120 vpmaddubsw ymm0, ymm0, ymm5
6121 vpmaddubsw ymm1, ymm1, ymm5
6122 vpsrlw ymm0, ymm0, 7
6123 vpsrlw ymm1, ymm1, 7
6124 vpackuswb ymm0, ymm0, ymm1 // unmutates
6126 vmovdqu [esi + edi], ymm0
6135 vpavgb ymm0, ymm0, [esi + edx]
6136 vpavgb ymm0, ymm0, [esi + edx]
6138 vmovdqu [esi + edi], ymm0
6147 vpavgb ymm0, ymm0, [esi + edx]
6149 vmovdqu [esi + edi], ymm0
6157 vmovdqu ymm0, [esi + edx]
6158 vpavgb ymm0, ymm0, [esi]
6159 vpavgb ymm0, ymm0, [esi]
6161 vmovdqu [esi + edi], ymm0
6166 // Blend 100 / 0 - Copy row unchanged.
6178 #endif // HAS_INTERPOLATEROW_AVX2
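// A scalar sketch (illustrative only) of the fractional blend used by the
// AVX2 path above and the SSSE3 paths below: the 8-bit fraction is halved
// to 7 bits so a (row0, row1) byte pair can be weighted with a single
// pmaddubsw, then shifted right by 7. The 0/25/50/75/100 percent cases
// are dispatched to pavgb averaging or a plain copy instead.
static void InterpolateRow_Sketch(uint8* dst_ptr, const uint8* src_ptr,
                                  ptrdiff_t src_stride, int width,
                                  int source_y_fraction) {
  int y1 = source_y_fraction >> 1;  // 0..127
  int y0 = 128 - y1;
  const uint8* src_ptr1 = src_ptr + src_stride;
  for (int x = 0; x < width; ++x) {
    dst_ptr[x] = (uint8)((src_ptr[x] * y0 + src_ptr1[x] * y1) >> 7);
  }
}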
6180 #ifdef HAS_INTERPOLATEROW_SSSE3
6181 // Bilinear filter 16x2 -> 16x1
6182 __declspec(naked) __declspec(align(16))
6183 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
6184 ptrdiff_t src_stride, int dst_width,
6185 int source_y_fraction) {
6189 mov edi, [esp + 8 + 4] // dst_ptr
6190 mov esi, [esp + 8 + 8] // src_ptr
6191 mov edx, [esp + 8 + 12] // src_stride
6192 mov ecx, [esp + 8 + 16] // dst_width
6193 mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
6196 // Dispatch to specialized filters if applicable.
6198 je xloop100 // 0 / 128. Blend 100 / 0.
6200 je xloop75 // 32 / 128 is 0.25. Blend 75 / 25.
6202 je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.
6204 je xloop25 // 96 / 128 is 0.75. Blend 25 / 75.
6206 movd xmm0, eax // high fraction 0..127
6209 movd xmm5, eax // low fraction 128..1
6210 punpcklbw xmm5, xmm0
6211 punpcklwd xmm5, xmm5
6212 pshufd xmm5, xmm5, 0
6217 movdqa xmm2, [esi + edx]
6219 punpcklbw xmm0, xmm2
6220 punpckhbw xmm1, xmm2
6221 pmaddubsw xmm0, xmm5
6222 pmaddubsw xmm1, xmm5
6227 movdqa [esi + edi], xmm0
6236 movdqa xmm1, [esi + edx]
6240 movdqa [esi + edi], xmm0
6249 movdqa xmm1, [esi + edx]
6252 movdqa [esi + edi], xmm0
6261 movdqa xmm0, [esi + edx]
6265 movdqa [esi + edi], xmm0
6270 // Blend 100 / 0 - Copy row unchanged.
6275 movdqa [esi + edi], xmm0
6285 #endif // HAS_INTERPOLATEROW_SSSE3
6287 #ifdef HAS_INTERPOLATEROW_SSE2
6288 // Bilinear filter 16x2 -> 16x1
6289 __declspec(naked) __declspec(align(16))
6290 void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
6291 ptrdiff_t src_stride, int dst_width,
6292 int source_y_fraction) {
6296 mov edi, [esp + 8 + 4] // dst_ptr
6297 mov esi, [esp + 8 + 8] // src_ptr
6298 mov edx, [esp + 8 + 12] // src_stride
6299 mov ecx, [esp + 8 + 16] // dst_width
6300 mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
6302 // Dispatch to specialized filters if applicable.
6304 je xloop100 // 0 / 256. Blend 100 / 0.
6306 je xloop75 // 64 / 256 is 0.25. Blend 75 / 25.
6308 je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
6310 je xloop25 // 192 / 256 is 0.75. Blend 25 / 75.
6312 movd xmm5, eax // xmm5 = y fraction
6313 punpcklbw xmm5, xmm5
6315 punpcklwd xmm5, xmm5
6316 punpckldq xmm5, xmm5
6317 punpcklqdq xmm5, xmm5
6322 movdqa xmm0, [esi] // row0
6323 movdqa xmm2, [esi + edx] // row1
6326 punpcklbw xmm2, xmm4
6327 punpckhbw xmm3, xmm4
6328 punpcklbw xmm0, xmm4
6329 punpckhbw xmm1, xmm4
6330 psubw xmm2, xmm0 // row1 - row0
6332 paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16
6334 pmulhw xmm2, xmm5 // scale diff
6336 paddw xmm0, xmm2 // sum rows
6340 movdqa [esi + edi], xmm0
6349 movdqa xmm1, [esi + edx]
6353 movdqa [esi + edi], xmm0
6362 movdqa xmm1, [esi + edx]
6365 movdqa [esi + edi], xmm0
6374 movdqa xmm0, [esi + edx]
6378 movdqa [esi + edi], xmm0
6383 // Blend 100 / 0 - Copy row unchanged.
6388 movdqa [esi + edi], xmm0
6398 #endif // HAS_INTERPOLATEROW_SSE2
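// A scalar sketch (illustrative only) of the SSE2 formulation above:
// rather than two multiplies it scales the signed difference, computing
// row0 + (row1 - row0) * fraction / 256, which the vector code realizes
// with psubw/paddw and a pmulhw by the replicated fraction.
static void InterpolateRow_DiffSketch(uint8* dst_ptr, const uint8* src_ptr,
                                      ptrdiff_t src_stride, int width,
                                      int source_y_fraction) {
  const uint8* src_ptr1 = src_ptr + src_stride;
  for (int x = 0; x < width; ++x) {
    int diff = src_ptr1[x] - src_ptr[x];  // signed, fits in 9 bits.
    dst_ptr[x] = (uint8)(src_ptr[x] + ((diff * source_y_fraction) >> 8));
  }
}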
6400 // Bilinear filter 16x2 -> 16x1
6401 __declspec(naked) __declspec(align(16))
6402 void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
6403 ptrdiff_t src_stride, int dst_width,
6404 int source_y_fraction) {
6408 mov edi, [esp + 8 + 4] // dst_ptr
6409 mov esi, [esp + 8 + 8] // src_ptr
6410 mov edx, [esp + 8 + 12] // src_stride
6411 mov ecx, [esp + 8 + 16] // dst_width
6412 mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
6415 // Dispatch to specialized filters if applicable.
6417 je xloop100 // 0 / 128. Blend 100 / 0.
6419 je xloop75 // 32 / 128 is 0.25. Blend 75 / 25.
6421 je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.
6423 je xloop25 // 96 / 128 is 0.75. Blend 25 / 75.
6425 movd xmm0, eax // high fraction 0..127
6428 movd xmm5, eax // low fraction 128..1
6429 punpcklbw xmm5, xmm0
6430 punpcklwd xmm5, xmm5
6431 pshufd xmm5, xmm5, 0
6436 movdqu xmm2, [esi + edx]
6438 punpcklbw xmm0, xmm2
6439 punpckhbw xmm1, xmm2
6440 pmaddubsw xmm0, xmm5
6441 pmaddubsw xmm1, xmm5
6446 movdqu [esi + edi], xmm0
6455 movdqu xmm1, [esi + edx]
6459 movdqu [esi + edi], xmm0
6468 movdqu xmm1, [esi + edx]
6471 movdqu [esi + edi], xmm0
6480 movdqu xmm0, [esi + edx]
6484 movdqu [esi + edi], xmm0
6489 // Blend 100 / 0 - Copy row unchanged.
6494 movdqu [esi + edi], xmm0
6505 #ifdef HAS_INTERPOLATEROW_SSE2
6506 // Bilinear filter 16x2 -> 16x1
6507 __declspec(naked) __declspec(align(16))
6508 void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
6509 ptrdiff_t src_stride, int dst_width,
6510 int source_y_fraction) {
6514 mov edi, [esp + 8 + 4] // dst_ptr
6515 mov esi, [esp + 8 + 8] // src_ptr
6516 mov edx, [esp + 8 + 12] // src_stride
6517 mov ecx, [esp + 8 + 16] // dst_width
6518 mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
6520 // Dispatch to specialized filters if applicable.
6522 je xloop100 // 0 / 256. Blend 100 / 0.
6524 je xloop75 // 64 / 256 is 0.25. Blend 75 / 25.
6526 je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
6528 je xloop25 // 192 / 256 is 0.75. Blend 25 / 75.
6530 movd xmm5, eax // xmm5 = y fraction
6531 punpcklbw xmm5, xmm5
6533 punpcklwd xmm5, xmm5
6534 punpckldq xmm5, xmm5
6535 punpcklqdq xmm5, xmm5
6540 movdqu xmm0, [esi] // row0
6541 movdqu xmm2, [esi + edx] // row1
6544 punpcklbw xmm2, xmm4
6545 punpckhbw xmm3, xmm4
6546 punpcklbw xmm0, xmm4
6547 punpckhbw xmm1, xmm4
6548 psubw xmm2, xmm0 // row1 - row0
6550 paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16
6552 pmulhw xmm2, xmm5 // scale diff
6554 paddw xmm0, xmm2 // sum rows
6558 movdqu [esi + edi], xmm0
6567 movdqu xmm1, [esi + edx]
6571 movdqu [esi + edi], xmm0
6580 movdqu xmm1, [esi + edx]
6583 movdqu [esi + edi], xmm0
6592 movdqu xmm0, [esi + edx]
6596 movdqu [esi + edi], xmm0
6601 // Blend 100 / 0 - Copy row unchanged.
6606 movdqu [esi + edi], xmm0
6616 #endif // HAS_INTERPOLATEROW_SSE2
6618 __declspec(naked) __declspec(align(16))
6619 void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
6620 uint8* dst_uv, int pix) {
6623 mov eax, [esp + 4 + 4] // src_uv
6624 mov edx, [esp + 4 + 8] // src_uv_stride
6625 mov edi, [esp + 4 + 12] // dst_uv
6626 mov ecx, [esp + 4 + 16] // pix
6632 pavgb xmm0, [eax + edx]
6634 movdqa [eax + edi], xmm0
6642 #ifdef HAS_HALFROW_AVX2
6643 __declspec(naked) __declspec(align(16))
6644 void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride,
6645 uint8* dst_uv, int pix) {
6648 mov eax, [esp + 4 + 4] // src_uv
6649 mov edx, [esp + 4 + 8] // src_uv_stride
6650 mov edi, [esp + 4 + 12] // dst_uv
6651 mov ecx, [esp + 4 + 16] // pix
6657 vpavgb ymm0, ymm0, [eax + edx]
6659 vmovdqu [eax + edi], ymm0
6668 #endif // HAS_HALFROW_AVX2
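// A scalar sketch (illustrative only) of HalfRow: pavgb/vpavgb produce the
// rounded average (a + b + 1) >> 1 of a row and the row 'stride' below it.
static void HalfRow_Sketch(const uint8* src_uv, int src_uv_stride,
                           uint8* dst_uv, int pix) {
  for (int i = 0; i < pix; ++i) {
    dst_uv[i] = (uint8)((src_uv[i] + src_uv[i + src_uv_stride] + 1) >> 1);
  }
}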
6670 __declspec(naked) __declspec(align(16))
6671 void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
6672 uint32 selector, int pix) {
6674 mov eax, [esp + 4] // src_argb
6675 mov edx, [esp + 8] // dst_bayer
6676 movd xmm5, [esp + 12] // selector
6677 mov ecx, [esp + 16] // pix
6678 pshufd xmm5, xmm5, 0
6683 movdqa xmm1, [eax + 16]
6687 punpckldq xmm0, xmm1
6689 movq qword ptr [edx], xmm0
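// A scalar sketch (illustrative only) of the selector shuffle above: each
// of the 4 selector bytes is an absolute byte index (pixel * 4 + channel)
// into a 16-byte block of 4 ARGB pixels, producing 1 Bayer byte per pixel.
static void ARGBToBayerRow_Sketch(const uint8* src_argb, uint8* dst_bayer,
                                  uint32 selector, int pix) {
  for (int x = 0; x < pix; x += 4) {  // assumes pix is a multiple of 4.
    for (int i = 0; i < 4; ++i) {
      dst_bayer[x + i] = src_argb[(selector >> (i * 8)) & 0xff];
    }
    src_argb += 16;
  }
}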
6696 // Specialized ARGB to Bayer that just isolates the G channel.
6697 __declspec(naked) __declspec(align(16))
6698 void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
6699 uint32 selector, int pix) {
6701 mov eax, [esp + 4] // src_argb
6702 mov edx, [esp + 8] // dst_bayer
6704 mov ecx, [esp + 16] // pix
6705 pcmpeqb xmm5, xmm5 // generate mask 0x000000ff
6711 movdqa xmm1, [eax + 16]
6713 psrld xmm0, 8 // Move green to bottom.
6720 movq qword ptr [edx], xmm0
6727 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
6728 __declspec(naked) __declspec(align(16))
6729 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
6730 const uint8* shuffler, int pix) {
6732 mov eax, [esp + 4] // src_argb
6733 mov edx, [esp + 8] // dst_argb
6734 mov ecx, [esp + 12] // shuffler
6736 mov ecx, [esp + 16] // pix
6741 movdqa xmm1, [eax + 16]
6747 movdqa [edx + 16], xmm1
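// A scalar sketch (illustrative only) of the pshufb reorder above: the
// first 4 shuffler bytes give, for each output channel, which source
// channel (0..3) to copy, repeated for every pixel.
static void ARGBShuffleRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                  const uint8* shuffler, int pix) {
  for (int x = 0; x < pix; ++x) {
    for (int i = 0; i < 4; ++i) {
      dst_argb[x * 4 + i] = src_argb[x * 4 + shuffler[i]];
    }
  }
}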
6754 __declspec(naked) __declspec(align(16))
6755 void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
6756 const uint8* shuffler, int pix) {
6758 mov eax, [esp + 4] // src_argb
6759 mov edx, [esp + 8] // dst_argb
6760 mov ecx, [esp + 12] // shuffler
6762 mov ecx, [esp + 16] // pix
6767 movdqu xmm1, [eax + 16]
6773 movdqu [edx + 16], xmm1
6780 #ifdef HAS_ARGBSHUFFLEROW_AVX2
6781 __declspec(naked) __declspec(align(16))
6782 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
6783 const uint8* shuffler, int pix) {
6785 mov eax, [esp + 4] // src_argb
6786 mov edx, [esp + 8] // dst_argb
6787 mov ecx, [esp + 12] // shuffler
6788 vbroadcastf128 ymm5, [ecx] // same shuffle in high as low.
6789 mov ecx, [esp + 16] // pix
6794 vmovdqu ymm1, [eax + 32]
6796 vpshufb ymm0, ymm0, ymm5
6797 vpshufb ymm1, ymm1, ymm5
6800 vmovdqu [edx + 32], ymm1
6808 #endif // HAS_ARGBSHUFFLEROW_AVX2
6810 __declspec(naked) __declspec(align(16))
6811 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
6812 const uint8* shuffler, int pix) {
6816 mov eax, [esp + 8 + 4] // src_argb
6817 mov edx, [esp + 8 + 8] // dst_argb
6818 mov esi, [esp + 8 + 12] // shuffler
6819 mov ecx, [esp + 8 + 16] // pix
6822 mov ebx, [esi] // shuffler
6832 // TODO(fbarchard): Use one source pointer and 3 offsets.
6834 movzx ebx, byte ptr [esi]
6835 movzx ebx, byte ptr [eax + ebx]
6837 movzx ebx, byte ptr [esi + 1]
6838 movzx ebx, byte ptr [eax + ebx]
6840 movzx ebx, byte ptr [esi + 2]
6841 movzx ebx, byte ptr [eax + ebx]
6843 movzx ebx, byte ptr [esi + 3]
6844 movzx ebx, byte ptr [eax + ebx]
6857 punpcklbw xmm0, xmm5
6858 punpckhbw xmm1, xmm5
6859 pshufhw xmm0, xmm0, 01Bh // 1B = 00011011 = 0x0123 = BGRAToARGB
6860 pshuflw xmm0, xmm0, 01Bh
6861 pshufhw xmm1, xmm1, 01Bh
6862 pshuflw xmm1, xmm1, 01Bh
6875 punpcklbw xmm0, xmm5
6876 punpckhbw xmm1, xmm5
6877 pshufhw xmm0, xmm0, 039h // 39 = 00111001 = 0x0321 = RGBAToARGB
6878 pshuflw xmm0, xmm0, 039h
6879 pshufhw xmm1, xmm1, 039h
6880 pshuflw xmm1, xmm1, 039h
6893 punpcklbw xmm0, xmm5
6894 punpckhbw xmm1, xmm5
6895 pshufhw xmm0, xmm0, 093h // 93 = 10010011 = 0x2103 = ARGBToRGBA
6896 pshuflw xmm0, xmm0, 093h
6897 pshufhw xmm1, xmm1, 093h
6898 pshuflw xmm1, xmm1, 093h
6911 punpcklbw xmm0, xmm5
6912 punpckhbw xmm1, xmm5
6913 pshufhw xmm0, xmm0, 0C6h // C6 = 11000110 = 0x3012 = ABGRToARGB
6914 pshuflw xmm0, xmm0, 0C6h
6915 pshufhw xmm1, xmm1, 0C6h
6916 pshuflw xmm1, xmm1, 0C6h
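// Illustrative decoding (not a libyuv API) of the pshuflw/pshufhw
// immediates above: each 2-bit field of imm8 selects a source word, so
// 0x1B = 00 01 10 11 emits words 3, 2, 1, 0 - the full reversal used for
// BGRAToARGB.
static void PshufwImm_Sketch(const uint16* src, uint16* dst, int imm8) {
  for (int i = 0; i < 4; ++i) {
    dst[i] = src[(imm8 >> (i * 2)) & 3];
  }
}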
6930 // YUY2 - Macro-pixel = 2 image pixels
6931 // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
6933 // UYVY - Macro-pixel = 2 image pixels
6934 // U0Y0V0Y1....U2Y2V2Y3...U4Y4V4Y5....
6936 __declspec(naked) __declspec(align(16))
6937 void I422ToYUY2Row_SSE2(const uint8* src_y, const uint8* src_u,
6938 const uint8* src_v,
6940 uint8* dst_frame, int width) {
6944 mov eax, [esp + 8 + 4] // src_y
6945 mov esi, [esp + 8 + 8] // src_u
6946 mov edx, [esp + 8 + 12] // src_v
6947 mov edi, [esp + 8 + 16] // dst_frame
6948 mov ecx, [esp + 8 + 20] // width
6953 movq xmm2, qword ptr [esi] // U
6954 movq xmm3, qword ptr [esi + edx] // V
6956 punpcklbw xmm2, xmm3 // UV
6957 movdqu xmm0, [eax] // Y
6960 punpcklbw xmm0, xmm2 // YUYV
6961 punpckhbw xmm1, xmm2
6963 movdqu [edi + 16], xmm1
6974 __declspec(naked) __declspec(align(16))
6975 void I422ToUYVYRow_SSE2(const uint8* src_y, const uint8* src_u,
6976 const uint8* src_v,
6978 uint8* dst_frame, int width) {
6982 mov eax, [esp + 8 + 4] // src_y
6983 mov esi, [esp + 8 + 8] // src_u
6984 mov edx, [esp + 8 + 12] // src_v
6985 mov edi, [esp + 8 + 16] // dst_frame
6986 mov ecx, [esp + 8 + 20] // width
6991 movq xmm2, qword ptr [esi] // U
6992 movq xmm3, qword ptr [esi + edx] // V
6994 punpcklbw xmm2, xmm3 // UV
6995 movdqu xmm0, [eax] // Y
6998 punpcklbw xmm1, xmm0 // UYVY
6999 punpckhbw xmm2, xmm0
7001 movdqu [edi + 16], xmm2
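// A scalar sketch (illustrative only) of the packing above, for the YUY2
// byte order; UYVY is identical with dst_frame[0..3] = U, Y0, V, Y1.
// One U/V pair is shared by two luma samples (assumes even width).
static void I422ToYUY2Row_Sketch(const uint8* src_y, const uint8* src_u,
                                 const uint8* src_v, uint8* dst_frame,
                                 int width) {
  for (int x = 0; x < width; x += 2) {
    dst_frame[0] = src_y[0];
    dst_frame[1] = src_u[0];
    dst_frame[2] = src_y[1];
    dst_frame[3] = src_v[0];
    dst_frame += 4;
    src_y += 2;
    src_u += 1;
    src_v += 1;
  }
}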
7012 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
7013 __declspec(naked) __declspec(align(16))
7014 void ARGBPolynomialRow_SSE2(const uint8* src_argb,
7015 uint8* dst_argb, const float* poly, int width) {
7019 mov eax, [esp + 4 + 4] /* src_argb */
7020 mov edx, [esp + 4 + 8] /* dst_argb */
7021 mov esi, [esp + 4 + 12] /* poly */
7022 mov ecx, [esp + 4 + 16] /* width */
7023 pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints.
7028 // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
7029 // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel
7030 movq xmm0, qword ptr [eax] // BGRABGRA
7032 punpcklbw xmm0, xmm3
7034 punpcklwd xmm0, xmm3 // pixel 0
7035 punpckhwd xmm4, xmm3 // pixel 1
7036 cvtdq2ps xmm0, xmm0 // 4 floats
7038 movdqa xmm1, xmm0 // X
7040 mulps xmm0, [esi + 16] // C1 * X
7041 mulps xmm4, [esi + 16]
7042 addps xmm0, [esi] // result = C0 + C1 * X
7046 mulps xmm2, xmm1 // X * X
7048 mulps xmm1, xmm2 // X * X * X
7050 mulps xmm2, [esi + 32] // C2 * X * X
7051 mulps xmm6, [esi + 32]
7052 mulps xmm1, [esi + 48] // C3 * X * X * X
7053 mulps xmm5, [esi + 48]
7054 addps xmm0, xmm2 // result += C2 * X * X
7056 addps xmm0, xmm1 // result += C3 * X * X * X
7058 cvttps2dq xmm0, xmm0
7059 cvttps2dq xmm4, xmm4
7063 movq qword ptr [edx], xmm0
7070 #endif // HAS_ARGBPOLYNOMIALROW_SSE2
7072 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
7073 __declspec(naked) __declspec(align(16))
7074 void ARGBPolynomialRow_AVX2(const uint8* src_argb,
7075 uint8* dst_argb, const float* poly, int width) {
7078 mov eax, [esp + 4] /* src_argb */
7079 mov edx, [esp + 8] /* dst_argb */
7080 mov ecx, [esp + 12] /* poly */
7081 vbroadcastf128 ymm4, [ecx] // C0
7082 vbroadcastf128 ymm5, [ecx + 16] // C1
7083 vbroadcastf128 ymm6, [ecx + 32] // C2
7084 vbroadcastf128 ymm7, [ecx + 48] // C3
7085 mov ecx, [esp + 16] /* width */
7090 vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels
7092 vcvtdq2ps ymm0, ymm0 // X 8 floats
7093 vmulps ymm2, ymm0, ymm0 // X * X
7094 vmulps ymm3, ymm0, ymm7 // C3 * X
7095 vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X
7096 vfmadd231ps ymm0, ymm2, ymm6 // result += C2 * X * X
7097 vfmadd231ps ymm0, ymm2, ymm3 // result += C3 * X * X * X
7098 vcvttps2dq ymm0, ymm0
7099 vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000
7100 vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000
7101 vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000
7103 vmovq qword ptr [edx], xmm0
7110 #endif // HAS_ARGBPOLYNOMIALROW_AVX2
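// A scalar sketch (illustrative only) of the per-channel cubic above:
// value = C0 + C1*X + C2*X^2 + C3*X^3, where each Ci supplies 4 channel
// coefficients (B, G, R, A) and the result saturates back to a byte.
static void ARGBPolynomialRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                     const float* poly, int width) {
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 4; ++c) {
      float x = (float)src_argb[i * 4 + c];
      float v = poly[c] + poly[c + 4] * x + poly[c + 8] * x * x +
                poly[c + 12] * x * x * x;
      if (v < 0.f) v = 0.f;      // packuswb/vpackusdw saturate.
      if (v > 255.f) v = 255.f;
      dst_argb[i * 4 + c] = (uint8)v;
    }
  }
}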
7112 #ifdef HAS_ARGBCOLORTABLEROW_X86
7113 // Transform ARGB pixels with color table.
7114 __declspec(naked) __declspec(align(16))
7115 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
7119 mov eax, [esp + 4 + 4] /* dst_argb */
7120 mov esi, [esp + 4 + 8] /* table_argb */
7121 mov ecx, [esp + 4 + 12] /* width */
7126 movzx edx, byte ptr [eax]
7128 movzx edx, byte ptr [esi + edx * 4]
7129 mov byte ptr [eax - 4], dl
7130 movzx edx, byte ptr [eax - 4 + 1]
7131 movzx edx, byte ptr [esi + edx * 4 + 1]
7132 mov byte ptr [eax - 4 + 1], dl
7133 movzx edx, byte ptr [eax - 4 + 2]
7134 movzx edx, byte ptr [esi + edx * 4 + 2]
7135 mov byte ptr [eax - 4 + 2], dl
7136 movzx edx, byte ptr [eax - 4 + 3]
7137 movzx edx, byte ptr [esi + edx * 4 + 3]
7138 mov byte ptr [eax - 4 + 3], dl
7145 #endif // HAS_ARGBCOLORTABLEROW_X86
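// A scalar sketch (illustrative only) of the in-place lookup above: each
// channel indexes its own column of a 256-entry interleaved ARGB table.
static void ARGBColorTableRow_Sketch(uint8* dst_argb, const uint8* table_argb,
                                     int width) {
  for (int x = 0; x < width; ++x) {
    dst_argb[0] = table_argb[dst_argb[0] * 4 + 0];  // B
    dst_argb[1] = table_argb[dst_argb[1] * 4 + 1];  // G
    dst_argb[2] = table_argb[dst_argb[2] * 4 + 2];  // R
    dst_argb[3] = table_argb[dst_argb[3] * 4 + 3];  // A
    dst_argb += 4;
  }
}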
7147 #ifdef HAS_RGBCOLORTABLEROW_X86
7148 // Transform RGB pixels with color table.
7149 __declspec(naked) __declspec(align(16))
7150 void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
7153 mov eax, [esp + 4 + 4] /* dst_argb */
7154 mov esi, [esp + 4 + 8] /* table_argb */
7155 mov ecx, [esp + 4 + 12] /* width */
7160 movzx edx, byte ptr [eax]
7162 movzx edx, byte ptr [esi + edx * 4]
7163 mov byte ptr [eax - 4], dl
7164 movzx edx, byte ptr [eax - 4 + 1]
7165 movzx edx, byte ptr [esi + edx * 4 + 1]
7166 mov byte ptr [eax - 4 + 1], dl
7167 movzx edx, byte ptr [eax - 4 + 2]
7168 movzx edx, byte ptr [esi + edx * 4 + 2]
7169 mov byte ptr [eax - 4 + 2], dl
7177 #endif // HAS_RGBCOLORTABLEROW_X86
7179 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
7180 // Transform RGB pixels with luma table.
7181 __declspec(naked) __declspec(align(16))
7182 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
7184 int width, const uint8* luma, uint32 lumacoeff) {
7188 mov eax, [esp + 8 + 4] /* src_argb */
7189 mov edi, [esp + 8 + 8] /* dst_argb */
7190 mov ecx, [esp + 8 + 12] /* width */
7191 movd xmm2, dword ptr [esp + 8 + 16] // luma table
7192 movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff
7193 pshufd xmm2, xmm2, 0
7194 pshufd xmm3, xmm3, 0
7195 pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00
7202 movdqu xmm0, xmmword ptr [eax] // generate luma ptr
7203 pmaddubsw xmm0, xmm3
7205 pand xmm0, xmm4 // mask out low bits
7206 punpcklwd xmm0, xmm5
7207 paddd xmm0, xmm2 // add table base
7209 pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
7211 movzx edx, byte ptr [eax]
7212 movzx edx, byte ptr [esi + edx]
7213 mov byte ptr [edi], dl
7214 movzx edx, byte ptr [eax + 1]
7215 movzx edx, byte ptr [esi + edx]
7216 mov byte ptr [edi + 1], dl
7217 movzx edx, byte ptr [eax + 2]
7218 movzx edx, byte ptr [esi + edx]
7219 mov byte ptr [edi + 2], dl
7220 movzx edx, byte ptr [eax + 3] // copy alpha.
7221 mov byte ptr [edi + 3], dl
7224 pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
7226 movzx edx, byte ptr [eax + 4]
7227 movzx edx, byte ptr [esi + edx]
7228 mov byte ptr [edi + 4], dl
7229 movzx edx, byte ptr [eax + 5]
7230 movzx edx, byte ptr [esi + edx]
7231 mov byte ptr [edi + 5], dl
7232 movzx edx, byte ptr [eax + 6]
7233 movzx edx, byte ptr [esi + edx]
7234 mov byte ptr [edi + 6], dl
7235 movzx edx, byte ptr [eax + 7] // copy alpha.
7236 mov byte ptr [edi + 7], dl
7239 pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
7241 movzx edx, byte ptr [eax + 8]
7242 movzx edx, byte ptr [esi + edx]
7243 mov byte ptr [edi + 8], dl
7244 movzx edx, byte ptr [eax + 9]
7245 movzx edx, byte ptr [esi + edx]
7246 mov byte ptr [edi + 9], dl
7247 movzx edx, byte ptr [eax + 10]
7248 movzx edx, byte ptr [esi + edx]
7249 mov byte ptr [edi + 10], dl
7250 movzx edx, byte ptr [eax + 11] // copy alpha.
7251 mov byte ptr [edi + 11], dl
7255 movzx edx, byte ptr [eax + 12]
7256 movzx edx, byte ptr [esi + edx]
7257 mov byte ptr [edi + 12], dl
7258 movzx edx, byte ptr [eax + 13]
7259 movzx edx, byte ptr [esi + edx]
7260 mov byte ptr [edi + 13], dl
7261 movzx edx, byte ptr [eax + 14]
7262 movzx edx, byte ptr [esi + edx]
7263 mov byte ptr [edi + 14], dl
7264 movzx edx, byte ptr [eax + 15] // copy alpha.
7265 mov byte ptr [edi + 15], dl
7277 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
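// A scalar sketch (illustrative only) of the luma-table remap above: a
// pmaddubsw-style weighted sum of B, G, R (weights from 'lumacoeff')
// masked to its high byte selects one 256-byte row of 'luma', and each
// color channel is remapped through that row while alpha is copied.
static void ARGBLumaColorTableRow_Sketch(const uint8* src_argb,
                                         uint8* dst_argb, int width,
                                         const uint8* luma,
                                         uint32 lumacoeff) {
  uint32 bc = lumacoeff & 0xff;
  uint32 gc = (lumacoeff >> 8) & 0xff;
  uint32 rc = (lumacoeff >> 16) & 0xff;
  for (int i = 0; i < width; ++i) {
    const uint8* row = luma + ((src_argb[0] * bc + src_argb[1] * gc +
                                src_argb[2] * rc) & 0xFF00u);
    dst_argb[0] = row[src_argb[0]];
    dst_argb[1] = row[src_argb[1]];
    dst_argb[2] = row[src_argb[2]];
    dst_argb[3] = src_argb[3];  // copy alpha unchanged.
    src_argb += 4;
    dst_argb += 4;
  }
}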
7279 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
7283 } // namespace libyuv