2 * Copyright 2013 The LibYuv Project Authors. All rights reserved.
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
11 #include "libyuv/row.h"
18 // This module is for Visual C x86.
19 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
// Constant tables used by the ScaleRowDown* SSE2/SSSE3 kernels below.
// In a pshufb control byte, a value of 128 (high bit set) zeroes the
// corresponding destination byte, so 128 acts as a "discard" marker.
// NOTE(review): the declarations for the first three shuffle tables are
// not visible in this excerpt; only their comments and initializers are.
21 // Offsets for source bytes 0 to 9
23 { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
25 // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
27 { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
29 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
31 { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
33 // Offsets for source bytes 0 to 10
34 static uvec8 kShuf01 =
35 { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
37 // Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
38 static uvec8 kShuf11 =
39 { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
41 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
42 static uvec8 kShuf21 =
43 { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
// kMadd* are pmaddubsw weight pairs (each adjacent 3:1 / 1:3 / 2:2 pair
// sums to 4) for the 3/4-width filtering kernels.
45 // Coefficients for source bytes 0 to 10
46 static uvec8 kMadd01 =
47 { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
49 // Coefficients for source bytes 10 to 21
50 static uvec8 kMadd11 =
51 { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
53 // Coefficients for source bytes 21 to 31
54 static uvec8 kMadd21 =
55 { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
57 // Rounding constant added before the final shift in the 3/4 scalers
57 // (loaded into xmm7 by ScaleRowDown34_*_Box_SSSE3 below).
58 static vec16 kRound34 =
59 { 2, 2, 2, 2, 2, 2, 2, 2 };
// Shuffles selecting every 8/3rd byte for the 3/8 point-sampling kernel.
61 static uvec8 kShuf38a =
62 { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
64 static uvec8 kShuf38b =
65 { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
67 // Arrange words 0,3,6 into 0,1,2
68 static uvec8 kShufAc =
69 { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
71 // Arrange words 0,3,6 into 3,4,5
72 static uvec8 kShufAc3 =
73 { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
// 16.16-style reciprocals: multiplying a box sum by 65536/N via pmulhuw
// divides by N (N = 9 or 6 here for 3x3 / 2x3 boxes).
75 // Scaling values for boxes of 3x3 and 2x3
76 static uvec16 kScaleAc33 =
77 { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
79 // Arrange first value for pixels 0,1,2,3,4,5
80 static uvec8 kShufAb0 =
81 { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
83 // Arrange second value for pixels 0,1,2,3,4,5
84 static uvec8 kShufAb1 =
85 { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
87 // Arrange third value for pixels 0,1,2,3,4,5
88 static uvec8 kShufAb2 =
89 { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
91 // Scaling values for boxes of 3x2 and 2x2
92 static uvec16 kScaleAb2 =
93 { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
95 // Reads 32 pixels, throws half away and writes 16 pixels.
96 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
// __declspec(naked): no compiler prologue/epilogue, so cdecl arguments are
// read directly off the stack starting at [esp + 4].
// Register roles (visible portion): eax = src_ptr, edx = dst_ptr,
// ecx = dst_width (loop counter). Loop body is elided in this excerpt.
97 __declspec(naked) __declspec(align(16))
98 void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
99 uint8* dst_ptr, int dst_width) {
101 mov eax, [esp + 4] // src_ptr
102 // src_stride ignored
103 mov edx, [esp + 12] // dst_ptr
104 mov ecx, [esp + 16] // dst_width
109 movdqa xmm1, [eax + 16]
// psrlw by 8 keeps the high byte of each 16-bit lane, i.e. the odd-indexed
// source pixels, in the low byte (pack step elided in this excerpt).
111 psrlw xmm0, 8 // isolate odd pixels.
123 // Blends 32x1 rectangle to 16x1.
124 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
// Horizontal 2:1 downscale that averages each pair of columns (no vertical
// blend; src_stride is unused). eax = src_ptr, edx = dst_ptr, ecx = count.
125 __declspec(naked) __declspec(align(16))
126 void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
127 uint8* dst_ptr, int dst_width) {
129 mov eax, [esp + 4] // src_ptr
131 mov edx, [esp + 12] // dst_ptr
132 mov ecx, [esp + 16] // dst_width
// pcmpeqb sets xmm5 to all-ones; the shift that completes the 0x00ff00ff
// mask (presumably psrlw xmm5, 8) is elided in this excerpt.
133 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
139 movdqa xmm1, [eax + 16]
142 movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
161 // Blends 32x2 rectangle to 16x1.
162 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
// 2x2 box filter: averages two source rows (pavgb), then averages column
// pairs. The "+ 4" bias in the stack offsets accounts for one pushed
// callee-saved register (presumably `push esi`, elided in this excerpt).
// eax = src row0, esi = src_stride, edx = dst_ptr, ecx = dst_width.
163 __declspec(naked) __declspec(align(16))
164 void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
165 uint8* dst_ptr, int dst_width) {
168 mov eax, [esp + 4 + 4] // src_ptr
169 mov esi, [esp + 4 + 8] // src_stride
170 mov edx, [esp + 4 + 12] // dst_ptr
171 mov ecx, [esp + 4 + 16] // dst_width
172 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
178 movdqa xmm1, [eax + 16]
179 movdqa xmm2, [eax + esi]
180 movdqa xmm3, [eax + esi + 16]
182 pavgb xmm0, xmm2 // average rows
185 movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
205 // Reads 32 pixels, throws half away and writes 16 pixels.
206 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
// NOTE(review): the alignment comment above appears stale for this
// _Unaligned_ variant — it uses movdqu, which tolerates unaligned src.
207 __declspec(naked) __declspec(align(16))
208 void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
209 ptrdiff_t src_stride,
210 uint8* dst_ptr, int dst_width) {
212 mov eax, [esp + 4] // src_ptr
213 // src_stride ignored
214 mov edx, [esp + 12] // dst_ptr
215 mov ecx, [esp + 16] // dst_width
220 movdqu xmm1, [eax + 16]
// Keep the high (odd-indexed) byte of each word in the low byte.
222 psrlw xmm0, 8 // isolate odd pixels.
234 // Blends 32x1 rectangle to 16x1.
235 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
// NOTE(review): the alignment comment above appears stale — this variant
// loads with movdqu, so unaligned src is acceptable.
236 __declspec(naked) __declspec(align(16))
237 void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
238 ptrdiff_t src_stride,
239 uint8* dst_ptr, int dst_width) {
241 mov eax, [esp + 4] // src_ptr
243 mov edx, [esp + 12] // dst_ptr
244 mov ecx, [esp + 16] // dst_width
// All-ones; the shift completing the 0x00ff00ff mask is elided here.
245 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
251 movdqu xmm1, [eax + 16]
254 movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
273 // Blends 32x2 rectangle to 16x1.
274 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
// NOTE(review): alignment comment appears stale — loads use movdqu.
// 2x2 box filter, unaligned-load variant. "+ 4" offset bias implies one
// pushed register (presumably `push esi`, elided in this excerpt).
275 __declspec(naked) __declspec(align(16))
276 void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
277 ptrdiff_t src_stride,
278 uint8* dst_ptr, int dst_width) {
281 mov eax, [esp + 4 + 4] // src_ptr
282 mov esi, [esp + 4 + 8] // src_stride
283 mov edx, [esp + 4 + 12] // dst_ptr
284 mov ecx, [esp + 4 + 16] // dst_width
285 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
291 movdqu xmm1, [eax + 16]
292 movdqu xmm2, [eax + esi]
293 movdqu xmm3, [eax + esi + 16]
295 pavgb xmm0, xmm2 // average rows
298 movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
318 // Point samples 32 pixels to 8 pixels.
319 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
// 4:1 horizontal point sampler. eax = src_ptr, edx = dst_ptr,
// ecx = dst_width. Mask-building shifts and the pack/loop are elided here.
320 __declspec(naked) __declspec(align(16))
321 void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
322 uint8* dst_ptr, int dst_width) {
324 mov eax, [esp + 4] // src_ptr
325 // src_stride ignored
326 mov edx, [esp + 12] // dst_ptr
327 mov ecx, [esp + 16] // dst_width
// All-ones; shifts producing the stated 0x00ff0000 mask are elided.
328 pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000
335 movdqa xmm1, [eax + 16]
// 8 output pixels per iteration, written as one qword.
343 movq qword ptr [edx], xmm0
351 // Blends 32x4 rectangle to 8x1.
352 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
// 4x4 box filter. "+ 8" offset bias implies two pushed registers
// (presumably esi and edi, elided here). eax = src row0, esi = stride,
// edi = stride*3, edx = dst_ptr, ecx = dst_width. Four rows are read and
// averaged pairwise, then columns are reduced 32 -> 16 -> 8.
353 __declspec(naked) __declspec(align(16))
354 void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
355 uint8* dst_ptr, int dst_width) {
359 mov eax, [esp + 8 + 4] // src_ptr
360 mov esi, [esp + 8 + 8] // src_stride
361 mov edx, [esp + 8 + 12] // dst_ptr
362 mov ecx, [esp + 8 + 16] // dst_width
363 lea edi, [esi + esi * 2] // src_stride * 3
364 pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
370 movdqa xmm1, [eax + 16]
371 movdqa xmm2, [eax + esi]
372 movdqa xmm3, [eax + esi + 16]
373 pavgb xmm0, xmm2 // average rows
375 movdqa xmm2, [eax + esi * 2]
376 movdqa xmm3, [eax + esi * 2 + 16]
377 movdqa xmm4, [eax + edi]
378 movdqa xmm5, [eax + edi + 16]
385 movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
395 movdqa xmm2, xmm0 // average columns (16 to 8 pixels)
402 movq qword ptr [edx], xmm0
412 // Point samples 32 pixels to 24 pixels.
413 // Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
414 // Then shuffled to do the scaling.
416 // Note that movdqa+palign may be better than movdqu.
417 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
// 3/4-width point sampler using the kShuf0/1/2 pshufb tables (loads of
// those tables are elided here). palignr stitches the two 16-byte loads
// so each 8-byte output group sees its 16-byte source window.
418 __declspec(naked) __declspec(align(16))
419 void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
420 uint8* dst_ptr, int dst_width) {
422 mov eax, [esp + 4] // src_ptr
423 // src_stride ignored
424 mov edx, [esp + 12] // dst_ptr
425 mov ecx, [esp + 16] // dst_width
433 movdqa xmm1, [eax + 16]
436 palignr xmm1, xmm0, 8
// 24 output bytes per iteration, written as three qwords.
440 movq qword ptr [edx], xmm0
441 movq qword ptr [edx + 8], xmm1
442 movq qword ptr [edx + 16], xmm2
451 // Blends 32x2 rectangle to 24x1
452 // Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
453 // Then shuffled to do the scaling.
465 // Note that movdqa+palign may be better than movdqu.
466 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
// 3/4-width filter blending two rows. "+ 4" offset bias implies one
// pushed register (presumably esi, elided). xmm7 holds the kRound34
// rounding constant; the kShuf/kMadd table loads are elided here.
467 __declspec(naked) __declspec(align(16))
468 void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
469 ptrdiff_t src_stride,
470 uint8* dst_ptr, int dst_width) {
473 mov eax, [esp + 4 + 4] // src_ptr
474 mov esi, [esp + 4 + 8] // src_stride
475 mov edx, [esp + 4 + 12] // dst_ptr
476 mov ecx, [esp + 4 + 16] // dst_width
482 movdqa xmm7, kRound34
// Each 8-byte output group: load row0 and row1 windows, blend, shuffle,
// weight, round, write (intermediate instructions elided in this excerpt).
486 movdqa xmm0, [eax] // pixels 0..7
487 movdqa xmm1, [eax + esi]
494 movq qword ptr [edx], xmm0
495 movdqu xmm0, [eax + 8] // pixels 8..15
496 movdqu xmm1, [eax + esi + 8]
503 movq qword ptr [edx + 8], xmm0
504 movdqa xmm0, [eax + 16] // pixels 16..23
505 movdqa xmm1, [eax + esi + 16]
515 movq qword ptr [edx + 16], xmm0
// 3/4-width filter, companion to ScaleRowDown34_1_Box_SSSE3 above with a
// different row weighting (leading header comment not visible in this
// excerpt). "+ 4" offset bias implies one pushed register (presumably esi).
524 // Note that movdqa+palign may be better than movdqu.
525 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
526 __declspec(naked) __declspec(align(16))
527 void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
528 ptrdiff_t src_stride,
529 uint8* dst_ptr, int dst_width) {
532 mov eax, [esp + 4 + 4] // src_ptr
533 mov esi, [esp + 4 + 8] // src_stride
534 mov edx, [esp + 4 + 12] // dst_ptr
535 mov ecx, [esp + 4 + 16] // dst_width
// xmm7 = kRound34 rounding constant for the weighted averages.
541 movdqa xmm7, kRound34
545 movdqa xmm0, [eax] // pixels 0..7
546 movdqa xmm1, [eax + esi]
554 movq qword ptr [edx], xmm0
555 movdqu xmm0, [eax + 8] // pixels 8..15
556 movdqu xmm1, [eax + esi + 8]
564 movq qword ptr [edx + 8], xmm0
565 movdqa xmm0, [eax + 16] // pixels 16..23
566 movdqa xmm1, [eax + esi + 16]
577 movq qword ptr [edx + 16], xmm0
588 // Scale 32 pixels to 12
// 3/8-width point sampler: kShuf38a/kShuf38b pick every 8/3rd byte from
// the low/high halves; results are combined and 12 bytes written per
// iteration (combine step elided in this excerpt).
589 __declspec(naked) __declspec(align(16))
590 void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
591 uint8* dst_ptr, int dst_width) {
593 mov eax, [esp + 4] // src_ptr
594 // src_stride ignored
595 mov edx, [esp + 12] // dst_ptr
596 mov ecx, [esp + 16] // dst_width
597 movdqa xmm4, kShuf38a
598 movdqa xmm5, kShuf38b
602 movdqa xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5
603 movdqa xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11
610 movq qword ptr [edx], xmm0 // write 12 pixels
620 // Scale 16x3 pixels to 6x1 with interpolation
// 3x3 box filter for the 3/8 scaler: three rows are summed as words, the
// sums rearranged via kShufAc/kShufAc3, then multiplied by kScaleAc33
// (65536/9, 65536/6) with pmulhuw to divide by the box size.
// "+ 4" offset bias implies one pushed register (presumably esi, elided).
621 __declspec(naked) __declspec(align(16))
622 void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
623 ptrdiff_t src_stride,
624 uint8* dst_ptr, int dst_width) {
627 mov eax, [esp + 4 + 4] // src_ptr
628 mov esi, [esp + 4 + 8] // src_stride
629 mov edx, [esp + 4 + 12] // dst_ptr
630 mov ecx, [esp + 4 + 16] // dst_width
632 movdqa xmm3, kShufAc3
633 movdqa xmm4, kScaleAc33
638 movdqa xmm0, [eax] // sum up 3 rows into xmm0/1
639 movdqa xmm6, [eax + esi]
648 movdqa xmm6, [eax + esi * 2]
656 movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6
663 movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6
// pmulhuw by 65536/N == divide the 16-bit box sums by N.
671 pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6
675 movd [edx], xmm6 // write 6 pixels
686 // Scale 16x2 pixels to 6x1 with interpolation
// 2-row box filter for the 3/8 scaler: rows averaged with pavgb, then
// three kShufAb* shuffles spread the samples into word lanes which are
// summed and scaled by kScaleAb2 (65536/3, 65536/2) via pmulhuw.
// "+ 4" offset bias implies one pushed register (presumably esi, elided).
687 __declspec(naked) __declspec(align(16))
688 void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
689 ptrdiff_t src_stride,
690 uint8* dst_ptr, int dst_width) {
693 mov eax, [esp + 4 + 4] // src_ptr
694 mov esi, [esp + 4 + 8] // src_stride
695 mov edx, [esp + 4 + 12] // dst_ptr
696 mov ecx, [esp + 4 + 16] // dst_width
697 movdqa xmm2, kShufAb0
698 movdqa xmm3, kShufAb1
699 movdqa xmm4, kShufAb2
700 movdqa xmm5, kScaleAb2
704 movdqa xmm0, [eax] // average 2 rows into xmm0
705 pavgb xmm0, [eax + esi]
708 movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1
716 pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2
720 movd [edx], xmm1 // write 6 pixels
731 // Reads 16xN bytes and produces 16 shorts at a time.
732 // TODO(fbarchard): Make this handle 4xN bytes for any width ARGB.
// Vertically sums src_height rows of bytes into 16-bit accumulators.
// "+ 16" offset bias implies four pushed registers (elided here); the
// parameter list is also cut short in this excerpt (an int src_height
// parameter presumably follows — confirm against the full source).
// esi = src_ptr, edx = src_stride, edi = dst_ptr (uint16*), ecx = width,
// ebx = row count; eax is the per-column row cursor (init elided).
733 __declspec(naked) __declspec(align(16))
734 void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
735 uint16* dst_ptr, int src_width,
742 mov esi, [esp + 16 + 4] // src_ptr
743 mov edx, [esp + 16 + 8] // src_stride
744 mov edi, [esp + 16 + 12] // dst_ptr
745 mov ecx, [esp + 16 + 16] // dst_width
746 mov ebx, [esp + 16 + 20] // height
763 // sum remaining rows
766 movdqa xmm2, [eax] // read 16 pixels
767 lea eax, [eax + edx] // advance to next row
// paddusw saturates, so very tall columns clamp at 65535 per lane.
771 paddusw xmm0, xmm2 // sum 16 words
779 movdqa [edi + 16], xmm1
793 // Bilinear column filtering. SSSE3 version.
794 // TODO(fbarchard): Port to Neon
795 // TODO(fbarchard): Switch the following:
797 // mov bx, word ptr [esi + eax] // 2 source x0 pixels
799 // movzx ebx, word ptr [esi + eax] // 2 source x0 pixels
800 // when drmemory bug fixed.
801 // https://code.google.com/p/drmemory/issues/detail?id=1396
// x and dx are 16.16 fixed point. Two output pixels are produced per
// iteration: integer parts index the source, fractional parts (reduced to
// 7 bits) weight adjacent source pixel pairs via pmaddubsw.
// "+ 12" offset bias implies three pushed registers (presumably
// ebx/esi/edi, elided here). edi = dst, esi = src, ecx = dst_width;
// xmm2 = current x positions, xmm3 = step.
803 __declspec(naked) __declspec(align(16))
804 void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
805 int dst_width, int x, int dx) {
810 mov edi, [esp + 12 + 4] // dst_ptr
811 mov esi, [esp + 12 + 8] // src_ptr
812 mov ecx, [esp + 12 + 12] // dst_width
813 movd xmm2, [esp + 12 + 16] // x
814 movd xmm3, [esp + 12 + 20] // dx
815 mov eax, 0x04040000 // shuffle to line up fractions with pixel.
817 pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction.
819 pextrw eax, xmm2, 1 // get x0 integer. preroll
823 movdqa xmm0, xmm2 // x1 = x0 + dx
825 punpckldq xmm2, xmm0 // x0 x1
826 punpckldq xmm3, xmm3 // dx dx
827 paddd xmm3, xmm3 // dx * 2, dx * 2
828 pextrw edx, xmm2, 3 // get x1 integer. preroll
// Main loop body: fetch two byte-pairs, build (frac, 1-frac) weights,
// blend with pmaddubsw, and pack to bytes.
833 movdqa xmm1, xmm2 // x0, x1 fractions.
834 paddd xmm2, xmm3 // x += dx
835 movzx ebx, word ptr [esi + eax] // 2 source x0 pixels
837 psrlw xmm1, 9 // 7 bit fractions.
838 movzx ebx, word ptr [esi + edx] // 2 source x1 pixels
840 pshufb xmm1, xmm5 // 0011
// xmm6 = 0x007f per lane: XOR maps fraction f to 127 - f, giving the
// complementary weight for the left pixel of each pair.
842 pxor xmm1, xmm6 // 0..7f and 7f..0
843 pmaddubsw xmm0, xmm1 // 16 bit, 2 pixels.
844 pextrw eax, xmm2, 1 // get x0 integer. next iteration.
845 pextrw edx, xmm2, 3 // get x1 integer. next iteration.
846 psrlw xmm0, 7 // 8.7 fixed point to low 8 bits.
847 packuswb xmm0, xmm0 // 8 bits, 2 pixels.
851 sub ecx, 2 // 2 pixels
// Odd-width remainder: one final pixel, same weighting scheme.
861 movzx ebx, word ptr [esi + eax] // 2 source x0 pixels
863 psrlw xmm2, 9 // 7 bit fractions.
864 pshufb xmm2, xmm5 // 0011
865 pxor xmm2, xmm6 // 0..7f and 7f..0
866 pmaddubsw xmm0, xmm2 // 16 bit
867 psrlw xmm0, 7 // 8.7 fixed point to low 8 bits.
868 packuswb xmm0, xmm0 // 8 bits
882 // Reads 16 pixels, duplicates them and writes 32 pixels.
883 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
// 2x horizontal upscale by pixel duplication; x and dx are unused.
// Note the (dst, src) argument order — reversed vs. the RowDown kernels.
884 __declspec(naked) __declspec(align(16))
885 void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
886 int dst_width, int x, int dx) {
888 mov edx, [esp + 4] // dst_ptr
889 mov eax, [esp + 8] // src_ptr
890 mov ecx, [esp + 12] // dst_width
// Second 16 duplicated bytes of each 32-byte output (unpack elided here).
901 movdqa [edx + 16], xmm1
909 // Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6)
910 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
// ARGB (4 bytes/pixel) 2:1 point sampler: 8 pixels in, 4 out.
// NOTE(review): shufps with 0xdd selects dwords 1,3 of each source, i.e.
// the ODD pixels (1,3,5,7) — the "even pixels" header comment above
// appears stale; confirm against the full source.
911 __declspec(naked) __declspec(align(16))
912 void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
913 ptrdiff_t src_stride,
914 uint8* dst_argb, int dst_width) {
916 mov eax, [esp + 4] // src_argb
917 // src_stride ignored
918 mov edx, [esp + 12] // dst_argb
919 mov ecx, [esp + 16] // dst_width
924 movdqa xmm1, [eax + 16]
926 shufps xmm0, xmm1, 0xdd
936 // Blends 8x1 rectangle to 4x1.
// ARGB horizontal 2:1 downscale: splits 8 pixels into even/odd dword
// streams with shufps and averages them (pavgb elided in this excerpt).
// src_stride is unused — single-row blend.
937 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
938 __declspec(naked) __declspec(align(16))
939 void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
940 ptrdiff_t src_stride,
941 uint8* dst_argb, int dst_width) {
943 mov eax, [esp + 4] // src_argb
944 // src_stride ignored
945 mov edx, [esp + 12] // dst_argb
946 mov ecx, [esp + 16] // dst_width
951 movdqa xmm1, [eax + 16]
954 shufps xmm0, xmm1, 0x88 // even pixels
955 shufps xmm2, xmm1, 0xdd // odd pixels
966 // Blends 8x2 rectangle to 4x1.
967 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
// ARGB 2x2 box filter: average two rows, then average even/odd pixel
// columns. "+ 4" offset bias implies one pushed register (presumably esi,
// elided here). esi = src_stride in bytes.
968 __declspec(naked) __declspec(align(16))
969 void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
970 ptrdiff_t src_stride,
971 uint8* dst_argb, int dst_width) {
974 mov eax, [esp + 4 + 4] // src_argb
975 mov esi, [esp + 4 + 8] // src_stride
976 mov edx, [esp + 4 + 12] // dst_argb
977 mov ecx, [esp + 4 + 16] // dst_width
982 movdqa xmm1, [eax + 16]
983 movdqa xmm2, [eax + esi]
984 movdqa xmm3, [eax + esi + 16]
986 pavgb xmm0, xmm2 // average rows
988 movdqa xmm2, xmm0 // average columns (8 to 4 pixels)
989 shufps xmm0, xmm1, 0x88 // even pixels
990 shufps xmm2, xmm1, 0xdd // odd pixels
1002 // Reads 4 pixels at a time.
1003 // Alignment requirement: dst_argb 16 byte aligned.
// Gathers every src_stepx-th ARGB pixel (4 dword loads combined into one
// 16-byte store). "+ 8" offset bias implies two pushed registers
// (presumably ebx and edi, elided). The src_stepx parameter is not
// visible in the signature here — it sits between src_stride and
// dst_argb per the [esp + 8 + 12] load; confirm against the full source.
// NOTE(review): ebx is presumably scaled to a byte step (stepx * 4)
// before the loop — the scaling instruction is elided in this excerpt.
1004 __declspec(naked) __declspec(align(16))
1005 void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
1007 uint8* dst_argb, int dst_width) {
1011 mov eax, [esp + 8 + 4] // src_argb
1012 // src_stride ignored
1013 mov ebx, [esp + 8 + 12] // src_stepx
1014 mov edx, [esp + 8 + 16] // dst_argb
1015 mov ecx, [esp + 8 + 20] // dst_width
1017 lea edi, [ebx + ebx * 2]
// Pixels at 0, step, 2*step, 3*step are packed into xmm0 low-to-high.
1022 movd xmm1, [eax + ebx]
1023 punpckldq xmm0, xmm1
1024 movd xmm2, [eax + ebx * 2]
1025 movd xmm3, [eax + edi]
1026 lea eax, [eax + ebx * 4]
1027 punpckldq xmm2, xmm3
1028 punpcklqdq xmm0, xmm2
1040 // Blends four 2x2 to 4x1.
1041 // Alignment requirement: dst_argb 16 byte aligned.
// Strided ARGB 2x2 box filter: for each output pixel, loads a horizontal
// pixel pair from row0 and row1 at stride ebx, averages rows (pavgb),
// then averages the pair via the even/odd shufps split.
// "+ 12" offset bias implies three pushed registers (presumably
// ebx/esi/edi, elided). eax = row0, esi = row1, edi = step * 3.
1042 __declspec(naked) __declspec(align(16))
1043 void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
1044 ptrdiff_t src_stride,
1046 uint8* dst_argb, int dst_width) {
1051 mov eax, [esp + 12 + 4] // src_argb
1052 mov esi, [esp + 12 + 8] // src_stride
1053 mov ebx, [esp + 12 + 12] // src_stepx
1054 mov edx, [esp + 12 + 16] // dst_argb
1055 mov ecx, [esp + 12 + 20] // dst_width
1056 lea esi, [eax + esi] // row1 pointer
1058 lea edi, [ebx + ebx * 2]
1062 movq xmm0, qword ptr [eax] // row0 4 pairs
1063 movhps xmm0, qword ptr [eax + ebx]
1064 movq xmm1, qword ptr [eax + ebx * 2]
1065 movhps xmm1, qword ptr [eax + edi]
1066 lea eax, [eax + ebx * 4]
1067 movq xmm2, qword ptr [esi] // row1 4 pairs
1068 movhps xmm2, qword ptr [esi + ebx]
1069 movq xmm3, qword ptr [esi + ebx * 2]
1070 movhps xmm3, qword ptr [esi + edi]
1071 lea esi, [esi + ebx * 4]
1072 pavgb xmm0, xmm2 // average rows
1074 movdqa xmm2, xmm0 // average columns (8 to 4 pixels)
1075 shufps xmm0, xmm1, 0x88 // even pixels
1076 shufps xmm2, xmm1, 0xdd // odd pixels
1090 // Column scaling unfiltered. SSE2 version.
// Nearest-neighbor ARGB column sampler. x and dx are 16.16 fixed point;
// xmm2 tracks four consecutive x positions and xmm3 holds dx*4, so four
// output pixels are gathered per iteration via pextrw of the integer
// parts (word lanes 1,3,5,7 hold the high halves of the four dwords).
// "+ 8" offset bias implies two pushed registers (presumably esi/edi,
// elided). edi = dst, esi = src, ecx = dst_width.
1091 __declspec(naked) __declspec(align(16))
1092 void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
1093 int dst_width, int x, int dx) {
1097 mov edi, [esp + 8 + 4] // dst_argb
1098 mov esi, [esp + 8 + 8] // src_argb
1099 mov ecx, [esp + 8 + 12] // dst_width
1100 movd xmm2, [esp + 8 + 16] // x
1101 movd xmm3, [esp + 8 + 20] // dx
// Build the vector of four start positions: x, x+dx, x+2dx, x+3dx.
1103 pshufd xmm2, xmm2, 0 // x0 x0 x0 x0
1104 pshufd xmm0, xmm3, 0x11 // dx 0 dx 0
1106 paddd xmm3, xmm3 // 0, 0, 0, dx * 2
1107 pshufd xmm0, xmm3, 0x05 // dx * 2, dx * 2, 0, 0
1108 paddd xmm2, xmm0 // x3 x2 x1 x0
1109 paddd xmm3, xmm3 // 0, 0, 0, dx * 4
1110 pshufd xmm3, xmm3, 0 // dx * 4, dx * 4, dx * 4, dx * 4
1112 pextrw eax, xmm2, 1 // get x0 integer.
1113 pextrw edx, xmm2, 3 // get x1 integer.
// Main loop: gather 4 ARGB dwords and store 16 bytes per iteration.
1123 movd xmm0, [esi + eax * 4] // 1 source x0 pixels
1124 movd xmm1, [esi + edx * 4] // 1 source x1 pixels
1125 pextrw eax, xmm2, 5 // get x2 integer.
1126 pextrw edx, xmm2, 7 // get x3 integer.
1127 paddd xmm2, xmm3 // x += dx
1128 punpckldq xmm0, xmm1 // x0 x1
1130 movd xmm1, [esi + eax * 4] // 1 source x2 pixels
1131 movd xmm4, [esi + edx * 4] // 1 source x3 pixels
1132 pextrw eax, xmm2, 1 // get x0 integer. next iteration.
1133 pextrw edx, xmm2, 3 // get x1 integer. next iteration.
1134 punpckldq xmm1, xmm4 // x2 x3
1135 punpcklqdq xmm0, xmm1 // x0 x1 x2 x3
1136 sub ecx, 4 // 4 pixels
// Remainder handling: 2-pixel then 1-pixel tails.
1147 movd xmm0, [esi + eax * 4] // 1 source x0 pixels
1148 movd xmm1, [esi + edx * 4] // 1 source x1 pixels
1149 pextrw eax, xmm2, 5 // get x2 integer.
1150 punpckldq xmm0, xmm1 // x0 x1
1152 movq qword ptr [edi], xmm0
1160 movd xmm0, [esi + eax * 4] // 1 source x2 pixels
1161 movd dword ptr [edi], xmm0
1171 // Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
1172 // TODO(fbarchard): Port to Neon
1174 // Shuffle table for arranging 2 pixels into pairs for pmaddubsw
// Interleaves the bytes of pixel pairs channel-by-channel so pmaddubsw
// can blend each channel with a (frac, 1-frac) weight pair.
// NOTE(review): the closing "};" of both initializers is not visible in
// this excerpt.
1175 static uvec8 kShuffleColARGB = {
1176 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
1177 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
1180 // Shuffle table for duplicating 2 fractions into 8 bytes each
1181 static uvec8 kShuffleFractions = {
1182 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
// Bilinear ARGB column filter. x/dx are 16.16 fixed point; two output
// pixels per loop iteration. Fractions are reduced to 7 bits, duplicated
// across each pixel's 8 bytes via kShuffleFractions, inverted with the
// 0x007f mask, and applied with pmaddubsw on kShuffleColARGB-arranged
// pixel pairs. "+ 8" offset bias implies two pushed registers
// (presumably esi/edi, elided in this excerpt).
1185 __declspec(naked) __declspec(align(16))
1186 void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
1187 int dst_width, int x, int dx) {
1191 mov edi, [esp + 8 + 4] // dst_argb
1192 mov esi, [esp + 8 + 8] // src_argb
1193 mov ecx, [esp + 8 + 12] // dst_width
1194 movd xmm2, [esp + 8 + 16] // x
1195 movd xmm3, [esp + 8 + 20] // dx
1196 movdqa xmm4, kShuffleColARGB
1197 movdqa xmm5, kShuffleFractions
1198 pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction.
1200 pextrw eax, xmm2, 1 // get x0 integer. preroll
1204 movdqa xmm0, xmm2 // x1 = x0 + dx
1206 punpckldq xmm2, xmm0 // x0 x1
1207 punpckldq xmm3, xmm3 // dx dx
1208 paddd xmm3, xmm3 // dx * 2, dx * 2
1209 pextrw edx, xmm2, 3 // get x1 integer. preroll
// Main loop: blend two adjacent-source-pixel pairs per output pair.
1214 movdqa xmm1, xmm2 // x0, x1 fractions.
1215 paddd xmm2, xmm3 // x += dx
1216 movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels
1217 psrlw xmm1, 9 // 7 bit fractions.
1218 movhps xmm0, qword ptr [esi + edx * 4] // 2 source x1 pixels
1219 pshufb xmm1, xmm5 // 0000000011111111
1220 pshufb xmm0, xmm4 // arrange pixels into pairs
// XOR with 0x007f maps fraction f to 127 - f (complementary weight).
1221 pxor xmm1, xmm6 // 0..7f and 7f..0
1222 pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels.
1223 pextrw eax, xmm2, 1 // get x0 integer. next iteration.
1224 pextrw edx, xmm2, 3 // get x1 integer. next iteration.
1225 psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits.
1226 packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels.
1227 movq qword ptr [edi], xmm0
1229 sub ecx, 2 // 2 pixels
1238 // 1 pixel remainder
1239 psrlw xmm2, 9 // 7 bit fractions.
1240 movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels
1241 pshufb xmm2, xmm5 // 00000000
1242 pshufb xmm0, xmm4 // arrange pixels into pairs
1243 pxor xmm2, xmm6 // 0..7f and 7f..0
1244 pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel.
1246 packuswb xmm0, xmm0 // argb 8 bits, 1 pixel.
1258 // Reads 4 pixels, duplicates them and writes 8 pixels.
1259 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
// ARGB 2x horizontal upscale by dword duplication (punpckldq/punpckhdq
// duplicate each 4-byte pixel); x and dx are unused.
1260 __declspec(naked) __declspec(align(16))
1261 void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
1262 int dst_width, int x, int dx) {
1264 mov edx, [esp + 4] // dst_argb
1265 mov eax, [esp + 8] // src_argb
1266 mov ecx, [esp + 12] // dst_width
1273 punpckldq xmm0, xmm0
1274 punpckhdq xmm1, xmm1
1277 movdqa [edx + 16], xmm1
1285 // Divide num by div and return as 16.16 fixed point result.
// Computes (num << 16) / div with a signed 64/32 idiv: cdq sign-extends
// eax into edx, then shld shifts the top 16 bits of num into edx while
// the (elided) matching shift on eax completes the <<16, forming a 48.16
// value in edx:eax. Result is returned in eax (cdecl). Note: idiv raises
// #DE on divide-by-zero or overflow; callers must ensure div != 0.
1286 __declspec(naked) __declspec(align(16))
1287 int FixedDiv_X86(int num, int div) {
1289 mov eax, [esp + 4] // num
1290 cdq // extend num to 64 bits
1291 shld edx, eax, 16 // 32.16
1293 idiv dword ptr [esp + 8]
1298 // Divide num by div and return as 16.16 fixed point result.
// Variant of FixedDiv_X86 above; loads the denominator into ecx up front
// (the adjustment arithmetic and the idiv itself are elided in this
// excerpt — presumably it computes ((num-1) << 16) / (div-1) or similar;
// confirm against the full source). Result returned in eax (cdecl).
1299 __declspec(naked) __declspec(align(16))
1300 int FixedDiv1_X86(int num, int div) {
1302 mov eax, [esp + 4] // num
1303 mov ecx, [esp + 8] // denom
1304 cdq // extend num to 64 bits
1305 shld edx, eax, 16 // 32.16
1315 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
1319 } // namespace libyuv