2 * Copyright 2013 The LibYuv Project Authors. All rights reserved.
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
11 #include "libyuv/row.h"
18 // This module is for GCC x86 and x64.
19 #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
// NOTE(review): the three initializer lists below appear without their
// `static uvec8 kShufN =` declaration lines in this excerpt — confirm the
// declarations against the full file before editing.
21 // Offsets for source bytes 0 to 9
23 { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
25 // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
27 { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
29 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
31 { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
// pshufb shuffle tables and pmaddubsw weight tables for the 3/4 (Down34)
// scalers.  In shuffle tables a value of 128 (high bit set) zeroes the
// destination byte.
33 // Offsets for source bytes 0 to 10
34 static uvec8 kShuf01 =
35 { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
37 // Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
38 static uvec8 kShuf11 =
39 { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
41 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
42 static uvec8 kShuf21 =
43 { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
45 // Coefficients for source bytes 0 to 10
46 static uvec8 kMadd01 =
47 { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
49 // Coefficients for source bytes 10 to 21
50 static uvec8 kMadd11 =
51 { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
53 // Coefficients for source bytes 21 to 31
54 static uvec8 kMadd21 =
55 { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
57 // Rounding constant added before the >>2 in the Down34 box filters.
58 static vec16 kRound34 =
59 { 2, 2, 2, 2, 2, 2, 2, 2 };
// Shuffle tables for the 3/8 (Down38) scaler: pick 6 of 16 bytes into the
// low/high halves so a paddusb merges both loads into 12 output bytes.
61 static uvec8 kShuf38a =
62 { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
64 static uvec8 kShuf38b =
65 { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
67 // Arrange words 0,3,6 into 0,1,2
68 static uvec8 kShufAc =
69 { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
71 // Arrange words 0,3,6 into 3,4,5
72 static uvec8 kShufAc3 =
73 { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
75 // Scaling values for boxes of 3x3 and 2x3
76 static uvec16 kScaleAc33 =
77 { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
79 // Arrange first value for pixels 0,1,2,3,4,5
80 static uvec8 kShufAb0 =
81 { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
83 // Arrange second value for pixels 0,1,2,3,4,5
84 static uvec8 kShufAb1 =
85 { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
87 // Arrange third value for pixels 0,1,2,3,4,5
88 static uvec8 kShufAb2 =
89 { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
91 // Scaling values for boxes of 3x2 and 2x2
92 static uvec16 kScaleAb2 =
93 { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
95 // GCC versions of row functions are verbatim conversions from Visual C.
96 // Generated using gcc disassembly on Visual C object file:
97 // objdump -D yuvscaler.obj >yuvscaler.txt
// 1/2 horizontal downsample by point sampling: per iteration loads 32
// aligned source bytes, keeps the odd byte of each pair (psrlw $8 then
// packuswb) and stores 16 destination bytes.  src_stride is unused here.
// NOTE(review): asm statement open/close and the loop label/branch lines
// are not visible in this excerpt.
99 void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
100 uint8* dst_ptr, int dst_width) {
104 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
105 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
106 "lea " MEMLEA(0x20,0) ",%0 \n"
107 "psrlw $0x8,%%xmm0 \n"
108 "psrlw $0x8,%%xmm1 \n"
109 "packuswb %%xmm1,%%xmm0 \n"
110 "movdqa %%xmm0," MEMACCESS(1) " \n"
111 "lea " MEMLEA(0x10,1) ",%1 \n"
114 : "+r"(src_ptr), // %0
116 "+r"(dst_width) // %2
119 #if defined(__SSE2__)
// 1/2 horizontal downsample averaging adjacent byte pairs: xmm5 is a
// 0x00ff word mask (pcmpeqb + psrlw $8); even bytes (pand) and odd bytes
// (psrlw $8) of each word are split and averaged with pavgw.
125 void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
126 uint8* dst_ptr, int dst_width) {
128 "pcmpeqb %%xmm5,%%xmm5 \n"
129 "psrlw $0x8,%%xmm5 \n"
133 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
134 "movdqa " MEMACCESS2(0x10, 0) ",%%xmm1 \n"
135 "lea " MEMLEA(0x20,0) ",%0 \n"
136 "movdqa %%xmm0,%%xmm2 \n"
137 "psrlw $0x8,%%xmm0 \n"
138 "movdqa %%xmm1,%%xmm3 \n"
139 "psrlw $0x8,%%xmm1 \n"
140 "pand %%xmm5,%%xmm2 \n"
141 "pand %%xmm5,%%xmm3 \n"
142 "pavgw %%xmm2,%%xmm0 \n"
143 "pavgw %%xmm3,%%xmm1 \n"
144 "packuswb %%xmm1,%%xmm0 \n"
145 "movdqa %%xmm0," MEMACCESS(1) " \n"
146 "lea " MEMLEA(0x10,1) ",%1 \n"
149 : "+r"(src_ptr), // %0
151 "+r"(dst_width) // %2
154 #if defined(__SSE2__)
155 , "xmm0", "xmm1", "xmm5"
// 1/2 box filter: averages a 2x2 block.  First pavgb blends the row at
// src_ptr with the row at src_ptr + src_stride (%3), then adjacent byte
// pairs are averaged as in the Linear variant (mask/shift + pavgw).
160 void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
161 uint8* dst_ptr, int dst_width) {
163 "pcmpeqb %%xmm5,%%xmm5 \n"
164 "psrlw $0x8,%%xmm5 \n"
168 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
169 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
170 MEMOPREG(movdqa,0x00,0,3,1,xmm2) // movdqa (%0,%3,1),%%xmm2
172 MEMOPREG(movdqa,0x10,0,3,1,xmm3) // movdqa 0x10(%0,%3,1),%%xmm3
173 "lea " MEMLEA(0x20,0) ",%0 \n"
174 "pavgb %%xmm2,%%xmm0 \n"
175 "pavgb %%xmm3,%%xmm1 \n"
176 "movdqa %%xmm0,%%xmm2 \n"
177 "psrlw $0x8,%%xmm0 \n"
178 "movdqa %%xmm1,%%xmm3 \n"
179 "psrlw $0x8,%%xmm1 \n"
180 "pand %%xmm5,%%xmm2 \n"
181 "pand %%xmm5,%%xmm3 \n"
182 "pavgw %%xmm2,%%xmm0 \n"
183 "pavgw %%xmm3,%%xmm1 \n"
184 "packuswb %%xmm1,%%xmm0 \n"
185 "movdqa %%xmm0," MEMACCESS(1) " \n"
186 "lea " MEMLEA(0x10,1) ",%1 \n"
189 : "+r"(src_ptr), // %0
191 "+r"(dst_width) // %2
192 : "r"((intptr_t)(src_stride)) // %3
194 #if defined(__native_client__) && defined(__x86_64__)
197 #if defined(__SSE2__)
198 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
// Same as ScaleRowDown2_SSE2 (keep odd bytes) but uses movdqu so neither
// src_ptr nor dst_ptr needs 16-byte alignment.
203 void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
204 uint8* dst_ptr, int dst_width) {
208 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
209 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
210 "lea " MEMLEA(0x20,0) ",%0 \n"
211 "psrlw $0x8,%%xmm0 \n"
212 "psrlw $0x8,%%xmm1 \n"
213 "packuswb %%xmm1,%%xmm0 \n"
214 "movdqu %%xmm0," MEMACCESS(1) " \n"
215 "lea " MEMLEA(0x10,1) ",%1 \n"
218 : "+r"(src_ptr), // %0
220 "+r"(dst_width) // %2
223 #if defined(__SSE2__)
// Unaligned (movdqu) variant of ScaleRowDown2Linear_SSE2: averages each
// adjacent byte pair of the source row into one destination byte.
229 void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
230 ptrdiff_t src_stride,
231 uint8* dst_ptr, int dst_width) {
233 "pcmpeqb %%xmm5,%%xmm5 \n"
234 "psrlw $0x8,%%xmm5 \n"
238 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
239 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
240 "lea " MEMLEA(0x20,0) ",%0 \n"
241 "movdqa %%xmm0,%%xmm2 \n"
242 "psrlw $0x8,%%xmm0 \n"
243 "movdqa %%xmm1,%%xmm3 \n"
244 "psrlw $0x8,%%xmm1 \n"
245 "pand %%xmm5,%%xmm2 \n"
246 "pand %%xmm5,%%xmm3 \n"
247 "pavgw %%xmm2,%%xmm0 \n"
248 "pavgw %%xmm3,%%xmm1 \n"
249 "packuswb %%xmm1,%%xmm0 \n"
250 "movdqu %%xmm0," MEMACCESS(1) " \n"
251 "lea " MEMLEA(0x10,1) ",%1 \n"
254 : "+r"(src_ptr), // %0
256 "+r"(dst_width) // %2
259 #if defined(__SSE2__)
260 , "xmm0", "xmm1", "xmm5"
// Unaligned (movdqu) variant of ScaleRowDown2Box_SSE2: 2x2 box average of
// the rows at src_ptr and src_ptr + src_stride (%3).
265 void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
266 ptrdiff_t src_stride,
267 uint8* dst_ptr, int dst_width) {
269 "pcmpeqb %%xmm5,%%xmm5 \n"
270 "psrlw $0x8,%%xmm5 \n"
274 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
275 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
276 MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2
278 MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3
279 "lea " MEMLEA(0x20,0) ",%0 \n"
280 "pavgb %%xmm2,%%xmm0 \n"
281 "pavgb %%xmm3,%%xmm1 \n"
282 "movdqa %%xmm0,%%xmm2 \n"
283 "psrlw $0x8,%%xmm0 \n"
284 "movdqa %%xmm1,%%xmm3 \n"
285 "psrlw $0x8,%%xmm1 \n"
286 "pand %%xmm5,%%xmm2 \n"
287 "pand %%xmm5,%%xmm3 \n"
288 "pavgw %%xmm2,%%xmm0 \n"
289 "pavgw %%xmm3,%%xmm1 \n"
290 "packuswb %%xmm1,%%xmm0 \n"
291 "movdqu %%xmm0," MEMACCESS(1) " \n"
292 "lea " MEMLEA(0x10,1) ",%1 \n"
295 : "+r"(src_ptr), // %0
297 "+r"(dst_width) // %2
298 : "r"((intptr_t)(src_stride)) // %3
300 #if defined(__native_client__) && defined(__x86_64__)
303 #if defined(__SSE2__)
304 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
// 1/4 horizontal downsample by point sampling: xmm5 becomes a
// 0x00ff0000 per-dword mask (all-ones, psrld $24, pslld $16), selecting
// byte 2 of every 4; two packs reduce 32 source bytes to 8 output bytes.
309 void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
310 uint8* dst_ptr, int dst_width) {
312 "pcmpeqb %%xmm5,%%xmm5 \n"
313 "psrld $0x18,%%xmm5 \n"
314 "pslld $0x10,%%xmm5 \n"
318 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
319 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
320 "lea " MEMLEA(0x20,0) ",%0 \n"
321 "pand %%xmm5,%%xmm0 \n"
322 "pand %%xmm5,%%xmm1 \n"
323 "packuswb %%xmm1,%%xmm0 \n"
324 "psrlw $0x8,%%xmm0 \n"
325 "packuswb %%xmm0,%%xmm0 \n"
326 "movq %%xmm0," MEMACCESS(1) " \n"
327 "lea " MEMLEA(0x8,1) ",%1 \n"
330 : "+r"(src_ptr), // %0
332 "+r"(dst_width) // %2
335 #if defined(__SSE2__)
// 1/4 box filter: averages a 4x4 block.  %4 is src_stride; %3 (stridex3)
// is computed as stride*3 with lea so rows 0..3 can all be addressed.
// Rows are blended pairwise with pavgb, then two rounds of horizontal
// pair-averaging (mask/shift + pavgw) reduce 4 columns to 1.
341 void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
342 uint8* dst_ptr, int dst_width) {
343 intptr_t stridex3 = 0;
345 "pcmpeqb %%xmm7,%%xmm7 \n"
346 "psrlw $0x8,%%xmm7 \n"
347 "lea " MEMLEA4(0x00,4,4,2) ",%3 \n"
351 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
352 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
353 MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2
355 MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3
356 "pavgb %%xmm2,%%xmm0 \n"
357 "pavgb %%xmm3,%%xmm1 \n"
358 MEMOPREG(movdqa,0x00,0,4,2,xmm2) // movdqa (%0,%4,2),%%xmm2
360 MEMOPREG(movdqa,0x10,0,4,2,xmm3) // movdqa 0x10(%0,%4,2),%%xmm3
361 MEMOPREG(movdqa,0x00,0,3,1,xmm4) // movdqa (%0,%3,1),%%xmm4
362 MEMOPREG(movdqa,0x10,0,3,1,xmm5) // movdqa 0x10(%0,%3,1),%%xmm5
363 "lea " MEMLEA(0x20,0) ",%0 \n"
364 "pavgb %%xmm4,%%xmm2 \n"
365 "pavgb %%xmm2,%%xmm0 \n"
366 "pavgb %%xmm5,%%xmm3 \n"
367 "pavgb %%xmm3,%%xmm1 \n"
368 "movdqa %%xmm0,%%xmm2 \n"
369 "psrlw $0x8,%%xmm0 \n"
370 "movdqa %%xmm1,%%xmm3 \n"
371 "psrlw $0x8,%%xmm1 \n"
372 "pand %%xmm7,%%xmm2 \n"
373 "pand %%xmm7,%%xmm3 \n"
374 "pavgw %%xmm2,%%xmm0 \n"
375 "pavgw %%xmm3,%%xmm1 \n"
376 "packuswb %%xmm1,%%xmm0 \n"
377 "movdqa %%xmm0,%%xmm2 \n"
378 "psrlw $0x8,%%xmm0 \n"
379 "pand %%xmm7,%%xmm2 \n"
380 "pavgw %%xmm2,%%xmm0 \n"
381 "packuswb %%xmm0,%%xmm0 \n"
382 "movq %%xmm0," MEMACCESS(1) " \n"
383 "lea " MEMLEA(0x8,1) ",%1 \n"
386 : "+r"(src_ptr), // %0
388 "+r"(dst_width), // %2
390 : "r"((intptr_t)(src_stride)) // %4
392 #if defined(__native_client__) && defined(__x86_64__)
395 #if defined(__SSE2__)
396 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7"
// 3/4 horizontal downsample by point sampling: 32 source bytes -> 24
// destination bytes.  Three pshufb tables (loaded via "m" operands %0-%2;
// their declarations are earlier in the file) pick the kept bytes; palignr
// forms the middle 16-byte window straddling the two loads.
401 void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
402 uint8* dst_ptr, int dst_width) {
404 "movdqa %0,%%xmm3 \n"
405 "movdqa %1,%%xmm4 \n"
406 "movdqa %2,%%xmm5 \n"
415 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
416 "movdqa " MEMACCESS2(0x10,0) ",%%xmm2 \n"
417 "lea " MEMLEA(0x20,0) ",%0 \n"
418 "movdqa %%xmm2,%%xmm1 \n"
419 "palignr $0x8,%%xmm0,%%xmm1 \n"
420 "pshufb %%xmm3,%%xmm0 \n"
421 "pshufb %%xmm4,%%xmm1 \n"
422 "pshufb %%xmm5,%%xmm2 \n"
423 "movq %%xmm0," MEMACCESS(1) " \n"
424 "movq %%xmm1," MEMACCESS2(0x8,1) " \n"
425 "movq %%xmm2," MEMACCESS2(0x10,1) " \n"
426 "lea " MEMLEA(0x18,1) ",%1 \n"
429 : "+r"(src_ptr), // %0
431 "+r"(dst_width) // %2
434 #if defined(__SSE2__)
435 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
// 3/4 box filter, even-phase variant: the two source rows are blended
// 50/50 (single pavgb), then each output byte is a weighted horizontal
// average — pshufb pairs neighbours (kShuf01/11/21), pmaddubsw applies the
// 3,1 / 2,2 / 1,3 weights (kMadd01/11, kMadd21 via operand %4), and
// kRound34 (+2) with psrlw $2 rounds the /4.
440 void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
441 ptrdiff_t src_stride,
442 uint8* dst_ptr, int dst_width) {
444 "movdqa %0,%%xmm2 \n" // kShuf01
445 "movdqa %1,%%xmm3 \n" // kShuf11
446 "movdqa %2,%%xmm4 \n" // kShuf21
448 : "m"(kShuf01), // %0
453 "movdqa %0,%%xmm5 \n" // kMadd01
454 "movdqa %1,%%xmm0 \n" // kMadd11
455 "movdqa %2,%%xmm1 \n" // kRound34
457 : "m"(kMadd01), // %0
464 "movdqa " MEMACCESS(0) ",%%xmm6 \n"
465 MEMOPREG(movdqa,0x00,0,3,1,xmm7) // movdqa (%0,%3),%%xmm7
466 "pavgb %%xmm7,%%xmm6 \n"
467 "pshufb %%xmm2,%%xmm6 \n"
468 "pmaddubsw %%xmm5,%%xmm6 \n"
469 "paddsw %%xmm1,%%xmm6 \n"
470 "psrlw $0x2,%%xmm6 \n"
471 "packuswb %%xmm6,%%xmm6 \n"
472 "movq %%xmm6," MEMACCESS(1) " \n"
473 "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n"
474 MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3),%%xmm7
475 "pavgb %%xmm7,%%xmm6 \n"
476 "pshufb %%xmm3,%%xmm6 \n"
477 "pmaddubsw %%xmm0,%%xmm6 \n"
478 "paddsw %%xmm1,%%xmm6 \n"
479 "psrlw $0x2,%%xmm6 \n"
480 "packuswb %%xmm6,%%xmm6 \n"
481 "movq %%xmm6," MEMACCESS2(0x8,1) " \n"
482 "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n"
484 MEMOPREG(movdqa,0x10,0,3,1,xmm7) // movdqa 0x10(%0,%3),%%xmm7
485 "lea " MEMLEA(0x20,0) ",%0 \n"
486 "pavgb %%xmm7,%%xmm6 \n"
487 "pshufb %%xmm4,%%xmm6 \n"
488 "pmaddubsw %4,%%xmm6 \n"
489 "paddsw %%xmm1,%%xmm6 \n"
490 "psrlw $0x2,%%xmm6 \n"
491 "packuswb %%xmm6,%%xmm6 \n"
492 "movq %%xmm6," MEMACCESS2(0x10,1) " \n"
493 "lea " MEMLEA(0x18,1) ",%1 \n"
496 : "+r"(src_ptr), // %0
498 "+r"(dst_width) // %2
499 : "r"((intptr_t)(src_stride)), // %3
502 #if defined(__native_client__) && defined(__x86_64__)
505 #if defined(__SSE2__)
506 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
// 3/4 box filter, odd-phase variant: identical to the _1_ variant except
// the vertical blend is weighted 3:1 toward the near row — the double
// pavgb (row1 = avg(row0,row1); row0 = avg(row0,row1)) yields
// (3*row0 + row1 + rounding) / 4.
511 void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
512 ptrdiff_t src_stride,
513 uint8* dst_ptr, int dst_width) {
515 "movdqa %0,%%xmm2 \n" // kShuf01
516 "movdqa %1,%%xmm3 \n" // kShuf11
517 "movdqa %2,%%xmm4 \n" // kShuf21
519 : "m"(kShuf01), // %0
524 "movdqa %0,%%xmm5 \n" // kMadd01
525 "movdqa %1,%%xmm0 \n" // kMadd11
526 "movdqa %2,%%xmm1 \n" // kRound34
528 : "m"(kMadd01), // %0
536 "movdqa " MEMACCESS(0) ",%%xmm6 \n"
537 MEMOPREG(movdqa,0x00,0,3,1,xmm7) // movdqa (%0,%3,1),%%xmm7
538 "pavgb %%xmm6,%%xmm7 \n"
539 "pavgb %%xmm7,%%xmm6 \n"
540 "pshufb %%xmm2,%%xmm6 \n"
541 "pmaddubsw %%xmm5,%%xmm6 \n"
542 "paddsw %%xmm1,%%xmm6 \n"
543 "psrlw $0x2,%%xmm6 \n"
544 "packuswb %%xmm6,%%xmm6 \n"
545 "movq %%xmm6," MEMACCESS(1) " \n"
546 "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n"
547 MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3,1),%%xmm7
548 "pavgb %%xmm6,%%xmm7 \n"
549 "pavgb %%xmm7,%%xmm6 \n"
550 "pshufb %%xmm3,%%xmm6 \n"
551 "pmaddubsw %%xmm0,%%xmm6 \n"
552 "paddsw %%xmm1,%%xmm6 \n"
553 "psrlw $0x2,%%xmm6 \n"
554 "packuswb %%xmm6,%%xmm6 \n"
555 "movq %%xmm6," MEMACCESS2(0x8,1) " \n"
556 "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n"
557 MEMOPREG(movdqa,0x10,0,3,1,xmm7) // movdqa 0x10(%0,%3,1),%%xmm7
558 "lea " MEMLEA(0x20,0) ",%0 \n"
559 "pavgb %%xmm6,%%xmm7 \n"
560 "pavgb %%xmm7,%%xmm6 \n"
561 "pshufb %%xmm4,%%xmm6 \n"
562 "pmaddubsw %4,%%xmm6 \n"
563 "paddsw %%xmm1,%%xmm6 \n"
564 "psrlw $0x2,%%xmm6 \n"
565 "packuswb %%xmm6,%%xmm6 \n"
566 "movq %%xmm6," MEMACCESS2(0x10,1) " \n"
567 "lea " MEMLEA(0x18,1) ",%1 \n"
570 : "+r"(src_ptr), // %0
572 "+r"(dst_width) // %2
573 : "r"((intptr_t)(src_stride)), // %3
576 #if defined(__native_client__) && defined(__x86_64__)
579 #if defined(__SSE2__)
580 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
// 3/8 horizontal downsample by point sampling: 32 source bytes -> 12
// destination bytes.  kShuf38a/kShuf38b route 6 kept bytes from each load
// into disjoint lanes so paddusb merges them; stored as 8 + 4 bytes.
585 void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
586 uint8* dst_ptr, int dst_width) {
588 "movdqa %3,%%xmm4 \n"
589 "movdqa %4,%%xmm5 \n"
593 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
594 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
595 "lea " MEMLEA(0x20,0) ",%0 \n"
596 "pshufb %%xmm4,%%xmm0 \n"
597 "pshufb %%xmm5,%%xmm1 \n"
598 "paddusb %%xmm1,%%xmm0 \n"
599 "movq %%xmm0," MEMACCESS(1) " \n"
600 "movhlps %%xmm0,%%xmm1 \n"
601 "movd %%xmm1," MEMACCESS2(0x8,1) " \n"
602 "lea " MEMLEA(0xc,1) ",%1 \n"
605 : "+r"(src_ptr), // %0
607 "+r"(dst_width) // %2
608 : "m"(kShuf38a), // %3
611 #if defined(__SSE2__)
612 , "xmm0", "xmm1", "xmm4", "xmm5"
// 3/8 box filter over 2 source rows: pavgb blends the rows, then three
// pshufb tables (kShufAb0/1/2) spread the 1st/2nd/3rd byte of each triad
// into word lanes, paddusw sums them, and pmulhuw by kScaleAb2 divides by
// the box size (3 or 2).  Emits 6 bytes per iteration (movd 4 + movd 2).
617 void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
618 ptrdiff_t src_stride,
619 uint8* dst_ptr, int dst_width) {
621 "movdqa %0,%%xmm2 \n"
622 "movdqa %1,%%xmm3 \n"
623 "movdqa %2,%%xmm4 \n"
624 "movdqa %3,%%xmm5 \n"
626 : "m"(kShufAb0), // %0
634 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
635 MEMOPREG(pavgb,0x00,0,3,1,xmm0) // pavgb (%0,%3,1),%%xmm0
636 "lea " MEMLEA(0x10,0) ",%0 \n"
637 "movdqa %%xmm0,%%xmm1 \n"
638 "pshufb %%xmm2,%%xmm1 \n"
639 "movdqa %%xmm0,%%xmm6 \n"
640 "pshufb %%xmm3,%%xmm6 \n"
641 "paddusw %%xmm6,%%xmm1 \n"
642 "pshufb %%xmm4,%%xmm0 \n"
643 "paddusw %%xmm0,%%xmm1 \n"
644 "pmulhuw %%xmm5,%%xmm1 \n"
645 "packuswb %%xmm1,%%xmm1 \n"
647 "movd %%xmm1," MEMACCESS(1) " \n"
648 "psrlq $0x10,%%xmm1 \n"
649 "movd %%xmm1," MEMACCESS2(0x2,1) " \n"
650 "lea " MEMLEA(0x6,1) ",%1 \n"
652 : "+r"(src_ptr), // %0
654 "+r"(dst_width) // %2
655 : "r"((intptr_t)(src_stride)) // %3
657 #if defined(__native_client__) && defined(__x86_64__)
660 #if defined(__SSE2__)
661 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
// 3/8 box filter over 3 source rows: rows at %0, %0+stride and %0+2*stride
// are widened to 16-bit (punpcklbw with zeroed xmm5) and summed with
// paddusw; psrldq-shifted adds form the horizontal triad sums; kShufAc and
// kShufAc3 gather them; pmulhuw by kScaleAc33 divides by the box size
// (9 or 6).  Emits 6 bytes per iteration.
666 void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
667 ptrdiff_t src_stride,
668 uint8* dst_ptr, int dst_width) {
670 "movdqa %0,%%xmm2 \n"
671 "movdqa %1,%%xmm3 \n"
672 "movdqa %2,%%xmm4 \n"
673 "pxor %%xmm5,%%xmm5 \n"
675 : "m"(kShufAc), // %0
677 "m"(kScaleAc33) // %2
682 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
683 MEMOPREG(movdqa,0x00,0,3,1,xmm6) // movdqa (%0,%3,1),%%xmm6
684 "movhlps %%xmm0,%%xmm1 \n"
685 "movhlps %%xmm6,%%xmm7 \n"
686 "punpcklbw %%xmm5,%%xmm0 \n"
687 "punpcklbw %%xmm5,%%xmm1 \n"
688 "punpcklbw %%xmm5,%%xmm6 \n"
689 "punpcklbw %%xmm5,%%xmm7 \n"
690 "paddusw %%xmm6,%%xmm0 \n"
691 "paddusw %%xmm7,%%xmm1 \n"
692 MEMOPREG(movdqa,0x00,0,3,2,xmm6) // movdqa (%0,%3,2),%%xmm6
693 "lea " MEMLEA(0x10,0) ",%0 \n"
694 "movhlps %%xmm6,%%xmm7 \n"
695 "punpcklbw %%xmm5,%%xmm6 \n"
696 "punpcklbw %%xmm5,%%xmm7 \n"
697 "paddusw %%xmm6,%%xmm0 \n"
698 "paddusw %%xmm7,%%xmm1 \n"
699 "movdqa %%xmm0,%%xmm6 \n"
700 "psrldq $0x2,%%xmm0 \n"
701 "paddusw %%xmm0,%%xmm6 \n"
702 "psrldq $0x2,%%xmm0 \n"
703 "paddusw %%xmm0,%%xmm6 \n"
704 "pshufb %%xmm2,%%xmm6 \n"
705 "movdqa %%xmm1,%%xmm7 \n"
706 "psrldq $0x2,%%xmm1 \n"
707 "paddusw %%xmm1,%%xmm7 \n"
708 "psrldq $0x2,%%xmm1 \n"
709 "paddusw %%xmm1,%%xmm7 \n"
710 "pshufb %%xmm3,%%xmm7 \n"
711 "paddusw %%xmm7,%%xmm6 \n"
712 "pmulhuw %%xmm4,%%xmm6 \n"
713 "packuswb %%xmm6,%%xmm6 \n"
715 "movd %%xmm6," MEMACCESS(1) " \n"
716 "psrlq $0x10,%%xmm6 \n"
717 "movd %%xmm6," MEMACCESS2(0x2,1) " \n"
718 "lea " MEMLEA(0x6,1) ",%1 \n"
720 : "+r"(src_ptr), // %0
722 "+r"(dst_width) // %2
723 : "r"((intptr_t)(src_stride)) // %3
725 #if defined(__native_client__) && defined(__x86_64__)
728 #if defined(__SSE2__)
729 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
// Sums a column of src_height byte rows into one row of 16-bit sums:
// bytes are widened with punpck{l,h}bw against zeroed xmm4 and
// accumulated with paddusw (saturating).  Inner (height) loop lines and
// the tmp_height operand's declaration are elided in this excerpt.
734 void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
735 uint16* dst_ptr, int src_width, int src_height) {
737 intptr_t tmp_src = 0;
739 "pxor %%xmm4,%%xmm4 \n"
744 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
747 "movdqa %%xmm0,%%xmm1 \n"
748 "punpcklbw %%xmm4,%%xmm0 \n"
749 "punpckhbw %%xmm4,%%xmm1 \n"
756 "movdqa " MEMACCESS(0) ",%%xmm2 \n"
758 "movdqa %%xmm2,%%xmm3 \n"
759 "punpcklbw %%xmm4,%%xmm2 \n"
760 "punpckhbw %%xmm4,%%xmm3 \n"
761 "paddusw %%xmm2,%%xmm0 \n"
762 "paddusw %%xmm3,%%xmm1 \n"
768 "movdqa %%xmm0," MEMACCESS(1) " \n"
769 "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
770 "lea " MEMLEA(0x10,3) ",%0 \n"
771 "lea " MEMLEA(0x20,1) ",%1 \n"
774 : "+r"(src_ptr), // %0
776 "+r"(tmp_height), // %2
778 "+r"(src_width), // %4
779 "+rm"(src_height) // %5
780 : "rm"((intptr_t)(src_stride)) // %6
782 #if defined(__SSE2__)
783 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
788 // Bilinear column filtering. SSSE3 version.
// x/dx are 16.16 fixed point.  xmm2 holds the running x positions, xmm3
// holds 2*dx; pextrw extracts pixel indices into %k3/%k4, movzwl fetches
// each source byte pair, and the fraction (psrlw $9 -> 7-bit, xor with
// xmm6 = 0x7f words to form f and 128-f... NOTE(review): xmm4/xmm5 setup
// lines are not visible in this excerpt — confirm against full file)
// drives pmaddubsw; psrlw $7 normalizes.  Two output pixels per iteration,
// plus a single-pixel tail (the "mov %b2" store).
789 void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
790 int dst_width, int x, int dx) {
791 intptr_t x0 = 0, x1 = 0, temp_pixel = 0;
795 "movl $0x04040000,%k2 \n"
797 "pcmpeqb %%xmm6,%%xmm6 \n"
798 "psrlw $0x9,%%xmm6 \n"
799 "pextrw $0x1,%%xmm2,%k3 \n"
802 "movdqa %%xmm2,%%xmm0 \n"
803 "paddd %%xmm3,%%xmm0 \n"
804 "punpckldq %%xmm0,%%xmm2 \n"
805 "punpckldq %%xmm3,%%xmm3 \n"
806 "paddd %%xmm3,%%xmm3 \n"
807 "pextrw $0x3,%%xmm2,%k4 \n"
811 "movdqa %%xmm2,%%xmm1 \n"
812 "paddd %%xmm3,%%xmm2 \n"
813 MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2
815 "psrlw $0x9,%%xmm1 \n"
817 MEMOPARG(movzwl,0x00,1,4,1,k2) // movzwl (%1,%4,1),%k2
819 "pshufb %%xmm5,%%xmm1 \n"
820 "punpcklwd %%xmm4,%%xmm0 \n"
821 "pxor %%xmm6,%%xmm1 \n"
822 "pmaddubsw %%xmm1,%%xmm0 \n"
823 "pextrw $0x1,%%xmm2,%k3 \n"
824 "pextrw $0x3,%%xmm2,%k4 \n"
825 "psrlw $0x7,%%xmm0 \n"
826 "packuswb %%xmm0,%%xmm0 \n"
828 "mov %w2," MEMACCESS(0) " \n"
829 "lea " MEMLEA(0x2,0) ",%0 \n"
837 MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2
839 "psrlw $0x9,%%xmm2 \n"
840 "pshufb %%xmm5,%%xmm2 \n"
841 "pxor %%xmm6,%%xmm2 \n"
842 "pmaddubsw %%xmm2,%%xmm0 \n"
843 "psrlw $0x7,%%xmm0 \n"
844 "packuswb %%xmm0,%%xmm0 \n"
846 "mov %b2," MEMACCESS(0) " \n"
848 : "+r"(dst_ptr), // %0
850 "+a"(temp_pixel), // %2
853 "+rm"(dst_width) // %5
857 #if defined(__native_client__) && defined(__x86_64__)
860 #if defined(__SSE2__)
861 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
866 // Reads 16 pixels, duplicates them and writes 32 pixels (x and dx unused).
867 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
868 void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
869 int dst_width, int x, int dx) {
873 "movdqa " MEMACCESS(1) ",%%xmm0 \n"
874 "lea " MEMLEA(0x10,1) ",%1 \n"
875 "movdqa %%xmm0,%%xmm1 \n"
876 "punpcklbw %%xmm0,%%xmm0 \n"
877 "punpckhbw %%xmm1,%%xmm1 \n"
879 "movdqa %%xmm0," MEMACCESS(0) " \n"
880 "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n"
881 "lea " MEMLEA(0x20,0) ",%0 \n"
884 : "+r"(dst_ptr), // %0
886 "+r"(dst_width) // %2
889 #if defined(__SSE2__)
// 1/2 ARGB downsample by point sampling: shufps $0xdd keeps the odd
// 32-bit pixel of each pair from the two 16-byte loads.
895 void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
896 ptrdiff_t src_stride,
897 uint8* dst_argb, int dst_width) {
901 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
902 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
903 "lea " MEMLEA(0x20,0) ",%0 \n"
904 "shufps $0xdd,%%xmm1,%%xmm0 \n"
906 "movdqa %%xmm0," MEMACCESS(1) " \n"
907 "lea " MEMLEA(0x10,1) ",%1 \n"
909 : "+r"(src_argb), // %0
910 "+r"(dst_argb), // %1
911 "+r"(dst_width) // %2
914 #if defined(__SSE2__)
// 1/2 ARGB downsample averaging adjacent pixels: shufps $0x88 gathers the
// even pixels, $0xdd the odd pixels, and pavgb averages channel-wise.
920 void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
921 ptrdiff_t src_stride,
922 uint8* dst_argb, int dst_width) {
926 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
927 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
928 "lea " MEMLEA(0x20,0) ",%0 \n"
929 "movdqa %%xmm0,%%xmm2 \n"
930 "shufps $0x88,%%xmm1,%%xmm0 \n"
931 "shufps $0xdd,%%xmm1,%%xmm2 \n"
932 "pavgb %%xmm2,%%xmm0 \n"
934 "movdqa %%xmm0," MEMACCESS(1) " \n"
935 "lea " MEMLEA(0x10,1) ",%1 \n"
937 : "+r"(src_argb), // %0
938 "+r"(dst_argb), // %1
939 "+r"(dst_width) // %2
942 #if defined(__SSE2__)
// 1/2 ARGB box filter: first pavgb blends the row at src_argb with the
// row at src_argb + src_stride (%3), then even/odd pixels are gathered
// with shufps and averaged, giving a 2x2 box per output pixel.
948 void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
949 ptrdiff_t src_stride,
950 uint8* dst_argb, int dst_width) {
954 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
955 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
957 MEMOPREG(movdqa,0x00,0,3,1,xmm2) // movdqa (%0,%3,1),%%xmm2
958 MEMOPREG(movdqa,0x10,0,3,1,xmm3) // movdqa 0x10(%0,%3,1),%%xmm3
959 "lea " MEMLEA(0x20,0) ",%0 \n"
960 "pavgb %%xmm2,%%xmm0 \n"
961 "pavgb %%xmm3,%%xmm1 \n"
962 "movdqa %%xmm0,%%xmm2 \n"
963 "shufps $0x88,%%xmm1,%%xmm0 \n"
964 "shufps $0xdd,%%xmm1,%%xmm2 \n"
965 "pavgb %%xmm2,%%xmm0 \n"
967 "movdqa %%xmm0," MEMACCESS(1) " \n"
968 "lea " MEMLEA(0x10,1) ",%1 \n"
970 : "+r"(src_argb), // %0
971 "+r"(dst_argb), // %1
972 "+r"(dst_width) // %2
973 : "r"((intptr_t)(src_stride)) // %3
975 #if defined(__native_client__) && defined(__x86_64__)
978 #if defined(__SSE2__)
979 , "xmm0", "xmm1", "xmm2", "xmm3"
984 // Reads 4 pixels at a time.
985 // Alignment requirement: dst_argb 16 byte aligned.
// Gathers every src_stepx-th ARGB pixel: %1 = stepx*4 bytes, %4 = stepx*12,
// so movd loads at 0, step, 2*step and 3*step, combined with punpck.
// NOTE(review): the `int src_stepx` parameter line is referenced below but
// not visible in this excerpt — confirm the full signature.
986 void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
988 uint8* dst_argb, int dst_width) {
989 intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
990 intptr_t src_stepx_x12 = 0;
992 "lea " MEMLEA3(0x00,1,4) ",%1 \n"
993 "lea " MEMLEA4(0x00,1,1,2) ",%4 \n"
996 "movd " MEMACCESS(0) ",%%xmm0 \n"
997 MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1
998 "punpckldq %%xmm1,%%xmm0 \n"
1000 MEMOPREG(movd,0x00,0,1,2,xmm2) // movd (%0,%1,2),%%xmm2
1001 MEMOPREG(movd,0x00,0,4,1,xmm3) // movd (%0,%4,1),%%xmm3
1002 "lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
1003 "punpckldq %%xmm3,%%xmm2 \n"
1004 "punpcklqdq %%xmm2,%%xmm0 \n"
1006 "movdqa %%xmm0," MEMACCESS(2) " \n"
1007 "lea " MEMLEA(0x10,2) ",%2 \n"
1009 : "+r"(src_argb), // %0
1010 "+r"(src_stepx_x4), // %1
1011 "+r"(dst_argb), // %2
1012 "+r"(dst_width), // %3
1013 "+r"(src_stepx_x12) // %4
1016 #if defined(__native_client__) && defined(__x86_64__)
1019 #if defined(__SSE2__)
1020 , "xmm0", "xmm1", "xmm2", "xmm3"
1025 // Blends four 2x2 to 4x1.
1026 // Alignment requirement: dst_argb 16 byte aligned.
// Like ScaleARGBRowDownEven but box-filtered: %5 (row1) points at the
// second row (src + src_stride); pairs from both rows are loaded with
// movq/movhps, vertically blended with pavgb, then even/odd pixels are
// gathered with shufps and averaged.
1027 void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
1028 ptrdiff_t src_stride, int src_stepx,
1029 uint8* dst_argb, int dst_width) {
1030 intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
1031 intptr_t src_stepx_x12 = 0;
1032 intptr_t row1 = (intptr_t)(src_stride);
1034 "lea " MEMLEA3(0x00,1,4) ",%1 \n"
1035 "lea " MEMLEA4(0x00,1,1,2) ",%4 \n"
1036 "lea " MEMLEA4(0x00,0,5,1) ",%5 \n"
1040 "movq " MEMACCESS(0) ",%%xmm0 \n"
1041 MEMOPREG(movhps,0x00,0,1,1,xmm0) // movhps (%0,%1,1),%%xmm0
1042 MEMOPREG(movq,0x00,0,1,2,xmm1) // movq (%0,%1,2),%%xmm1
1044 MEMOPREG(movhps,0x00,0,4,1,xmm1) // movhps (%0,%4,1),%%xmm1
1045 "lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
1046 "movq " MEMACCESS(5) ",%%xmm2 \n"
1048 MEMOPREG(movhps,0x00,5,1,1,xmm2) // movhps (%5,%1,1),%%xmm2
1049 MEMOPREG(movq,0x00,5,1,2,xmm3) // movq (%5,%1,2),%%xmm3
1050 MEMOPREG(movhps,0x00,5,4,1,xmm3) // movhps (%5,%4,1),%%xmm3
1051 "lea " MEMLEA4(0x00,5,1,4) ",%5 \n"
1052 "pavgb %%xmm2,%%xmm0 \n"
1053 "pavgb %%xmm3,%%xmm1 \n"
1054 "movdqa %%xmm0,%%xmm2 \n"
1055 "shufps $0x88,%%xmm1,%%xmm0 \n"
1056 "shufps $0xdd,%%xmm1,%%xmm2 \n"
1057 "pavgb %%xmm2,%%xmm0 \n"
1059 "movdqa %%xmm0," MEMACCESS(2) " \n"
1060 "lea " MEMLEA(0x10,2) ",%2 \n"
1062 : "+r"(src_argb), // %0
1063 "+r"(src_stepx_x4), // %1
1064 "+r"(dst_argb), // %2
1065 "+rm"(dst_width), // %3
1066 "+r"(src_stepx_x12), // %4
1070 #if defined(__native_client__) && defined(__x86_64__)
1073 #if defined(__SSE2__)
1074 , "xmm0", "xmm1", "xmm2", "xmm3"
// Point-sampling ARGB column scaler: xmm2 accumulates four x positions
// (16.16 fixed point, stepped by 4*dx in xmm3); pextrw pulls the integer
// pixel indices into %k0/%k1 and movd gathers pixels at src + index*4.
// Main loop emits 4 pixels (movdqu), with 2- and 1-pixel tails.
// NOTE(review): initial x/dx loads and the loop/tail branch lines are not
// visible in this excerpt.
1079 void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
1080 int dst_width, int x, int dx) {
1081 intptr_t x0 = 0, x1 = 0;
1085 "pshufd $0x0,%%xmm2,%%xmm2 \n"
1086 "pshufd $0x11,%%xmm3,%%xmm0 \n"
1087 "paddd %%xmm0,%%xmm2 \n"
1088 "paddd %%xmm3,%%xmm3 \n"
1089 "pshufd $0x5,%%xmm3,%%xmm0 \n"
1090 "paddd %%xmm0,%%xmm2 \n"
1091 "paddd %%xmm3,%%xmm3 \n"
1092 "pshufd $0x0,%%xmm3,%%xmm3 \n"
1093 "pextrw $0x1,%%xmm2,%k0 \n"
1094 "pextrw $0x3,%%xmm2,%k1 \n"
1102 MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
1103 MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1
1104 "pextrw $0x5,%%xmm2,%k0 \n"
1105 "pextrw $0x7,%%xmm2,%k1 \n"
1106 "paddd %%xmm3,%%xmm2 \n"
1107 "punpckldq %%xmm1,%%xmm0 \n"
1108 MEMOPREG(movd,0x00,3,0,4,xmm1) // movd (%3,%0,4),%%xmm1
1109 MEMOPREG(movd,0x00,3,1,4,xmm4) // movd (%3,%1,4),%%xmm4
1110 "pextrw $0x1,%%xmm2,%k0 \n"
1111 "pextrw $0x3,%%xmm2,%k1 \n"
1112 "punpckldq %%xmm4,%%xmm1 \n"
1113 "punpcklqdq %%xmm1,%%xmm0 \n"
1115 "movdqu %%xmm0," MEMACCESS(2) " \n"
1116 "lea " MEMLEA(0x10,2) ",%2 \n"
1123 MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
1124 MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1
1125 "pextrw $0x5,%%xmm2,%k0 \n"
1126 "punpckldq %%xmm1,%%xmm0 \n"
1127 "movq %%xmm0," MEMACCESS(2) " \n"
1128 "lea " MEMLEA(0x8,2) ",%2 \n"
1132 MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
1133 "movd %%xmm0," MEMACCESS(2) " \n"
1137 "+r"(dst_argb), // %2
1138 "+r"(src_argb), // %3
1139 "+r"(dst_width) // %4
1143 #if defined(__native_client__) && defined(__x86_64__)
1146 #if defined(__SSE2__)
1147 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
1152 // Reads 4 pixels, duplicates them and writes 8 pixels.
1153 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
// 2x ARGB horizontal upsample: punpckldq/punpckhdq duplicate each 32-bit
// pixel.  x and dx are unused (fixed 2x ratio).
1154 void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
1155 int dst_width, int x, int dx) {
1159 "movdqa " MEMACCESS(1) ",%%xmm0 \n"
1160 "lea " MEMLEA(0x10,1) ",%1 \n"
1161 "movdqa %%xmm0,%%xmm1 \n"
1162 "punpckldq %%xmm0,%%xmm0 \n"
1163 "punpckhdq %%xmm1,%%xmm1 \n"
1165 "movdqa %%xmm0," MEMACCESS(0) " \n"
1166 "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n"
1167 "lea " MEMLEA(0x20,0) ",%0 \n"
1170 : "+r"(dst_argb), // %0
1171 "+r"(src_argb), // %1
1172 "+r"(dst_width) // %2
1175 #if defined(__native_client__) && defined(__x86_64__)
1178 #if defined(__SSE2__)
// NOTE(review): the closing `};` lines of both initializers below are not
// visible in this excerpt — confirm against the full file.
1184 // Shuffle table for arranging 2 pixels into pairs for pmaddubsw
1185 static uvec8 kShuffleColARGB = {
1186 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
1187 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
1190 // Shuffle table for duplicating 2 fractions into 8 bytes each
1191 static uvec8 kShuffleFractions = {
1192 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
1195 // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
// ARGB bilinear column filter: xmm2 carries two 16.16 x positions (stepped
// by xmm3); movq/movhps gather the two source pixel pairs; kShuffleColARGB
// (xmm4) interleaves the pair channel-wise, kShuffleFractions (xmm5)
// broadcasts the 7-bit fractions (psrlw $9; xor with xmm6 = 0x7f words
// forms f / 128-f) for pmaddubsw; psrlw $7 normalizes.  Emits 2 pixels per
// iteration plus a 1-pixel tail.
1196 void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
1197 int dst_width, int x, int dx) {
1198 intptr_t x0 = 0, x1 = 0;
1200 "movdqa %0,%%xmm4 \n"
1201 "movdqa %1,%%xmm5 \n"
1203 : "m"(kShuffleColARGB), // %0
1204 "m"(kShuffleFractions) // %1
1210 "pcmpeqb %%xmm6,%%xmm6 \n"
1211 "psrlw $0x9,%%xmm6 \n"
1212 "pextrw $0x1,%%xmm2,%k3 \n"
1215 "movdqa %%xmm2,%%xmm0 \n"
1216 "paddd %%xmm3,%%xmm0 \n"
1217 "punpckldq %%xmm0,%%xmm2 \n"
1218 "punpckldq %%xmm3,%%xmm3 \n"
1219 "paddd %%xmm3,%%xmm3 \n"
1220 "pextrw $0x3,%%xmm2,%k4 \n"
1224 "movdqa %%xmm2,%%xmm1 \n"
1225 "paddd %%xmm3,%%xmm2 \n"
1226 MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0
1227 "psrlw $0x9,%%xmm1 \n"
1229 MEMOPREG(movhps,0x00,1,4,4,xmm0) // movhps (%1,%4,4),%%xmm0
1230 "pshufb %%xmm5,%%xmm1 \n"
1231 "pshufb %%xmm4,%%xmm0 \n"
1232 "pxor %%xmm6,%%xmm1 \n"
1233 "pmaddubsw %%xmm1,%%xmm0 \n"
1234 "psrlw $0x7,%%xmm0 \n"
1235 "pextrw $0x1,%%xmm2,%k3 \n"
1236 "pextrw $0x3,%%xmm2,%k4 \n"
1237 "packuswb %%xmm0,%%xmm0 \n"
1238 "movq %%xmm0," MEMACCESS(0) " \n"
1239 "lea " MEMLEA(0x8,0) ",%0 \n"
1247 "psrlw $0x9,%%xmm2 \n"
1249 MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0
1250 "pshufb %%xmm5,%%xmm2 \n"
1251 "pshufb %%xmm4,%%xmm0 \n"
1252 "pxor %%xmm6,%%xmm2 \n"
1253 "pmaddubsw %%xmm2,%%xmm0 \n"
1254 "psrlw $0x7,%%xmm0 \n"
1255 "packuswb %%xmm0,%%xmm0 \n"
1256 "movd %%xmm0," MEMACCESS(0) " \n"
1260 : "+r"(dst_argb), // %0
1261 "+r"(src_argb), // %1
1262 "+rm"(dst_width), // %2
1268 #if defined(__native_client__) && defined(__x86_64__)
1271 #if defined(__SSE2__)
1272 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
1277 // Divide num by div and return as 16.16 fixed point result.
// shld/shl form the 48-bit value num<<16 across edx:eax.
// NOTE(review): the asm open/close and the idiv instruction line are not
// visible in this excerpt — confirm against the full file.
1278 int FixedDiv_X86(int num, int div) {
1281 "shld $0x10,%%eax,%%edx \n"
1282 "shl $0x10,%%eax \n"
1287 : "memory", "cc", "edx"
1292 // Divide num - 1 by div - 1 and return as 16.16 fixed point result.
// sub $0x10001 adjusts the shifted value for the (num - 1) numerator.
// NOTE(review): the asm open/close and the idiv instruction line are not
// visible in this excerpt — confirm against the full file.
1293 int FixedDiv1_X86(int num, int div) {
1296 "shld $0x10,%%eax,%%edx \n"
1297 "shl $0x10,%%eax \n"
1298 "sub $0x10001,%%eax \n"
1305 : "memory", "cc", "edx"
1310 #endif // defined(__x86_64__) || defined(__i386__)
1314 } // namespace libyuv