/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))

// Offsets for source bytes 0 to 9
static uvec8 kShuf0 =
  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static uvec8 kShuf1 =
  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf2 =
  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 0 to 10
static uvec8 kShuf01 =
  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };

// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
static uvec8 kShuf11 =
  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf21 =
  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };

// Coefficients for source bytes 0 to 10
static uvec8 kMadd01 =
  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };

// Coefficients for source bytes 10 to 21
static uvec8 kMadd11 =
  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };

// Coefficients for source bytes 21 to 31
static uvec8 kMadd21 =
  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };

// Rounding constant for the 3/4 box filters (added before the >> 2).
static vec16 kRound34 =
  { 2, 2, 2, 2, 2, 2, 2, 2 };

static uvec8 kShuf38a =
  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

static uvec8 kShuf38b =
  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 0,1,2
static uvec8 kShufAc =
  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 3,4,5
static uvec8 kShufAc3 =
  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };

// Scaling values for boxes of 3x3 and 2x3
static uvec16 kScaleAc33 =
  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };

// Arrange first value for pixels 0,1,2,3,4,5
static uvec8 kShufAb0 =
  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };

// Arrange second value for pixels 0,1,2,3,4,5
static uvec8 kShufAb1 =
  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };

// Arrange third value for pixels 0,1,2,3,4,5
static uvec8 kShufAb2 =
  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };

// Scaling values for boxes of 3x2 and 2x2
static uvec16 kScaleAb2 =
  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };

// GCC versions of row functions are verbatim conversions from Visual C.
// Generated using gcc disassembly on Visual C object file:
// objdump -D yuvscaler.obj >yuvscaler.txt

void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
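
// Illustrative C sketch added in this edit (ScaleRowDown2_C_Sketch is not an
// upstream function): the loop above keeps the odd byte of each source pair.
// psrlw $0x8 moves it into the low byte of each word and packuswb repacks,
// 16 output pixels per iteration; one pixel at a time this is:
static void ScaleRowDown2_C_Sketch(const uint8* src_ptr, uint8* dst_ptr,
                                   int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[x * 2 + 1];  // point-sample the odd source pixel.
  }
}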

void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                              uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"

    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "movdqa    %%xmm1,%%xmm3                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "pand      %%xmm5,%%xmm2                   \n"
    "pand      %%xmm5,%%xmm3                   \n"
    "pavgw     %%xmm2,%%xmm0                   \n"
    "pavgw     %%xmm3,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}

void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"

    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqa,0x00,0,3,1,xmm2)           //  movdqa  (%0,%3,1),%%xmm2
    BUNDLEALIGN
    MEMOPREG(movdqa,0x10,0,3,1,xmm3)           //  movdqa  0x10(%0,%3,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "movdqa    %%xmm1,%%xmm3                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "pand      %%xmm5,%%xmm2                   \n"
    "pand      %%xmm5,%%xmm3                   \n"
    "pavgw     %%xmm2,%%xmm0                   \n"
    "pavgw     %%xmm3,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))   // %3
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
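
// Illustrative C sketch added in this edit (not an upstream function): a 2x2
// box filter. The pavgb/pavgw cascade above computes a rounded average of
// averages, which can differ from this exact rounded sum in the last bit:
static void ScaleRowDown2Box_C_Sketch(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8* dst_ptr, int dst_width) {
  const uint8* s = src_ptr;               // top row of each 2x2 box.
  const uint8* t = src_ptr + src_stride;  // bottom row.
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);
    s += 2;
    t += 2;
  }
}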

void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                                  uint8* dst_ptr, int dst_width) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}

void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
                                        ptrdiff_t src_stride,
                                        uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "movdqa    %%xmm1,%%xmm3                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "pand      %%xmm5,%%xmm2                   \n"
    "pand      %%xmm5,%%xmm3                   \n"
    "pavgw     %%xmm2,%%xmm0                   \n"
    "pavgw     %%xmm3,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}

void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
                                     ptrdiff_t src_stride,
                                     uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu  (%0,%3,1),%%xmm2
    BUNDLEALIGN
    MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu  0x10(%0,%3,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "movdqa    %%xmm1,%%xmm3                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "pand      %%xmm5,%%xmm2                   \n"
    "pand      %%xmm5,%%xmm3                   \n"
    "pavgw     %%xmm2,%%xmm0                   \n"
    "pavgw     %%xmm3,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))   // %3
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}

void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrld     $0x18,%%xmm5                    \n"
    "pslld     $0x10,%%xmm5                    \n"

    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
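
// Illustrative C sketch added in this edit (not an upstream function): the
// mask built in xmm5 is 0x00FF0000 per dword, so the asm keeps byte 2 of
// every group of 4 source bytes:
static void ScaleRowDown4_C_Sketch(const uint8* src_ptr, uint8* dst_ptr,
                                   int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[x * 4 + 2];  // point-sample the third pixel of 4.
  }
}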

void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  intptr_t stridex3 = 0;
  asm volatile (
    "pcmpeqb   %%xmm7,%%xmm7                   \n"
    "psrlw     $0x8,%%xmm7                     \n"
    "lea       " MEMLEA4(0x00,4,4,2) ",%3      \n"

    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqa,0x00,0,4,1,xmm2)           //  movdqa  (%0,%4,1),%%xmm2
    BUNDLEALIGN
    MEMOPREG(movdqa,0x10,0,4,1,xmm3)           //  movdqa  0x10(%0,%4,1),%%xmm3
    "pavgb     %%xmm2,%%xmm0                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    MEMOPREG(movdqa,0x00,0,4,2,xmm2)           //  movdqa  (%0,%4,2),%%xmm2
    BUNDLEALIGN
    MEMOPREG(movdqa,0x10,0,4,2,xmm3)           //  movdqa  0x10(%0,%4,2),%%xmm3
    MEMOPREG(movdqa,0x00,0,3,1,xmm4)           //  movdqa  (%0,%3,1),%%xmm4
    MEMOPREG(movdqa,0x10,0,3,1,xmm5)           //  movdqa  0x10(%0,%3,1),%%xmm5
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm4,%%xmm2                   \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "pavgb     %%xmm5,%%xmm3                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "movdqa    %%xmm1,%%xmm3                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "pand      %%xmm7,%%xmm2                   \n"
    "pand      %%xmm7,%%xmm3                   \n"
    "pavgw     %%xmm2,%%xmm0                   \n"
    "pavgw     %%xmm3,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "pand      %%xmm7,%%xmm2                   \n"
    "pavgw     %%xmm2,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(dst_width),   // %2
    "+r"(stridex3)     // %3
  : "r"((intptr_t)(src_stride))    // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7"
#endif
  );
}
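
// Illustrative C sketch added in this edit (not an upstream function): a
// full-precision 4x4 box average. The SSE2 code above approximates it with
// cascaded pavg rounding, which may differ from the exact sum in the last
// bit:
static void ScaleRowDown4Box_C_Sketch(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8* dst_ptr, int dst_width) {
  int x, i, j;
  for (x = 0; x < dst_width; ++x) {
    int sum = 0;
    for (i = 0; i < 4; ++i) {    // 4 rows...
      for (j = 0; j < 4; ++j) {  // ...of 4 pixels each.
        sum += src_ptr[x * 4 + i * src_stride + j];
      }
    }
    dst_ptr[x] = (uint8)((sum + 8) >> 4);  // round to nearest.
  }
}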

void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa    %0,%%xmm3                       \n"
    "movdqa    %1,%%xmm4                       \n"
    "movdqa    %2,%%xmm5                       \n"
  :
  : "m"(kShuf0),  // %0
    "m"(kShuf1),  // %1
    "m"(kShuf2)   // %2
  );
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm2   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "movdqa    %%xmm2,%%xmm1                   \n"
    "palignr   $0x8,%%xmm0,%%xmm1              \n"
    "pshufb    %%xmm3,%%xmm0                   \n"
    "pshufb    %%xmm4,%%xmm1                   \n"
    "pshufb    %%xmm5,%%xmm2                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "movq      %%xmm1," MEMACCESS2(0x8,1) "    \n"
    "movq      %%xmm2," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x18,1) ",%1           \n"
    "sub       $0x18,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa    %0,%%xmm2                       \n"  // kShuf01
    "movdqa    %1,%%xmm3                       \n"  // kShuf11
    "movdqa    %2,%%xmm4                       \n"  // kShuf21
  :
  : "m"(kShuf01),  // %0
    "m"(kShuf11),  // %1
    "m"(kShuf21)   // %2
  );
  asm volatile (
    "movdqa    %0,%%xmm5                       \n"  // kMadd01
    "movdqa    %1,%%xmm0                       \n"  // kMadd11
    "movdqa    %2,%%xmm1                       \n"  // kRound34
  :
  : "m"(kMadd01),  // %0
    "m"(kMadd11),  // %1
    "m"(kRound34)  // %2
  );
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm6         \n"
    MEMOPREG(movdqa,0x00,0,3,1,xmm7)           //  movdqa  (%0,%3),%%xmm7
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm5,%%xmm6                   \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS(1) "         \n"
    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm6    \n"
    MEMOPREG(movdqu,0x8,0,3,1,xmm7)            //  movdqu  0x8(%0,%3),%%xmm7
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm3,%%xmm6                   \n"
    "pmaddubsw %%xmm0,%%xmm6                   \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
    BUNDLEALIGN
    MEMOPREG(movdqa,0x10,0,3,1,xmm7)           //  movdqa  0x10(%0,%3),%%xmm7
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm4,%%xmm6                   \n"
    "pmaddubsw %4,%%xmm6                       \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x18,1) ",%1           \n"
    "sub       $0x18,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride)),  // %3
    "m"(kMadd21)     // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}

void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa    %0,%%xmm2                       \n"  // kShuf01
    "movdqa    %1,%%xmm3                       \n"  // kShuf11
    "movdqa    %2,%%xmm4                       \n"  // kShuf21
  :
  : "m"(kShuf01),  // %0
    "m"(kShuf11),  // %1
    "m"(kShuf21)   // %2
  );
  asm volatile (
    "movdqa    %0,%%xmm5                       \n"  // kMadd01
    "movdqa    %1,%%xmm0                       \n"  // kMadd11
    "movdqa    %2,%%xmm1                       \n"  // kRound34
  :
  : "m"(kMadd01),  // %0
    "m"(kMadd11),  // %1
    "m"(kRound34)  // %2
  );

  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm6         \n"
    MEMOPREG(movdqa,0x00,0,3,1,xmm7)           //  movdqa  (%0,%3,1),%%xmm7
    "pavgb     %%xmm6,%%xmm7                   \n"
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm5,%%xmm6                   \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS(1) "         \n"
    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm6    \n"
    MEMOPREG(movdqu,0x8,0,3,1,xmm7)            //  movdqu  0x8(%0,%3,1),%%xmm7
    "pavgb     %%xmm6,%%xmm7                   \n"
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm3,%%xmm6                   \n"
    "pmaddubsw %%xmm0,%%xmm6                   \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
    MEMOPREG(movdqa,0x10,0,3,1,xmm7)           //  movdqa  0x10(%0,%3,1),%%xmm7
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm6,%%xmm7                   \n"
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm4,%%xmm6                   \n"
    "pmaddubsw %4,%%xmm6                       \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x18,1) ",%1           \n"
    "sub       $0x18,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride)),  // %3
    "m"(kMadd21)     // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}

void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa    %3,%%xmm4                       \n"
    "movdqa    %4,%%xmm5                       \n"

    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pshufb    %%xmm4,%%xmm0                   \n"
    "pshufb    %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "movhlps   %%xmm0,%%xmm1                   \n"
    "movd      %%xmm1," MEMACCESS2(0x8,1) "    \n"
    "lea       " MEMLEA(0xc,1) ",%1            \n"
    "sub       $0xc,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "m"(kShuf38a),   // %3
    "m"(kShuf38b)    // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm4", "xmm5"
#endif
  );
}
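
// Illustrative C sketch added in this edit (not an upstream function): the
// kShuf38a/kShuf38b shuffles plus paddusb implement 3/8 point sampling,
// keeping source bytes 0, 3 and 6 out of every 8:
static void ScaleRowDown38_C_Sketch(const uint8* src_ptr, uint8* dst_ptr,
                                    int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 3) {
    dst_ptr[0] = src_ptr[0];
    dst_ptr[1] = src_ptr[3];
    dst_ptr[2] = src_ptr[6];
    dst_ptr += 3;
    src_ptr += 8;
  }
}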

void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa    %0,%%xmm2                       \n"
    "movdqa    %1,%%xmm3                       \n"
    "movdqa    %2,%%xmm4                       \n"
    "movdqa    %3,%%xmm5                       \n"
  :
  : "m"(kShufAb0),   // %0
    "m"(kShufAb1),   // %1
    "m"(kShufAb2),   // %2
    "m"(kScaleAb2)   // %3
  );
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(pavgb,0x00,0,3,1,xmm0)            //  pavgb   (%0,%3,1),%%xmm0
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pshufb    %%xmm2,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm6                   \n"
    "pshufb    %%xmm3,%%xmm6                   \n"
    "paddusw   %%xmm6,%%xmm1                   \n"
    "pshufb    %%xmm4,%%xmm0                   \n"
    "paddusw   %%xmm0,%%xmm1                   \n"
    "pmulhuw   %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "sub       $0x6,%2                         \n"
    "movd      %%xmm1," MEMACCESS(1) "         \n"
    "psrlq     $0x10,%%xmm1                    \n"
    "movd      %%xmm1," MEMACCESS2(0x2,1) "    \n"
    "lea       " MEMLEA(0x6,1) ",%1            \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(dst_width)    // %2
  : "r"((intptr_t)(src_stride))  // %3
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}

void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa    %0,%%xmm2                       \n"
    "movdqa    %1,%%xmm3                       \n"
    "movdqa    %2,%%xmm4                       \n"
    "pxor      %%xmm5,%%xmm5                   \n"
  :
  : "m"(kShufAc),    // %0
    "m"(kShufAc3),   // %1
    "m"(kScaleAc33)  // %2
  );
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movdqa,0x00,0,3,1,xmm6)           //  movdqa  (%0,%3,1),%%xmm6
    "movhlps   %%xmm0,%%xmm1                   \n"
    "movhlps   %%xmm6,%%xmm7                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm6                   \n"
    "punpcklbw %%xmm5,%%xmm7                   \n"
    "paddusw   %%xmm6,%%xmm0                   \n"
    "paddusw   %%xmm7,%%xmm1                   \n"
    MEMOPREG(movdqa,0x00,0,3,2,xmm6)           //  movdqa  (%0,%3,2),%%xmm6
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movhlps   %%xmm6,%%xmm7                   \n"
    "punpcklbw %%xmm5,%%xmm6                   \n"
    "punpcklbw %%xmm5,%%xmm7                   \n"
    "paddusw   %%xmm6,%%xmm0                   \n"
    "paddusw   %%xmm7,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm6                   \n"
    "psrldq    $0x2,%%xmm0                     \n"
    "paddusw   %%xmm0,%%xmm6                   \n"
    "psrldq    $0x2,%%xmm0                     \n"
    "paddusw   %%xmm0,%%xmm6                   \n"
    "pshufb    %%xmm2,%%xmm6                   \n"
    "movdqa    %%xmm1,%%xmm7                   \n"
    "psrldq    $0x2,%%xmm1                     \n"
    "paddusw   %%xmm1,%%xmm7                   \n"
    "psrldq    $0x2,%%xmm1                     \n"
    "paddusw   %%xmm1,%%xmm7                   \n"
    "pshufb    %%xmm3,%%xmm7                   \n"
    "paddusw   %%xmm7,%%xmm6                   \n"
    "pmulhuw   %%xmm4,%%xmm6                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "sub       $0x6,%2                         \n"
    "movd      %%xmm6," MEMACCESS(1) "         \n"
    "psrlq     $0x10,%%xmm6                    \n"
    "movd      %%xmm6," MEMACCESS2(0x2,1) "    \n"
    "lea       " MEMLEA(0x6,1) ",%1            \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))   // %3
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}

void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                       uint16* dst_ptr, int src_width, int src_height) {
  int tmp_height = 0;
  intptr_t tmp_src = 0;
  asm volatile (
    "pxor      %%xmm4,%%xmm4                   \n"
    "sub       $0x1,%5                         \n"

    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "mov       %0,%3                           \n"
    "add       %6,%0                           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm4,%%xmm0                   \n"
    "punpckhbw %%xmm4,%%xmm1                   \n"
    "mov       %5,%2                           \n"
    "test      %2,%2                           \n"
    "je        3f                              \n"

    LABELALIGN
  "2:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm2         \n"
    "add       %6,%0                           \n"
    "movdqa    %%xmm2,%%xmm3                   \n"
    "punpcklbw %%xmm4,%%xmm2                   \n"
    "punpckhbw %%xmm4,%%xmm3                   \n"
    "paddusw   %%xmm2,%%xmm0                   \n"
    "paddusw   %%xmm3,%%xmm1                   \n"
    "sub       $0x1,%2                         \n"
    "jg        2b                              \n"

    LABELALIGN
  "3:                                          \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x10,3) ",%0           \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x10,%4                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(tmp_height),  // %2
    "+r"(tmp_src),     // %3
    "+r"(src_width),   // %4
    "+rm"(src_height)  // %5
  : "rm"((intptr_t)(src_stride))  // %6
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
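
// Illustrative C sketch added in this edit (not an upstream function): sums
// a column of src_height bytes into each uint16 output. The asm accumulates
// with paddusw, so tall columns saturate at 65535 instead of wrapping:
static void ScaleAddRows_C_Sketch(const uint8* src_ptr, ptrdiff_t src_stride,
                                  uint16* dst_ptr, int src_width,
                                  int src_height) {
  int x, y;
  for (x = 0; x < src_width; ++x) {
    const uint8* s = src_ptr + x;
    unsigned int sum = 0u;
    for (y = 0; y < src_height; ++y) {
      sum += s[0];
      s += src_stride;
    }
    dst_ptr[x] = (uint16)(sum < 65535u ? sum : 65535u);  // saturate.
  }
}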

// Bilinear column filtering. SSSE3 version.
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                           int dst_width, int x, int dx) {
  intptr_t x0 = 0, x1 = 0, temp_pixel = 0;
  asm volatile (
    "movd      %6,%%xmm2                       \n"
    "movd      %7,%%xmm3                       \n"
    "movl      $0x04040000,%k2                 \n"
    "movd      %k2,%%xmm5                      \n"
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "psrlw     $0x9,%%xmm6                     \n"
    "pextrw    $0x1,%%xmm2,%k3                 \n"
    "subl      $0x2,%5                         \n"
    "jl        29f                             \n"
    "movdqa    %%xmm2,%%xmm0                   \n"
    "paddd     %%xmm3,%%xmm0                   \n"
    "punpckldq %%xmm0,%%xmm2                   \n"
    "punpckldq %%xmm3,%%xmm3                   \n"
    "paddd     %%xmm3,%%xmm3                   \n"
    "pextrw    $0x3,%%xmm2,%k4                 \n"

    LABELALIGN
  "2:                                          \n"
    "movdqa    %%xmm2,%%xmm1                   \n"
    "paddd     %%xmm3,%%xmm2                   \n"
    MEMOPARG(movzwl,0x00,1,3,1,k2)             //  movzwl  (%1,%3,1),%k2
    "movd      %k2,%%xmm0                      \n"
    "psrlw     $0x9,%%xmm1                     \n"
    BUNDLEALIGN
    MEMOPARG(movzwl,0x00,1,4,1,k2)             //  movzwl  (%1,%4,1),%k2
    "movd      %k2,%%xmm4                      \n"
    "pshufb    %%xmm5,%%xmm1                   \n"
    "punpcklwd %%xmm4,%%xmm0                   \n"
    "pxor      %%xmm6,%%xmm1                   \n"
    "pmaddubsw %%xmm1,%%xmm0                   \n"
    "pextrw    $0x1,%%xmm2,%k3                 \n"
    "pextrw    $0x3,%%xmm2,%k4                 \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movd      %%xmm0,%k2                      \n"
    "mov       %w2," MEMACCESS(0) "            \n"
    "lea       " MEMLEA(0x2,0) ",%0            \n"
    "sub       $0x2,%5                         \n"
    "jge       2b                              \n"

    LABELALIGN
  "29:                                         \n"
    "addl      $0x1,%5                         \n"
    "jl        99f                             \n"
    MEMOPARG(movzwl,0x00,1,3,1,k2)             //  movzwl  (%1,%3,1),%k2
    "movd      %k2,%%xmm0                      \n"
    "psrlw     $0x9,%%xmm2                     \n"
    "pshufb    %%xmm5,%%xmm2                   \n"
    "pxor      %%xmm6,%%xmm2                   \n"
    "pmaddubsw %%xmm2,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movd      %%xmm0,%k2                      \n"
    "mov       %b2," MEMACCESS(0) "            \n"
  "99:                                         \n"
  : "+r"(dst_ptr),     // %0
    "+r"(src_ptr),     // %1
    "+a"(temp_pixel),  // %2
    "+r"(x0),          // %3
    "+r"(x1),          // %4
    "+rm"(dst_width)   // %5
  : "rm"(x),           // %6
    "rm"(dx)           // %7
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
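
// Illustrative C sketch added in this edit (not an upstream function):
// bilinear blend of two neighboring pixels driven by the 16.16 fixed-point
// position x and step dx. The SSSE3 loop above does the same blend two
// pixels at a time with a 7-bit fraction:
static void ScaleFilterCols_C_Sketch(uint8* dst_ptr, const uint8* src_ptr,
                                     int dst_width, int x, int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;       // integer source position.
    int frac = x & 0xffff;  // fractional part in 16.16.
    int a = src_ptr[xi];
    int b = src_ptr[xi + 1];
    dst_ptr[j] = (uint8)((a * (65536 - frac) + b * frac) >> 16);
    x += dx;
  }
}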

// Reads 16 pixels, duplicates them and writes 32 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                       int dst_width, int x, int dx) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "punpckhbw %%xmm1,%%xmm1                   \n"
    "sub       $0x20,%2                        \n"
    "movdqa    %%xmm0," MEMACCESS(0) "         \n"
    "movdqa    %%xmm1," MEMACCESS2(0x10,0) "   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "jg        1b                              \n"

  : "+r"(dst_ptr),     // %0
    "+r"(src_ptr),     // %1
    "+r"(dst_width)    // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
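
// Illustrative C sketch added in this edit (not an upstream function): 2x
// horizontal upsample by duplication, which punpcklbw/punpckhbw perform 16
// source pixels at a time:
static void ScaleColsUp2_C_Sketch(uint8* dst_ptr, const uint8* src_ptr,
                                  int dst_width) {
  int j;
  for (j = 0; j < dst_width - 1; j += 2) {
    dst_ptr[j] = dst_ptr[j + 1] = src_ptr[j >> 1];  // write each pixel twice.
  }
}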

void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                            ptrdiff_t src_stride,
                            uint8* dst_argb, int dst_width) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "shufps    $0xdd,%%xmm1,%%xmm0             \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(dst_width)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}

void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8* dst_argb, int dst_width) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm2             \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(dst_width)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}

void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               uint8* dst_argb, int dst_width) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    BUNDLEALIGN
    MEMOPREG(movdqa,0x00,0,3,1,xmm2)           //  movdqa   (%0,%3,1),%%xmm2
    MEMOPREG(movdqa,0x10,0,3,1,xmm3)           //  movdqa   0x10(%0,%3,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm2             \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))   // %3
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
#endif
  );
}
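
// Illustrative C sketch added in this edit (not an upstream function): a
// per-channel 2x2 box average of ARGB pixels. The shufps pair above
// separates even and odd pixels so pavgb can average them channel-wise:
static void ScaleARGBRowDown2Box_C_Sketch(const uint8* src_argb,
                                          ptrdiff_t src_stride,
                                          uint8* dst_argb, int dst_width) {
  int x, c;
  for (x = 0; x < dst_width; ++x) {
    for (c = 0; c < 4; ++c) {  // blend B, G, R, A independently.
      dst_argb[x * 4 + c] =
          (uint8)((src_argb[x * 8 + c] + src_argb[x * 8 + c + 4] +
                   src_argb[x * 8 + src_stride + c] +
                   src_argb[x * 8 + src_stride + c + 4] + 2) >> 2);
    }
  }
}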

// Reads 4 pixels at a time.
// Alignment requirement: dst_argb 16 byte aligned.
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
                               int src_stepx,
                               uint8* dst_argb, int dst_width) {
  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
  intptr_t src_stepx_x12 = 0;
  asm volatile (
    "lea       " MEMLEA3(0x00,1,4) ",%1        \n"
    "lea       " MEMLEA4(0x00,1,1,2) ",%4      \n"
    LABELALIGN
  "1:                                          \n"
    "movd      " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movd,0x00,0,1,1,xmm1)             //  movd      (%0,%1,1),%%xmm1
    "punpckldq %%xmm1,%%xmm0                   \n"
    BUNDLEALIGN
    MEMOPREG(movd,0x00,0,1,2,xmm2)             //  movd      (%0,%1,2),%%xmm2
    MEMOPREG(movd,0x00,0,4,1,xmm3)             //  movd      (%0,%4,1),%%xmm3
    "lea       " MEMLEA4(0x00,0,1,4) ",%0      \n"
    "punpckldq %%xmm3,%%xmm2                   \n"
    "punpcklqdq %%xmm2,%%xmm0                  \n"
    "sub       $0x4,%3                         \n"
    "movdqa    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),      // %0
    "+r"(src_stepx_x4),  // %1
    "+r"(dst_argb),      // %2
    "+r"(dst_width),     // %3
    "+r"(src_stepx_x12)  // %4
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
#endif
  );
}
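
// Illustrative C sketch added in this edit (not an upstream function):
// copies every src_stepx-th ARGB pixel; the four movd loads above gather 4
// such pixels per iteration:
static void ScaleARGBRowDownEven_C_Sketch(const uint8* src_argb,
                                          int src_stepx,
                                          uint8* dst_argb, int dst_width) {
  const uint32* src = (const uint32*)(src_argb);  // 1 uint32 per ARGB pixel.
  uint32* dst = (uint32*)(dst_argb);
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = src[x * src_stepx];
  }
}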

// Blends four 2x2 to 4x1.
// Alignment requirement: dst_argb 16 byte aligned.
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride, int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
  intptr_t src_stepx_x12 = 0;
  intptr_t row1 = (intptr_t)(src_stride);
  asm volatile (
    "lea       " MEMLEA3(0x00,1,4) ",%1        \n"
    "lea       " MEMLEA4(0x00,1,1,2) ",%4      \n"
    "lea       " MEMLEA4(0x00,0,5,1) ",%5      \n"

    LABELALIGN
  "1:                                          \n"
    "movq      " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movhps,0x00,0,1,1,xmm0)           //  movhps    (%0,%1,1),%%xmm0
    MEMOPREG(movq,0x00,0,1,2,xmm1)             //  movq      (%0,%1,2),%%xmm1
    BUNDLEALIGN
    MEMOPREG(movhps,0x00,0,4,1,xmm1)           //  movhps    (%0,%4,1),%%xmm1
    "lea       " MEMLEA4(0x00,0,1,4) ",%0      \n"
    "movq      " MEMACCESS(5) ",%%xmm2         \n"
    BUNDLEALIGN
    MEMOPREG(movhps,0x00,5,1,1,xmm2)           //  movhps    (%5,%1,1),%%xmm2
    MEMOPREG(movq,0x00,5,1,2,xmm3)             //  movq      (%5,%1,2),%%xmm3
    MEMOPREG(movhps,0x00,5,4,1,xmm3)           //  movhps    (%5,%4,1),%%xmm3
    "lea       " MEMLEA4(0x00,5,1,4) ",%5      \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm2             \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "sub       $0x4,%3                         \n"
    "movdqa    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),       // %0
    "+r"(src_stepx_x4),   // %1
    "+r"(dst_argb),       // %2
    "+rm"(dst_width),     // %3
    "+r"(src_stepx_x12),  // %4
    "+r"(row1)            // %5
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
#endif
  );
}

void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
                        int dst_width, int x, int dx) {
  intptr_t x0 = 0, x1 = 0;
  asm volatile (
    "movd      %5,%%xmm2                       \n"
    "movd      %6,%%xmm3                       \n"
    "pshufd    $0x0,%%xmm2,%%xmm2              \n"
    "pshufd    $0x11,%%xmm3,%%xmm0             \n"
    "paddd     %%xmm0,%%xmm2                   \n"
    "paddd     %%xmm3,%%xmm3                   \n"
    "pshufd    $0x5,%%xmm3,%%xmm0              \n"
    "paddd     %%xmm0,%%xmm2                   \n"
    "paddd     %%xmm3,%%xmm3                   \n"
    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
    "pextrw    $0x1,%%xmm2,%k0                 \n"
    "pextrw    $0x3,%%xmm2,%k1                 \n"
    "cmp       $0x0,%4                         \n"
    "jl        99f                             \n"
    "sub       $0x4,%4                         \n"
    "jl        49f                             \n"

    LABELALIGN
  "40:                                         \n"
    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
    MEMOPREG(movd,0x00,3,1,4,xmm1)             //  movd      (%3,%1,4),%%xmm1
    "pextrw    $0x5,%%xmm2,%k0                 \n"
    "pextrw    $0x7,%%xmm2,%k1                 \n"
    "paddd     %%xmm3,%%xmm2                   \n"
    "punpckldq %%xmm1,%%xmm0                   \n"
    MEMOPREG(movd,0x00,3,0,4,xmm1)             //  movd      (%3,%0,4),%%xmm1
    MEMOPREG(movd,0x00,3,1,4,xmm4)             //  movd      (%3,%1,4),%%xmm4
    "pextrw    $0x1,%%xmm2,%k0                 \n"
    "pextrw    $0x3,%%xmm2,%k1                 \n"
    "punpckldq %%xmm4,%%xmm1                   \n"
    "punpcklqdq %%xmm1,%%xmm0                  \n"
    "sub       $0x4,%4                         \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "jge       40b                             \n"

  "49:                                         \n"
    "test      $0x2,%4                         \n"
    "je        29f                             \n"
    BUNDLEALIGN
    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
    MEMOPREG(movd,0x00,3,1,4,xmm1)             //  movd      (%3,%1,4),%%xmm1
    "pextrw    $0x5,%%xmm2,%k0                 \n"
    "punpckldq %%xmm1,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x8,2) ",%2            \n"
  "29:                                         \n"
    "test      $0x1,%4                         \n"
    "je        99f                             \n"
    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
    "movd      %%xmm0," MEMACCESS(2) "         \n"
  "99:                                         \n"
  : "+a"(x0),          // %0
    "+d"(x1),          // %1
    "+r"(dst_argb),    // %2
    "+r"(src_argb),    // %3
    "+r"(dst_width)    // %4
  : "rm"(x),           // %5
    "rm"(dx)           // %6
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}

// Reads 4 pixels, duplicates them and writes 8 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
                           int dst_width, int x, int dx) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpckldq %%xmm0,%%xmm0                   \n"
    "punpckhdq %%xmm1,%%xmm1                   \n"
    "sub       $0x8,%2                         \n"
    "movdqa    %%xmm0," MEMACCESS(0) "         \n"
    "movdqa    %%xmm1," MEMACCESS2(0x10,0) "   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "jg        1b                              \n"

  : "+r"(dst_argb),    // %0
    "+r"(src_argb),    // %1
    "+r"(dst_width)    // %2
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}

// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static uvec8 kShuffleColARGB = {
  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
static uvec8 kShuffleFractions = {
  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};

// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
                               int dst_width, int x, int dx) {
  intptr_t x0 = 0, x1 = 0;
  asm volatile (
    "movdqa    %0,%%xmm4                       \n"
    "movdqa    %1,%%xmm5                       \n"
  :
  : "m"(kShuffleColARGB),  // %0
    "m"(kShuffleFractions)  // %1
  );

  asm volatile (
    "movd      %5,%%xmm2                       \n"
    "movd      %6,%%xmm3                       \n"
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "psrlw     $0x9,%%xmm6                     \n"
    "pextrw    $0x1,%%xmm2,%k3                 \n"
    "sub       $0x2,%2                         \n"
    "jl        29f                             \n"
    "movdqa    %%xmm2,%%xmm0                   \n"
    "paddd     %%xmm3,%%xmm0                   \n"
    "punpckldq %%xmm0,%%xmm2                   \n"
    "punpckldq %%xmm3,%%xmm3                   \n"
    "paddd     %%xmm3,%%xmm3                   \n"
    "pextrw    $0x3,%%xmm2,%k4                 \n"

    LABELALIGN
  "2:                                          \n"
    "movdqa    %%xmm2,%%xmm1                   \n"
    "paddd     %%xmm3,%%xmm2                   \n"
    MEMOPREG(movq,0x00,1,3,4,xmm0)             //  movq      (%1,%3,4),%%xmm0
    "psrlw     $0x9,%%xmm1                     \n"
    BUNDLEALIGN
    MEMOPREG(movhps,0x00,1,4,4,xmm0)           //  movhps    (%1,%4,4),%%xmm0
    "pshufb    %%xmm5,%%xmm1                   \n"
    "pshufb    %%xmm4,%%xmm0                   \n"
    "pxor      %%xmm6,%%xmm1                   \n"
    "pmaddubsw %%xmm1,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "pextrw    $0x1,%%xmm2,%k3                 \n"
    "pextrw    $0x3,%%xmm2,%k4                 \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(0) "         \n"
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "sub       $0x2,%2                         \n"
    "jge       2b                              \n"

    LABELALIGN
  "29:                                         \n"
    "add       $0x1,%2                         \n"
    "jl        99f                             \n"
    "psrlw     $0x9,%%xmm2                     \n"
    BUNDLEALIGN
    MEMOPREG(movq,0x00,1,3,4,xmm0)             //  movq      (%1,%3,4),%%xmm0
    "pshufb    %%xmm5,%%xmm2                   \n"
    "pshufb    %%xmm4,%%xmm0                   \n"
    "pxor      %%xmm6,%%xmm2                   \n"
    "pmaddubsw %%xmm2,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movd      %%xmm0," MEMACCESS(0) "         \n"

    LABELALIGN
  "99:                                         \n"
  : "+r"(dst_argb),    // %0
    "+r"(src_argb),    // %1
    "+rm"(dst_width),  // %2
    "+r"(x0),          // %3
    "+r"(x1)           // %4
  : "rm"(x),           // %5
    "rm"(dx)           // %6
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}

// Divide num by div and return as 16.16 fixed point result.
int FixedDiv_X86(int num, int div) {
  asm volatile (
    "cdq                                       \n"
    "shld      $0x10,%%eax,%%edx               \n"
    "shl       $0x10,%%eax                     \n"
    "idiv      %1                              \n"
    "mov       %0, %%eax                       \n"
    : "+a"(num)  // %0
    : "c"(div)   // %1
    : "memory", "cc", "edx"
  );
  return num;
}
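
// Illustrative C equivalent added in this edit (not an upstream function) of
// the shld/shl/idiv sequence above: widen to 64 bits, shift up 16, divide.
static int FixedDiv_C_Sketch(int num, int div) {
  return (int)(((int64)(num) << 16) / div);
}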

// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
int FixedDiv1_X86(int num, int div) {
  asm volatile (
    "cdq                                       \n"
    "shld      $0x10,%%eax,%%edx               \n"
    "shl       $0x10,%%eax                     \n"
    "sub       $0x10001,%%eax                  \n"
    "sbb       $0x0,%%edx                      \n"
    "sub       $0x1,%1                         \n"
    "idiv      %1                              \n"
    "mov       %0, %%eax                       \n"
    : "+a"(num)  // %0
    : "c"(div)   // %1
    : "memory", "cc", "edx"
  );
  return num;
}
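
// Illustrative C equivalent added in this edit (not an upstream function):
// the sub/sbb pair subtracts 0x10001 from the widened numerator before
// dividing by div - 1.
static int FixedDiv1_C_Sketch(int num, int div) {
  return (int)((((int64)(num) << 16) - 0x00010001) / (div - 1));
}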

#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif