1 /*
2  *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS. All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10
11 #include "libyuv/rotate.h"
12
13 #include "libyuv/cpu_id.h"
14 #include "libyuv/convert.h"
15 #include "libyuv/planar_functions.h"
16 #include "libyuv/row.h"
17
18 #ifdef __cplusplus
19 namespace libyuv {
20 extern "C" {
21 #endif
22
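// DECLARE_FUNCTION emits the label for a file scope asm function. On 32 bit
// OS X and on MinGW/Cygwin, C symbols carry a leading underscore, which is why
// those variants emit "_" #name rather than the plain name.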
23 #if !defined(LIBYUV_DISABLE_X86) && \
24     (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
25 #if defined(__APPLE__) && defined(__i386__)
26 #define DECLARE_FUNCTION(name)                                                 \
27     ".text                                     \n"                             \
28     ".private_extern _" #name "                \n"                             \
29     ".align 4,0x90                             \n"                             \
30 "_" #name ":                                   \n"
31 #elif (defined(__MINGW32__) || defined(__CYGWIN__)) && defined(__i386__)
32 #define DECLARE_FUNCTION(name)                                                 \
33     ".text                                     \n"                             \
34     ".align 4,0x90                             \n"                             \
35 "_" #name ":                                   \n"
36 #else
37 #define DECLARE_FUNCTION(name)                                                 \
38     ".text                                     \n"                             \
39     ".align 4,0x90                             \n"                             \
40 #name ":                                       \n"
41 #endif
42 #endif
43
44 #if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
45     (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
46 #define HAS_MIRRORROW_NEON
47 void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
48 #define HAS_MIRRORROW_UV_NEON
49 void MirrorUVRow_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width);
50 #define HAS_TRANSPOSE_WX8_NEON
51 void TransposeWx8_NEON(const uint8* src, int src_stride,
52                        uint8* dst, int dst_stride, int width);
53 #define HAS_TRANSPOSE_UVWX8_NEON
54 void TransposeUVWx8_NEON(const uint8* src, int src_stride,
55                          uint8* dst_a, int dst_stride_a,
56                          uint8* dst_b, int dst_stride_b,
57                          int width);
58 #endif  // defined(__ARM_NEON__)
59
60 #if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
61     defined(__mips__) && \
62     defined(__mips_dsp) && (__mips_dsp_rev >= 2)
63 #define HAS_TRANSPOSE_WX8_MIPS_DSPR2
64 void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
65                              uint8* dst, int dst_stride, int width);
66
67 void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride,
68                                   uint8* dst, int dst_stride, int width);
69 #define HAS_TRANSPOSE_UVWx8_MIPS_DSPR2
70 void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
71                                uint8* dst_a, int dst_stride_a,
72                                uint8* dst_b, int dst_stride_b,
73                                int width);
74 #endif  // defined(__mips__)
75
76 #if !defined(LIBYUV_DISABLE_X86) && \
77     defined(_M_IX86) && defined(_MSC_VER)
78 #define HAS_TRANSPOSE_WX8_SSSE3
79 __declspec(naked) __declspec(align(16))
80 static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
81                                uint8* dst, int dst_stride, int width) {
82   __asm {
83     push      edi
84     push      esi
85     push      ebp
86     mov       eax, [esp + 12 + 4]   // src
87     mov       edi, [esp + 12 + 8]   // src_stride
88     mov       edx, [esp + 12 + 12]  // dst
89     mov       esi, [esp + 12 + 16]  // dst_stride
90     mov       ecx, [esp + 12 + 20]  // width
91
92     // Read in the data from the source pointer.
93     // First round of bit swap.
94     align      4
95  convertloop:
96     movq      xmm0, qword ptr [eax]
97     lea       ebp, [eax + 8]
98     movq      xmm1, qword ptr [eax + edi]
99     lea       eax, [eax + 2 * edi]
100     punpcklbw xmm0, xmm1
101     movq      xmm2, qword ptr [eax]
102     movdqa    xmm1, xmm0
103     palignr   xmm1, xmm1, 8
104     movq      xmm3, qword ptr [eax + edi]
105     lea       eax, [eax + 2 * edi]
106     punpcklbw xmm2, xmm3
107     movdqa    xmm3, xmm2
108     movq      xmm4, qword ptr [eax]
109     palignr   xmm3, xmm3, 8
110     movq      xmm5, qword ptr [eax + edi]
111     punpcklbw xmm4, xmm5
112     lea       eax, [eax + 2 * edi]
113     movdqa    xmm5, xmm4
114     movq      xmm6, qword ptr [eax]
115     palignr   xmm5, xmm5, 8
116     movq      xmm7, qword ptr [eax + edi]
117     punpcklbw xmm6, xmm7
118     mov       eax, ebp
119     movdqa    xmm7, xmm6
120     palignr   xmm7, xmm7, 8
121     // Second round of bit swap.
122     punpcklwd xmm0, xmm2
123     punpcklwd xmm1, xmm3
124     movdqa    xmm2, xmm0
125     movdqa    xmm3, xmm1
126     palignr   xmm2, xmm2, 8
127     palignr   xmm3, xmm3, 8
128     punpcklwd xmm4, xmm6
129     punpcklwd xmm5, xmm7
130     movdqa    xmm6, xmm4
131     movdqa    xmm7, xmm5
132     palignr   xmm6, xmm6, 8
133     palignr   xmm7, xmm7, 8
134     // Third round of bit swap.
135     // Write to the destination pointer.
136     punpckldq xmm0, xmm4
137     movq      qword ptr [edx], xmm0
138     movdqa    xmm4, xmm0
139     palignr   xmm4, xmm4, 8
140     movq      qword ptr [edx + esi], xmm4
141     lea       edx, [edx + 2 * esi]
142     punpckldq xmm2, xmm6
143     movdqa    xmm6, xmm2
144     palignr   xmm6, xmm6, 8
145     movq      qword ptr [edx], xmm2
146     punpckldq xmm1, xmm5
147     movq      qword ptr [edx + esi], xmm6
148     lea       edx, [edx + 2 * esi]
149     movdqa    xmm5, xmm1
150     movq      qword ptr [edx], xmm1
151     palignr   xmm5, xmm5, 8
152     punpckldq xmm3, xmm7
153     movq      qword ptr [edx + esi], xmm5
154     lea       edx, [edx + 2 * esi]
155     movq      qword ptr [edx], xmm3
156     movdqa    xmm7, xmm3
157     palignr   xmm7, xmm7, 8
158     sub       ecx, 8
159     movq      qword ptr [edx + esi], xmm7
160     lea       edx, [edx + 2 * esi]
161     jg        convertloop
162
163     pop       ebp
164     pop       esi
165     pop       edi
166     ret
167   }
168 }
169
170 #define HAS_TRANSPOSE_UVWX8_SSE2
171 __declspec(naked) __declspec(align(16))
172 static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
173                                 uint8* dst_a, int dst_stride_a,
174                                 uint8* dst_b, int dst_stride_b,
175                                 int w) {
176   __asm {
177     push      ebx
178     push      esi
179     push      edi
180     push      ebp
181     mov       eax, [esp + 16 + 4]   // src
182     mov       edi, [esp + 16 + 8]   // src_stride
183     mov       edx, [esp + 16 + 12]  // dst_a
184     mov       esi, [esp + 16 + 16]  // dst_stride_a
185     mov       ebx, [esp + 16 + 20]  // dst_b
186     mov       ebp, [esp + 16 + 24]  // dst_stride_b
187     mov       ecx, esp
188     sub       esp, 4 + 16
189     and       esp, ~15
190     mov       [esp + 16], ecx
191     mov       ecx, [ecx + 16 + 28]  // w
192
193     align      4
194  convertloop:
195     // Read in the data from the source pointer.
196     // First round of bit swap.
197     movdqa    xmm0, [eax]
198     movdqa    xmm1, [eax + edi]
199     lea       eax, [eax + 2 * edi]
200     movdqa    xmm7, xmm0  // use xmm7 as temp register.
201     punpcklbw xmm0, xmm1
202     punpckhbw xmm7, xmm1
203     movdqa    xmm1, xmm7
204     movdqa    xmm2, [eax]
205     movdqa    xmm3, [eax + edi]
206     lea       eax, [eax + 2 * edi]
207     movdqa    xmm7, xmm2
208     punpcklbw xmm2, xmm3
209     punpckhbw xmm7, xmm3
210     movdqa    xmm3, xmm7
211     movdqa    xmm4, [eax]
212     movdqa    xmm5, [eax + edi]
213     lea       eax, [eax + 2 * edi]
214     movdqa    xmm7, xmm4
215     punpcklbw xmm4, xmm5
216     punpckhbw xmm7, xmm5
217     movdqa    xmm5, xmm7
218     movdqa    xmm6, [eax]
219     movdqa    xmm7, [eax + edi]
220     lea       eax, [eax + 2 * edi]
221     movdqa    [esp], xmm5  // backup xmm5
222     neg       edi
223     movdqa    xmm5, xmm6   // use xmm5 as temp register.
224     punpcklbw xmm6, xmm7
225     punpckhbw xmm5, xmm7
226     movdqa    xmm7, xmm5
227     lea       eax, [eax + 8 * edi + 16]
228     neg       edi
229     // Second round of bit swap.
230     movdqa    xmm5, xmm0
231     punpcklwd xmm0, xmm2
232     punpckhwd xmm5, xmm2
233     movdqa    xmm2, xmm5
234     movdqa    xmm5, xmm1
235     punpcklwd xmm1, xmm3
236     punpckhwd xmm5, xmm3
237     movdqa    xmm3, xmm5
238     movdqa    xmm5, xmm4
239     punpcklwd xmm4, xmm6
240     punpckhwd xmm5, xmm6
241     movdqa    xmm6, xmm5
242     movdqa    xmm5, [esp]  // restore xmm5
243     movdqa    [esp], xmm6  // backup xmm6
244     movdqa    xmm6, xmm5    // use xmm6 as temp register.
245     punpcklwd xmm5, xmm7
246     punpckhwd xmm6, xmm7
247     movdqa    xmm7, xmm6
248     // Third round of bit swap.
249     // Write to the destination pointer.
250     movdqa    xmm6, xmm0
251     punpckldq xmm0, xmm4
252     punpckhdq xmm6, xmm4
253     movdqa    xmm4, xmm6
254     movdqa    xmm6, [esp]  // restore xmm6
255     movlpd    qword ptr [edx], xmm0
256     movhpd    qword ptr [ebx], xmm0
257     movlpd    qword ptr [edx + esi], xmm4
258     lea       edx, [edx + 2 * esi]
259     movhpd    qword ptr [ebx + ebp], xmm4
260     lea       ebx, [ebx + 2 * ebp]
261     movdqa    xmm0, xmm2   // use xmm0 as the temp register.
262     punpckldq xmm2, xmm6
263     movlpd    qword ptr [edx], xmm2
264     movhpd    qword ptr [ebx], xmm2
265     punpckhdq xmm0, xmm6
266     movlpd    qword ptr [edx + esi], xmm0
267     lea       edx, [edx + 2 * esi]
268     movhpd    qword ptr [ebx + ebp], xmm0
269     lea       ebx, [ebx + 2 * ebp]
270     movdqa    xmm0, xmm1   // use xmm0 as the temp register.
271     punpckldq xmm1, xmm5
272     movlpd    qword ptr [edx], xmm1
273     movhpd    qword ptr [ebx], xmm1
274     punpckhdq xmm0, xmm5
275     movlpd    qword ptr [edx + esi], xmm0
276     lea       edx, [edx + 2 * esi]
277     movhpd    qword ptr [ebx + ebp], xmm0
278     lea       ebx, [ebx + 2 * ebp]
279     movdqa    xmm0, xmm3   // use xmm0 as the temp register.
280     punpckldq xmm3, xmm7
281     movlpd    qword ptr [edx], xmm3
282     movhpd    qword ptr [ebx], xmm3
283     punpckhdq xmm0, xmm7
284     sub       ecx, 8
285     movlpd    qword ptr [edx + esi], xmm0
286     lea       edx, [edx + 2 * esi]
287     movhpd    qword ptr [ebx + ebp], xmm0
288     lea       ebx, [ebx + 2 * ebp]
289     jg        convertloop
290
291     mov       esp, [esp + 16]
292     pop       ebp
293     pop       edi
294     pop       esi
295     pop       ebx
296     ret
297   }
298 }
299 #elif !defined(LIBYUV_DISABLE_X86) && \
300     (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
301 #define HAS_TRANSPOSE_WX8_SSSE3
302 static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
303                                uint8* dst, int dst_stride, int width) {
304   asm volatile (
305     // Read in the data from the source pointer.
306     // First round of bit swap.
307     ".p2align  2                                 \n"
308   "1:                                            \n"
309     "movq       (%0),%%xmm0                      \n"
310     "movq       (%0,%3),%%xmm1                   \n"
311     "lea        (%0,%3,2),%0                     \n"
312     "punpcklbw  %%xmm1,%%xmm0                    \n"
313     "movq       (%0),%%xmm2                      \n"
314     "movdqa     %%xmm0,%%xmm1                    \n"
315     "palignr    $0x8,%%xmm1,%%xmm1               \n"
316     "movq       (%0,%3),%%xmm3                   \n"
317     "lea        (%0,%3,2),%0                     \n"
318     "punpcklbw  %%xmm3,%%xmm2                    \n"
319     "movdqa     %%xmm2,%%xmm3                    \n"
320     "movq       (%0),%%xmm4                      \n"
321     "palignr    $0x8,%%xmm3,%%xmm3               \n"
322     "movq       (%0,%3),%%xmm5                   \n"
323     "lea        (%0,%3,2),%0                     \n"
324     "punpcklbw  %%xmm5,%%xmm4                    \n"
325     "movdqa     %%xmm4,%%xmm5                    \n"
326     "movq       (%0),%%xmm6                      \n"
327     "palignr    $0x8,%%xmm5,%%xmm5               \n"
328     "movq       (%0,%3),%%xmm7                   \n"
329     "lea        (%0,%3,2),%0                     \n"
330     "punpcklbw  %%xmm7,%%xmm6                    \n"
331     "neg        %3                               \n"
332     "movdqa     %%xmm6,%%xmm7                    \n"
333     "lea        0x8(%0,%3,8),%0                  \n"
334     "palignr    $0x8,%%xmm7,%%xmm7               \n"
335     "neg        %3                               \n"
336      // Second round of bit swap.
337     "punpcklwd  %%xmm2,%%xmm0                    \n"
338     "punpcklwd  %%xmm3,%%xmm1                    \n"
339     "movdqa     %%xmm0,%%xmm2                    \n"
340     "movdqa     %%xmm1,%%xmm3                    \n"
341     "palignr    $0x8,%%xmm2,%%xmm2               \n"
342     "palignr    $0x8,%%xmm3,%%xmm3               \n"
343     "punpcklwd  %%xmm6,%%xmm4                    \n"
344     "punpcklwd  %%xmm7,%%xmm5                    \n"
345     "movdqa     %%xmm4,%%xmm6                    \n"
346     "movdqa     %%xmm5,%%xmm7                    \n"
347     "palignr    $0x8,%%xmm6,%%xmm6               \n"
348     "palignr    $0x8,%%xmm7,%%xmm7               \n"
349     // Third round of bit swap.
350     // Write to the destination pointer.
351     "punpckldq  %%xmm4,%%xmm0                    \n"
352     "movq       %%xmm0,(%1)                      \n"
353     "movdqa     %%xmm0,%%xmm4                    \n"
354     "palignr    $0x8,%%xmm4,%%xmm4               \n"
355     "movq       %%xmm4,(%1,%4)                   \n"
356     "lea        (%1,%4,2),%1                     \n"
357     "punpckldq  %%xmm6,%%xmm2                    \n"
358     "movdqa     %%xmm2,%%xmm6                    \n"
359     "movq       %%xmm2,(%1)                      \n"
360     "palignr    $0x8,%%xmm6,%%xmm6               \n"
361     "punpckldq  %%xmm5,%%xmm1                    \n"
362     "movq       %%xmm6,(%1,%4)                   \n"
363     "lea        (%1,%4,2),%1                     \n"
364     "movdqa     %%xmm1,%%xmm5                    \n"
365     "movq       %%xmm1,(%1)                      \n"
366     "palignr    $0x8,%%xmm5,%%xmm5               \n"
367     "movq       %%xmm5,(%1,%4)                   \n"
368     "lea        (%1,%4,2),%1                     \n"
369     "punpckldq  %%xmm7,%%xmm3                    \n"
370     "movq       %%xmm3,(%1)                      \n"
371     "movdqa     %%xmm3,%%xmm7                    \n"
372     "palignr    $0x8,%%xmm7,%%xmm7               \n"
373     "sub        $0x8,%2                          \n"
374     "movq       %%xmm7,(%1,%4)                   \n"
375     "lea        (%1,%4,2),%1                     \n"
376     "jg         1b                               \n"
377     : "+r"(src),    // %0
378       "+r"(dst),    // %1
379       "+r"(width)   // %2
380     : "r"((intptr_t)(src_stride)),  // %3
381       "r"((intptr_t)(dst_stride))   // %4
382     : "memory", "cc"
383   #if defined(__SSE2__)
384       , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
385   #endif
386   );
387 }
388
389 #if !defined(LIBYUV_DISABLE_X86) && defined(__i386__)
390 #define HAS_TRANSPOSE_UVWX8_SSE2
391 void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
392                          uint8* dst_a, int dst_stride_a,
393                          uint8* dst_b, int dst_stride_b,
394                          int w);
395   asm (
396     DECLARE_FUNCTION(TransposeUVWx8_SSE2)
397     "push   %ebx                               \n"
398     "push   %esi                               \n"
399     "push   %edi                               \n"
400     "push   %ebp                               \n"
401     "mov    0x14(%esp),%eax                    \n"
402     "mov    0x18(%esp),%edi                    \n"
403     "mov    0x1c(%esp),%edx                    \n"
404     "mov    0x20(%esp),%esi                    \n"
405     "mov    0x24(%esp),%ebx                    \n"
406     "mov    0x28(%esp),%ebp                    \n"
407     "mov    %esp,%ecx                          \n"
408     "sub    $0x14,%esp                         \n"
409     "and    $0xfffffff0,%esp                   \n"
410     "mov    %ecx,0x10(%esp)                    \n"
411     "mov    0x2c(%ecx),%ecx                    \n"
412
413 "1:                                            \n"
414     "movdqa (%eax),%xmm0                       \n"
415     "movdqa (%eax,%edi,1),%xmm1                \n"
416     "lea    (%eax,%edi,2),%eax                 \n"
417     "movdqa %xmm0,%xmm7                        \n"
418     "punpcklbw %xmm1,%xmm0                     \n"
419     "punpckhbw %xmm1,%xmm7                     \n"
420     "movdqa %xmm7,%xmm1                        \n"
421     "movdqa (%eax),%xmm2                       \n"
422     "movdqa (%eax,%edi,1),%xmm3                \n"
423     "lea    (%eax,%edi,2),%eax                 \n"
424     "movdqa %xmm2,%xmm7                        \n"
425     "punpcklbw %xmm3,%xmm2                     \n"
426     "punpckhbw %xmm3,%xmm7                     \n"
427     "movdqa %xmm7,%xmm3                        \n"
428     "movdqa (%eax),%xmm4                       \n"
429     "movdqa (%eax,%edi,1),%xmm5                \n"
430     "lea    (%eax,%edi,2),%eax                 \n"
431     "movdqa %xmm4,%xmm7                        \n"
432     "punpcklbw %xmm5,%xmm4                     \n"
433     "punpckhbw %xmm5,%xmm7                     \n"
434     "movdqa %xmm7,%xmm5                        \n"
435     "movdqa (%eax),%xmm6                       \n"
436     "movdqa (%eax,%edi,1),%xmm7                \n"
437     "lea    (%eax,%edi,2),%eax                 \n"
438     "movdqa %xmm5,(%esp)                       \n"
439     "neg    %edi                               \n"
440     "movdqa %xmm6,%xmm5                        \n"
441     "punpcklbw %xmm7,%xmm6                     \n"
442     "punpckhbw %xmm7,%xmm5                     \n"
443     "movdqa %xmm5,%xmm7                        \n"
444     "lea    0x10(%eax,%edi,8),%eax             \n"
445     "neg    %edi                               \n"
446     "movdqa %xmm0,%xmm5                        \n"
447     "punpcklwd %xmm2,%xmm0                     \n"
448     "punpckhwd %xmm2,%xmm5                     \n"
449     "movdqa %xmm5,%xmm2                        \n"
450     "movdqa %xmm1,%xmm5                        \n"
451     "punpcklwd %xmm3,%xmm1                     \n"
452     "punpckhwd %xmm3,%xmm5                     \n"
453     "movdqa %xmm5,%xmm3                        \n"
454     "movdqa %xmm4,%xmm5                        \n"
455     "punpcklwd %xmm6,%xmm4                     \n"
456     "punpckhwd %xmm6,%xmm5                     \n"
457     "movdqa %xmm5,%xmm6                        \n"
458     "movdqa (%esp),%xmm5                       \n"
459     "movdqa %xmm6,(%esp)                       \n"
460     "movdqa %xmm5,%xmm6                        \n"
461     "punpcklwd %xmm7,%xmm5                     \n"
462     "punpckhwd %xmm7,%xmm6                     \n"
463     "movdqa %xmm6,%xmm7                        \n"
464     "movdqa %xmm0,%xmm6                        \n"
465     "punpckldq %xmm4,%xmm0                     \n"
466     "punpckhdq %xmm4,%xmm6                     \n"
467     "movdqa %xmm6,%xmm4                        \n"
468     "movdqa (%esp),%xmm6                       \n"
469     "movlpd %xmm0,(%edx)                       \n"
470     "movhpd %xmm0,(%ebx)                       \n"
471     "movlpd %xmm4,(%edx,%esi,1)                \n"
472     "lea    (%edx,%esi,2),%edx                 \n"
473     "movhpd %xmm4,(%ebx,%ebp,1)                \n"
474     "lea    (%ebx,%ebp,2),%ebx                 \n"
475     "movdqa %xmm2,%xmm0                        \n"
476     "punpckldq %xmm6,%xmm2                     \n"
477     "movlpd %xmm2,(%edx)                       \n"
478     "movhpd %xmm2,(%ebx)                       \n"
479     "punpckhdq %xmm6,%xmm0                     \n"
480     "movlpd %xmm0,(%edx,%esi,1)                \n"
481     "lea    (%edx,%esi,2),%edx                 \n"
482     "movhpd %xmm0,(%ebx,%ebp,1)                \n"
483     "lea    (%ebx,%ebp,2),%ebx                 \n"
484     "movdqa %xmm1,%xmm0                        \n"
485     "punpckldq %xmm5,%xmm1                     \n"
486     "movlpd %xmm1,(%edx)                       \n"
487     "movhpd %xmm1,(%ebx)                       \n"
488     "punpckhdq %xmm5,%xmm0                     \n"
489     "movlpd %xmm0,(%edx,%esi,1)                \n"
490     "lea    (%edx,%esi,2),%edx                 \n"
491     "movhpd %xmm0,(%ebx,%ebp,1)                \n"
492     "lea    (%ebx,%ebp,2),%ebx                 \n"
493     "movdqa %xmm3,%xmm0                        \n"
494     "punpckldq %xmm7,%xmm3                     \n"
495     "movlpd %xmm3,(%edx)                       \n"
496     "movhpd %xmm3,(%ebx)                       \n"
497     "punpckhdq %xmm7,%xmm0                     \n"
498     "sub    $0x8,%ecx                          \n"
499     "movlpd %xmm0,(%edx,%esi,1)                \n"
500     "lea    (%edx,%esi,2),%edx                 \n"
501     "movhpd %xmm0,(%ebx,%ebp,1)                \n"
502     "lea    (%ebx,%ebp,2),%ebx                 \n"
503     "jg     1b                                 \n"
504     "mov    0x10(%esp),%esp                    \n"
505     "pop    %ebp                               \n"
506     "pop    %edi                               \n"
507     "pop    %esi                               \n"
508     "pop    %ebx                               \n"
509 #if defined(__native_client__)
510     "pop    %ecx                               \n"
511     "and    $0xffffffe0,%ecx                   \n"
512     "jmp    *%ecx                              \n"
513 #else
514     "ret                                       \n"
515 #endif
516 );
517 #elif !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
518     defined(__x86_64__)
519 // 64 bit version has enough registers to do 16x8 to 8x16 at a time.
520 #define HAS_TRANSPOSE_WX8_FAST_SSSE3
521 static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
522                                     uint8* dst, int dst_stride, int width) {
523   asm volatile (
524   // Read in the data from the source pointer.
525   // First round of bit swap.
526   ".p2align  2                                 \n"
527 "1:                                            \n"
528   "movdqa     (%0),%%xmm0                      \n"
529   "movdqa     (%0,%3),%%xmm1                   \n"
530   "lea        (%0,%3,2),%0                     \n"
531   "movdqa     %%xmm0,%%xmm8                    \n"
532   "punpcklbw  %%xmm1,%%xmm0                    \n"
533   "punpckhbw  %%xmm1,%%xmm8                    \n"
534   "movdqa     (%0),%%xmm2                      \n"
535   "movdqa     %%xmm0,%%xmm1                    \n"
536   "movdqa     %%xmm8,%%xmm9                    \n"
537   "palignr    $0x8,%%xmm1,%%xmm1               \n"
538   "palignr    $0x8,%%xmm9,%%xmm9               \n"
539   "movdqa     (%0,%3),%%xmm3                   \n"
540   "lea        (%0,%3,2),%0                     \n"
541   "movdqa     %%xmm2,%%xmm10                   \n"
542   "punpcklbw  %%xmm3,%%xmm2                    \n"
543   "punpckhbw  %%xmm3,%%xmm10                   \n"
544   "movdqa     %%xmm2,%%xmm3                    \n"
545   "movdqa     %%xmm10,%%xmm11                  \n"
546   "movdqa     (%0),%%xmm4                      \n"
547   "palignr    $0x8,%%xmm3,%%xmm3               \n"
548   "palignr    $0x8,%%xmm11,%%xmm11             \n"
549   "movdqa     (%0,%3),%%xmm5                   \n"
550   "lea        (%0,%3,2),%0                     \n"
551   "movdqa     %%xmm4,%%xmm12                   \n"
552   "punpcklbw  %%xmm5,%%xmm4                    \n"
553   "punpckhbw  %%xmm5,%%xmm12                   \n"
554   "movdqa     %%xmm4,%%xmm5                    \n"
555   "movdqa     %%xmm12,%%xmm13                  \n"
556   "movdqa     (%0),%%xmm6                      \n"
557   "palignr    $0x8,%%xmm5,%%xmm5               \n"
558   "palignr    $0x8,%%xmm13,%%xmm13             \n"
559   "movdqa     (%0,%3),%%xmm7                   \n"
560   "lea        (%0,%3,2),%0                     \n"
561   "movdqa     %%xmm6,%%xmm14                   \n"
562   "punpcklbw  %%xmm7,%%xmm6                    \n"
563   "punpckhbw  %%xmm7,%%xmm14                   \n"
564   "neg        %3                               \n"
565   "movdqa     %%xmm6,%%xmm7                    \n"
566   "movdqa     %%xmm14,%%xmm15                  \n"
567   "lea        0x10(%0,%3,8),%0                 \n"
568   "palignr    $0x8,%%xmm7,%%xmm7               \n"
569   "palignr    $0x8,%%xmm15,%%xmm15             \n"
570   "neg        %3                               \n"
571    // Second round of bit swap.
572   "punpcklwd  %%xmm2,%%xmm0                    \n"
573   "punpcklwd  %%xmm3,%%xmm1                    \n"
574   "movdqa     %%xmm0,%%xmm2                    \n"
575   "movdqa     %%xmm1,%%xmm3                    \n"
576   "palignr    $0x8,%%xmm2,%%xmm2               \n"
577   "palignr    $0x8,%%xmm3,%%xmm3               \n"
578   "punpcklwd  %%xmm6,%%xmm4                    \n"
579   "punpcklwd  %%xmm7,%%xmm5                    \n"
580   "movdqa     %%xmm4,%%xmm6                    \n"
581   "movdqa     %%xmm5,%%xmm7                    \n"
582   "palignr    $0x8,%%xmm6,%%xmm6               \n"
583   "palignr    $0x8,%%xmm7,%%xmm7               \n"
584   "punpcklwd  %%xmm10,%%xmm8                   \n"
585   "punpcklwd  %%xmm11,%%xmm9                   \n"
586   "movdqa     %%xmm8,%%xmm10                   \n"
587   "movdqa     %%xmm9,%%xmm11                   \n"
588   "palignr    $0x8,%%xmm10,%%xmm10             \n"
589   "palignr    $0x8,%%xmm11,%%xmm11             \n"
590   "punpcklwd  %%xmm14,%%xmm12                  \n"
591   "punpcklwd  %%xmm15,%%xmm13                  \n"
592   "movdqa     %%xmm12,%%xmm14                  \n"
593   "movdqa     %%xmm13,%%xmm15                  \n"
594   "palignr    $0x8,%%xmm14,%%xmm14             \n"
595   "palignr    $0x8,%%xmm15,%%xmm15             \n"
596   // Third round of bit swap.
597   // Write to the destination pointer.
598   "punpckldq  %%xmm4,%%xmm0                    \n"
599   "movq       %%xmm0,(%1)                      \n"
600   "movdqa     %%xmm0,%%xmm4                    \n"
601   "palignr    $0x8,%%xmm4,%%xmm4               \n"
602   "movq       %%xmm4,(%1,%4)                   \n"
603   "lea        (%1,%4,2),%1                     \n"
604   "punpckldq  %%xmm6,%%xmm2                    \n"
605   "movdqa     %%xmm2,%%xmm6                    \n"
606   "movq       %%xmm2,(%1)                      \n"
607   "palignr    $0x8,%%xmm6,%%xmm6               \n"
608   "punpckldq  %%xmm5,%%xmm1                    \n"
609   "movq       %%xmm6,(%1,%4)                   \n"
610   "lea        (%1,%4,2),%1                     \n"
611   "movdqa     %%xmm1,%%xmm5                    \n"
612   "movq       %%xmm1,(%1)                      \n"
613   "palignr    $0x8,%%xmm5,%%xmm5               \n"
614   "movq       %%xmm5,(%1,%4)                   \n"
615   "lea        (%1,%4,2),%1                     \n"
616   "punpckldq  %%xmm7,%%xmm3                    \n"
617   "movq       %%xmm3,(%1)                      \n"
618   "movdqa     %%xmm3,%%xmm7                    \n"
619   "palignr    $0x8,%%xmm7,%%xmm7               \n"
620   "movq       %%xmm7,(%1,%4)                   \n"
621   "lea        (%1,%4,2),%1                     \n"
622   "punpckldq  %%xmm12,%%xmm8                   \n"
623   "movq       %%xmm8,(%1)                      \n"
624   "movdqa     %%xmm8,%%xmm12                   \n"
625   "palignr    $0x8,%%xmm12,%%xmm12             \n"
626   "movq       %%xmm12,(%1,%4)                  \n"
627   "lea        (%1,%4,2),%1                     \n"
628   "punpckldq  %%xmm14,%%xmm10                  \n"
629   "movdqa     %%xmm10,%%xmm14                  \n"
630   "movq       %%xmm10,(%1)                     \n"
631   "palignr    $0x8,%%xmm14,%%xmm14             \n"
632   "punpckldq  %%xmm13,%%xmm9                   \n"
633   "movq       %%xmm14,(%1,%4)                  \n"
634   "lea        (%1,%4,2),%1                     \n"
635   "movdqa     %%xmm9,%%xmm13                   \n"
636   "movq       %%xmm9,(%1)                      \n"
637   "palignr    $0x8,%%xmm13,%%xmm13             \n"
638   "movq       %%xmm13,(%1,%4)                  \n"
639   "lea        (%1,%4,2),%1                     \n"
640   "punpckldq  %%xmm15,%%xmm11                  \n"
641   "movq       %%xmm11,(%1)                     \n"
642   "movdqa     %%xmm11,%%xmm15                  \n"
643   "palignr    $0x8,%%xmm15,%%xmm15             \n"
644   "sub        $0x10,%2                         \n"
645   "movq       %%xmm15,(%1,%4)                  \n"
646   "lea        (%1,%4,2),%1                     \n"
647   "jg         1b                               \n"
648   : "+r"(src),    // %0
649     "+r"(dst),    // %1
650     "+r"(width)   // %2
651   : "r"((intptr_t)(src_stride)),  // %3
652     "r"((intptr_t)(dst_stride))   // %4
653   : "memory", "cc",
654     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
655     "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",  "xmm14",  "xmm15"
656 );
657 }
658
659 #define HAS_TRANSPOSE_UVWX8_SSE2
660 static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
661                                 uint8* dst_a, int dst_stride_a,
662                                 uint8* dst_b, int dst_stride_b,
663                                 int w) {
664   asm volatile (
665   // Read in the data from the source pointer.
666   // First round of bit swap.
667   ".p2align  2                                 \n"
668 "1:                                            \n"
669   "movdqa     (%0),%%xmm0                      \n"
670   "movdqa     (%0,%4),%%xmm1                   \n"
671   "lea        (%0,%4,2),%0                     \n"
672   "movdqa     %%xmm0,%%xmm8                    \n"
673   "punpcklbw  %%xmm1,%%xmm0                    \n"
674   "punpckhbw  %%xmm1,%%xmm8                    \n"
675   "movdqa     %%xmm8,%%xmm1                    \n"
676   "movdqa     (%0),%%xmm2                      \n"
677   "movdqa     (%0,%4),%%xmm3                   \n"
678   "lea        (%0,%4,2),%0                     \n"
679   "movdqa     %%xmm2,%%xmm8                    \n"
680   "punpcklbw  %%xmm3,%%xmm2                    \n"
681   "punpckhbw  %%xmm3,%%xmm8                    \n"
682   "movdqa     %%xmm8,%%xmm3                    \n"
683   "movdqa     (%0),%%xmm4                      \n"
684   "movdqa     (%0,%4),%%xmm5                   \n"
685   "lea        (%0,%4,2),%0                     \n"
686   "movdqa     %%xmm4,%%xmm8                    \n"
687   "punpcklbw  %%xmm5,%%xmm4                    \n"
688   "punpckhbw  %%xmm5,%%xmm8                    \n"
689   "movdqa     %%xmm8,%%xmm5                    \n"
690   "movdqa     (%0),%%xmm6                      \n"
691   "movdqa     (%0,%4),%%xmm7                   \n"
692   "lea        (%0,%4,2),%0                     \n"
693   "movdqa     %%xmm6,%%xmm8                    \n"
694   "punpcklbw  %%xmm7,%%xmm6                    \n"
695   "neg        %4                               \n"
696   "lea        0x10(%0,%4,8),%0                 \n"
697   "punpckhbw  %%xmm7,%%xmm8                    \n"
698   "movdqa     %%xmm8,%%xmm7                    \n"
699   "neg        %4                               \n"
700    // Second round of bit swap.
701   "movdqa     %%xmm0,%%xmm8                    \n"
702   "movdqa     %%xmm1,%%xmm9                    \n"
703   "punpckhwd  %%xmm2,%%xmm8                    \n"
704   "punpckhwd  %%xmm3,%%xmm9                    \n"
705   "punpcklwd  %%xmm2,%%xmm0                    \n"
706   "punpcklwd  %%xmm3,%%xmm1                    \n"
707   "movdqa     %%xmm8,%%xmm2                    \n"
708   "movdqa     %%xmm9,%%xmm3                    \n"
709   "movdqa     %%xmm4,%%xmm8                    \n"
710   "movdqa     %%xmm5,%%xmm9                    \n"
711   "punpckhwd  %%xmm6,%%xmm8                    \n"
712   "punpckhwd  %%xmm7,%%xmm9                    \n"
713   "punpcklwd  %%xmm6,%%xmm4                    \n"
714   "punpcklwd  %%xmm7,%%xmm5                    \n"
715   "movdqa     %%xmm8,%%xmm6                    \n"
716   "movdqa     %%xmm9,%%xmm7                    \n"
717   // Third round of bit swap.
718   // Write to the destination pointer.
719   "movdqa     %%xmm0,%%xmm8                    \n"
720   "punpckldq  %%xmm4,%%xmm0                    \n"
721   "movlpd     %%xmm0,(%1)                      \n"  // Write back U channel
722   "movhpd     %%xmm0,(%2)                      \n"  // Write back V channel
723   "punpckhdq  %%xmm4,%%xmm8                    \n"
724   "movlpd     %%xmm8,(%1,%5)                   \n"
725   "lea        (%1,%5,2),%1                     \n"
726   "movhpd     %%xmm8,(%2,%6)                   \n"
727   "lea        (%2,%6,2),%2                     \n"
728   "movdqa     %%xmm2,%%xmm8                    \n"
729   "punpckldq  %%xmm6,%%xmm2                    \n"
730   "movlpd     %%xmm2,(%1)                      \n"
731   "movhpd     %%xmm2,(%2)                      \n"
732   "punpckhdq  %%xmm6,%%xmm8                    \n"
733   "movlpd     %%xmm8,(%1,%5)                   \n"
734   "lea        (%1,%5,2),%1                     \n"
735   "movhpd     %%xmm8,(%2,%6)                   \n"
736   "lea        (%2,%6,2),%2                     \n"
737   "movdqa     %%xmm1,%%xmm8                    \n"
738   "punpckldq  %%xmm5,%%xmm1                    \n"
739   "movlpd     %%xmm1,(%1)                      \n"
740   "movhpd     %%xmm1,(%2)                      \n"
741   "punpckhdq  %%xmm5,%%xmm8                    \n"
742   "movlpd     %%xmm8,(%1,%5)                   \n"
743   "lea        (%1,%5,2),%1                     \n"
744   "movhpd     %%xmm8,(%2,%6)                   \n"
745   "lea        (%2,%6,2),%2                     \n"
746   "movdqa     %%xmm3,%%xmm8                    \n"
747   "punpckldq  %%xmm7,%%xmm3                    \n"
748   "movlpd     %%xmm3,(%1)                      \n"
749   "movhpd     %%xmm3,(%2)                      \n"
750   "punpckhdq  %%xmm7,%%xmm8                    \n"
751   "sub        $0x8,%3                          \n"
752   "movlpd     %%xmm8,(%1,%5)                   \n"
753   "lea        (%1,%5,2),%1                     \n"
754   "movhpd     %%xmm8,(%2,%6)                   \n"
755   "lea        (%2,%6,2),%2                     \n"
756   "jg         1b                               \n"
757   : "+r"(src),    // %0
758     "+r"(dst_a),  // %1
759     "+r"(dst_b),  // %2
760     "+r"(w)   // %3
761   : "r"((intptr_t)(src_stride)),    // %4
762     "r"((intptr_t)(dst_stride_a)),  // %5
763     "r"((intptr_t)(dst_stride_b))   // %6
764   : "memory", "cc",
765     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
766     "xmm8", "xmm9"
767 );
768 }
769 #endif
770 #endif
771
772 static void TransposeWx8_C(const uint8* src, int src_stride,
773                            uint8* dst, int dst_stride,
774                            int width) {
775   int i;
776   for (i = 0; i < width; ++i) {
777     dst[0] = src[0 * src_stride];
778     dst[1] = src[1 * src_stride];
779     dst[2] = src[2 * src_stride];
780     dst[3] = src[3 * src_stride];
781     dst[4] = src[4 * src_stride];
782     dst[5] = src[5 * src_stride];
783     dst[6] = src[6 * src_stride];
784     dst[7] = src[7 * src_stride];
785     ++src;
786     dst += dst_stride;
787   }
788 }
789
790 static void TransposeWxH_C(const uint8* src, int src_stride,
791                            uint8* dst, int dst_stride,
792                            int width, int height) {
793   int i;
794   for (i = 0; i < width; ++i) {
795     int j;
796     for (j = 0; j < height; ++j) {
797       dst[i * dst_stride + j] = src[j * src_stride + i];
798     }
799   }
800 }
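
// Both C transpose helpers implement dst[i][j] = src[j][i]. As a small
// illustration (values are arbitrary), a 3 wide x 2 tall source
//   | 1 2 3 |              | 1 4 |
//   | 4 5 6 |   becomes    | 2 5 |
//                          | 3 6 |
// so the destination needs width rows of height bytes each.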
801
802 LIBYUV_API
803 void TransposePlane(const uint8* src, int src_stride,
804                     uint8* dst, int dst_stride,
805                     int width, int height) {
806   int i = height;
807   void (*TransposeWx8)(const uint8* src, int src_stride,
808                        uint8* dst, int dst_stride,
809                        int width) = TransposeWx8_C;
810 #if defined(HAS_TRANSPOSE_WX8_NEON)
811   if (TestCpuFlag(kCpuHasNEON)) {
812     TransposeWx8 = TransposeWx8_NEON;
813   }
814 #endif
815 #if defined(HAS_TRANSPOSE_WX8_SSSE3)
816   if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
817     TransposeWx8 = TransposeWx8_SSSE3;
818   }
819 #endif
820 #if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3)
821   if (TestCpuFlag(kCpuHasSSSE3) &&
822       IS_ALIGNED(width, 16) &&
823       IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
824     TransposeWx8 = TransposeWx8_FAST_SSSE3;
825   }
826 #endif
827 #if defined(HAS_TRANSPOSE_WX8_MIPS_DSPR2)
828   if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
829     if (IS_ALIGNED(width, 4) &&
830         IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
831       TransposeWx8 = TransposeWx8_FAST_MIPS_DSPR2;
832     } else {
833       TransposeWx8 = TransposeWx8_MIPS_DSPR2;
834     }
835   }
836 #endif
837
838   // Work across the source in 8x8 tiles
839   while (i >= 8) {
840     TransposeWx8(src, src_stride, dst, dst_stride, width);
841     src += 8 * src_stride;    // Go down 8 rows.
842     dst += 8;                 // Move over 8 columns.
843     i -= 8;
844   }
845
846   TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
847 }
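
// Note: the transposed output of a width x height plane occupies width rows
// of height bytes, so dst_stride must be at least the source height. A minimal
// sketch of a call (buffer names and sizes are hypothetical):
//   uint8 in[64 * 48];
//   uint8 out[48 * 64];
//   TransposePlane(in, 64, out, 48, 64, 48);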
848
849 LIBYUV_API
850 void RotatePlane90(const uint8* src, int src_stride,
851                    uint8* dst, int dst_stride,
852                    int width, int height) {
853   // Rotate by 90 is a transpose with the source read
854   // from bottom to top. So set the source pointer to the end
855   // of the buffer and flip the sign of the source stride.
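  // In pixel terms this is a clockwise rotation: for a width x height source,
  // dst(x, y) = src(y, height - 1 - x), and the destination is height pixels
  // wide and width pixels tall.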
856   src += src_stride * (height - 1);
857   src_stride = -src_stride;
858   TransposePlane(src, src_stride, dst, dst_stride, width, height);
859 }
860
861 LIBYUV_API
862 void RotatePlane270(const uint8* src, int src_stride,
863                     uint8* dst, int dst_stride,
864                     int width, int height) {
865   // Rotate by 270 is a transpose with the destination written
866   // from bottom to top. So set the destination pointer to the end
867   // of the buffer and flip the sign of the destination stride.
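  // In pixel terms this is a counter-clockwise rotation: for a width x height
  // source, dst(x, y) = src(width - 1 - y, x), again giving a destination that
  // is height pixels wide and width pixels tall.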
868   dst += dst_stride * (width - 1);
869   dst_stride = -dst_stride;
870   TransposePlane(src, src_stride, dst, dst_stride, width, height);
871 }
872
873 LIBYUV_API
874 void RotatePlane180(const uint8* src, int src_stride,
875                     uint8* dst, int dst_stride,
876                     int width, int height) {
877   // Swap first and last row and mirror the content. Uses a temporary row.
878   align_buffer_64(row, width);
879   const uint8* src_bot = src + src_stride * (height - 1);
880   uint8* dst_bot = dst + dst_stride * (height - 1);
881   int half_height = (height + 1) >> 1;
882   int y;
883   void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
884   void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
885 #if defined(HAS_MIRRORROW_NEON)
886   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
887     MirrorRow = MirrorRow_NEON;
888   }
889 #endif
890 #if defined(HAS_MIRRORROW_SSE2)
891   if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
892       IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
893       IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
894     MirrorRow = MirrorRow_SSE2;
895   }
896 #endif
897 #if defined(HAS_MIRRORROW_SSSE3)
898   if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
899       IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
900       IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
901     MirrorRow = MirrorRow_SSSE3;
902   }
903 #endif
904 #if defined(HAS_MIRRORROW_AVX2)
905   if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
906     MirrorRow = MirrorRow_AVX2;
907   }
908 #endif
909 #if defined(HAS_MIRRORROW_MIPS_DSPR2)
910   if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
911       IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) &&
912       IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) {
913     MirrorRow = MirrorRow_MIPS_DSPR2;
914   }
915 #endif
916 #if defined(HAS_COPYROW_NEON)
917   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
918     CopyRow = CopyRow_NEON;
919   }
920 #endif
921 #if defined(HAS_COPYROW_X86)
922   if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
923     CopyRow = CopyRow_X86;
924   }
925 #endif
926 #if defined(HAS_COPYROW_SSE2)
927   if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
928       IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
929       IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
930     CopyRow = CopyRow_SSE2;
931   }
932 #endif
933 #if defined(HAS_COPYROW_ERMS)
934   if (TestCpuFlag(kCpuHasERMS)) {
935     CopyRow = CopyRow_ERMS;
936   }
937 #endif
938 #if defined(HAS_COPYROW_MIPS)
939   if (TestCpuFlag(kCpuHasMIPS)) {
940     CopyRow = CopyRow_MIPS;
941   }
942 #endif
943
944   // Odd height will harmlessly mirror the middle row twice.
945   for (y = 0; y < half_height; ++y) {
946     MirrorRow(src, row, width);  // Mirror first row into a buffer
947     src += src_stride;
948     MirrorRow(src_bot, dst, width);  // Mirror last row into first row
949     dst += dst_stride;
950     CopyRow(row, dst_bot, width);  // Copy first mirrored row into last
951     src_bot -= src_stride;
952     dst_bot -= dst_stride;
953   }
954   free_aligned_buffer_64(row);
955 }
956
957 static void TransposeUVWx8_C(const uint8* src, int src_stride,
958                              uint8* dst_a, int dst_stride_a,
959                              uint8* dst_b, int dst_stride_b,
960                              int width) {
961   int i;
962   for (i = 0; i < width; ++i) {
963     dst_a[0] = src[0 * src_stride + 0];
964     dst_b[0] = src[0 * src_stride + 1];
965     dst_a[1] = src[1 * src_stride + 0];
966     dst_b[1] = src[1 * src_stride + 1];
967     dst_a[2] = src[2 * src_stride + 0];
968     dst_b[2] = src[2 * src_stride + 1];
969     dst_a[3] = src[3 * src_stride + 0];
970     dst_b[3] = src[3 * src_stride + 1];
971     dst_a[4] = src[4 * src_stride + 0];
972     dst_b[4] = src[4 * src_stride + 1];
973     dst_a[5] = src[5 * src_stride + 0];
974     dst_b[5] = src[5 * src_stride + 1];
975     dst_a[6] = src[6 * src_stride + 0];
976     dst_b[6] = src[6 * src_stride + 1];
977     dst_a[7] = src[7 * src_stride + 0];
978     dst_b[7] = src[7 * src_stride + 1];
979     src += 2;
980     dst_a += dst_stride_a;
981     dst_b += dst_stride_b;
982   }
983 }
984
985 static void TransposeUVWxH_C(const uint8* src, int src_stride,
986                              uint8* dst_a, int dst_stride_a,
987                              uint8* dst_b, int dst_stride_b,
988                              int width, int height) {
989   int i;
990   for (i = 0; i < width * 2; i += 2) {
991     int j;
992     for (j = 0; j < height; ++j) {
993       dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
994       dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
995     }
996   }
997 }
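
// The UV helpers read interleaved UVUV... rows and write the U and V samples
// to separate transposed planes: dst_a[i][j] = src[j][2 * i] and
// dst_b[i][j] = src[j][2 * i + 1], so each output plane is height bytes wide
// and width rows tall.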
998
999 LIBYUV_API
1000 void TransposeUV(const uint8* src, int src_stride,
1001                  uint8* dst_a, int dst_stride_a,
1002                  uint8* dst_b, int dst_stride_b,
1003                  int width, int height) {
1004   int i = height;
1005   void (*TransposeUVWx8)(const uint8* src, int src_stride,
1006                          uint8* dst_a, int dst_stride_a,
1007                          uint8* dst_b, int dst_stride_b,
1008                          int width) = TransposeUVWx8_C;
1009 #if defined(HAS_TRANSPOSE_UVWX8_NEON)
1010   if (TestCpuFlag(kCpuHasNEON)) {
1011     TransposeUVWx8 = TransposeUVWx8_NEON;
1012   }
1013 #elif defined(HAS_TRANSPOSE_UVWX8_SSE2)
1014   if (TestCpuFlag(kCpuHasSSE2) &&
1015       IS_ALIGNED(width, 8) &&
1016       IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
1017     TransposeUVWx8 = TransposeUVWx8_SSE2;
1018   }
1019 #elif defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2)
1020   if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) &&
1021       IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
1022     TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2;
1023   }
1024 #endif
1025
1026   // Work through the source in 8x8 tiles.
1027   while (i >= 8) {
1028     TransposeUVWx8(src, src_stride,
1029                    dst_a, dst_stride_a,
1030                    dst_b, dst_stride_b,
1031                    width);
1032     src += 8 * src_stride;    // Go down 8 rows.
1033     dst_a += 8;               // Move over 8 columns.
1034     dst_b += 8;               // Move over 8 columns.
1035     i -= 8;
1036   }
1037
1038   TransposeUVWxH_C(src, src_stride,
1039                    dst_a, dst_stride_a,
1040                    dst_b, dst_stride_b,
1041                    width, i);
1042 }
1043
1044 LIBYUV_API
1045 void RotateUV90(const uint8* src, int src_stride,
1046                 uint8* dst_a, int dst_stride_a,
1047                 uint8* dst_b, int dst_stride_b,
1048                 int width, int height) {
1049   src += src_stride * (height - 1);
1050   src_stride = -src_stride;
1051
1052   TransposeUV(src, src_stride,
1053               dst_a, dst_stride_a,
1054               dst_b, dst_stride_b,
1055               width, height);
1056 }
1057
1058 LIBYUV_API
1059 void RotateUV270(const uint8* src, int src_stride,
1060                  uint8* dst_a, int dst_stride_a,
1061                  uint8* dst_b, int dst_stride_b,
1062                  int width, int height) {
1063   dst_a += dst_stride_a * (width - 1);
1064   dst_b += dst_stride_b * (width - 1);
1065   dst_stride_a = -dst_stride_a;
1066   dst_stride_b = -dst_stride_b;
1067
1068   TransposeUV(src, src_stride,
1069               dst_a, dst_stride_a,
1070               dst_b, dst_stride_b,
1071               width, height);
1072 }
1073
1074 // Rotate 180 is a horizontal and vertical flip.
1075 LIBYUV_API
1076 void RotateUV180(const uint8* src, int src_stride,
1077                  uint8* dst_a, int dst_stride_a,
1078                  uint8* dst_b, int dst_stride_b,
1079                  int width, int height) {
1080   int i;
1081   void (*MirrorRowUV)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
1082       MirrorUVRow_C;
1083 #if defined(HAS_MIRRORUVROW_NEON)
1084   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
1085     MirrorRowUV = MirrorUVRow_NEON;
1086   }
1087 #elif defined(HAS_MIRRORROW_UV_SSSE3)
1088   if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
1089       IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
1090     MirrorRowUV = MirrorUVRow_SSSE3;
1091   }
1092 #elif defined(HAS_MIRRORUVROW_MIPS_DSPR2)
1093   if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
1094       IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
1095     MirrorRowUV = MirrorUVRow_MIPS_DSPR2;
1096   }
1097 #endif
1098
1099   dst_a += dst_stride_a * (height - 1);
1100   dst_b += dst_stride_b * (height - 1);
1101
1102   for (i = 0; i < height; ++i) {
1103     MirrorRowUV(src, dst_a, dst_b, width);
1104     src += src_stride;
1105     dst_a -= dst_stride_a;
1106     dst_b -= dst_stride_b;
1107   }
1108 }
1109
1110 LIBYUV_API
1111 int RotatePlane(const uint8* src, int src_stride,
1112                 uint8* dst, int dst_stride,
1113                 int width, int height,
1114                 enum RotationMode mode) {
1115   if (!src || width <= 0 || height == 0 || !dst) {
1116     return -1;
1117   }
1118
1119   // Negative height means invert the image.
1120   if (height < 0) {
1121     height = -height;
1122     src = src + (height - 1) * src_stride;
1123     src_stride = -src_stride;
1124   }
1125
1126   switch (mode) {
1127     case kRotate0:
1128       // copy frame
1129       CopyPlane(src, src_stride,
1130                 dst, dst_stride,
1131                 width, height);
1132       return 0;
1133     case kRotate90:
1134       RotatePlane90(src, src_stride,
1135                     dst, dst_stride,
1136                     width, height);
1137       return 0;
1138     case kRotate270:
1139       RotatePlane270(src, src_stride,
1140                      dst, dst_stride,
1141                      width, height);
1142       return 0;
1143     case kRotate180:
1144       RotatePlane180(src, src_stride,
1145                      dst, dst_stride,
1146                      width, height);
1147       return 0;
1148     default:
1149       break;
1150   }
1151   return -1;
1152 }
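
// Minimal usage sketch (buffer names and dimensions are illustrative only):
// rotating a 640 x 480 plane by 90 or 270 degrees produces a 480 x 640 plane,
// so the destination stride below is the source height.
//   uint8 src_plane[640 * 480];
//   uint8 dst_plane[480 * 640];
//   RotatePlane(src_plane, 640, dst_plane, 480, 640, 480, kRotate90);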
1153
1154 LIBYUV_API
1155 int I420Rotate(const uint8* src_y, int src_stride_y,
1156                const uint8* src_u, int src_stride_u,
1157                const uint8* src_v, int src_stride_v,
1158                uint8* dst_y, int dst_stride_y,
1159                uint8* dst_u, int dst_stride_u,
1160                uint8* dst_v, int dst_stride_v,
1161                int width, int height,
1162                enum RotationMode mode) {
1163   int halfwidth = (width + 1) >> 1;
1164   int halfheight = (height + 1) >> 1;
1165   if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
1166       !dst_y || !dst_u || !dst_v) {
1167     return -1;
1168   }
1169
1170   // Negative height means invert the image.
1171   if (height < 0) {
1172     height = -height;
1173     halfheight = (height + 1) >> 1;
1174     src_y = src_y + (height - 1) * src_stride_y;
1175     src_u = src_u + (halfheight - 1) * src_stride_u;
1176     src_v = src_v + (halfheight - 1) * src_stride_v;
1177     src_stride_y = -src_stride_y;
1178     src_stride_u = -src_stride_u;
1179     src_stride_v = -src_stride_v;
1180   }
1181
1182   switch (mode) {
1183     case kRotate0:
1184       // copy frame
1185       return I420Copy(src_y, src_stride_y,
1186                       src_u, src_stride_u,
1187                       src_v, src_stride_v,
1188                       dst_y, dst_stride_y,
1189                       dst_u, dst_stride_u,
1190                       dst_v, dst_stride_v,
1191                       width, height);
1192     case kRotate90:
1193       RotatePlane90(src_y, src_stride_y,
1194                     dst_y, dst_stride_y,
1195                     width, height);
1196       RotatePlane90(src_u, src_stride_u,
1197                     dst_u, dst_stride_u,
1198                     halfwidth, halfheight);
1199       RotatePlane90(src_v, src_stride_v,
1200                     dst_v, dst_stride_v,
1201                     halfwidth, halfheight);
1202       return 0;
1203     case kRotate270:
1204       RotatePlane270(src_y, src_stride_y,
1205                      dst_y, dst_stride_y,
1206                      width, height);
1207       RotatePlane270(src_u, src_stride_u,
1208                      dst_u, dst_stride_u,
1209                      halfwidth, halfheight);
1210       RotatePlane270(src_v, src_stride_v,
1211                      dst_v, dst_stride_v,
1212                      halfwidth, halfheight);
1213       return 0;
1214     case kRotate180:
1215       RotatePlane180(src_y, src_stride_y,
1216                      dst_y, dst_stride_y,
1217                      width, height);
1218       RotatePlane180(src_u, src_stride_u,
1219                      dst_u, dst_stride_u,
1220                      halfwidth, halfheight);
1221       RotatePlane180(src_v, src_stride_v,
1222                      dst_v, dst_stride_v,
1223                      halfwidth, halfheight);
1224       return 0;
1225     default:
1226       break;
1227   }
1228   return -1;
1229 }
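
// For kRotate90 and kRotate270 the destination planes have swapped dimensions:
// dst_stride_y must hold at least height bytes per row, and dst_stride_u /
// dst_stride_v at least halfheight bytes per row, since the rotated chroma
// planes are halfheight wide and halfwidth tall.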
1230
1231 LIBYUV_API
1232 int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
1233                      const uint8* src_uv, int src_stride_uv,
1234                      uint8* dst_y, int dst_stride_y,
1235                      uint8* dst_u, int dst_stride_u,
1236                      uint8* dst_v, int dst_stride_v,
1237                      int width, int height,
1238                      enum RotationMode mode) {
1239   int halfwidth = (width + 1) >> 1;
1240   int halfheight = (height + 1) >> 1;
1241   if (!src_y || !src_uv || width <= 0 || height == 0 ||
1242       !dst_y || !dst_u || !dst_v) {
1243     return -1;
1244   }
1245
1246   // Negative height means invert the image.
1247   if (height < 0) {
1248     height = -height;
1249     halfheight = (height + 1) >> 1;
1250     src_y = src_y + (height - 1) * src_stride_y;
1251     src_uv = src_uv + (halfheight - 1) * src_stride_uv;
1252     src_stride_y = -src_stride_y;
1253     src_stride_uv = -src_stride_uv;
1254   }
1255
1256   switch (mode) {
1257     case kRotate0:
1258       // copy frame
1259       return NV12ToI420(src_y, src_stride_y,
1260                         src_uv, src_stride_uv,
1261                         dst_y, dst_stride_y,
1262                         dst_u, dst_stride_u,
1263                         dst_v, dst_stride_v,
1264                         width, height);
1265     case kRotate90:
1266       RotatePlane90(src_y, src_stride_y,
1267                     dst_y, dst_stride_y,
1268                     width, height);
1269       RotateUV90(src_uv, src_stride_uv,
1270                  dst_u, dst_stride_u,
1271                  dst_v, dst_stride_v,
1272                  halfwidth, halfheight);
1273       return 0;
1274     case kRotate270:
1275       RotatePlane270(src_y, src_stride_y,
1276                      dst_y, dst_stride_y,
1277                      width, height);
1278       RotateUV270(src_uv, src_stride_uv,
1279                   dst_u, dst_stride_u,
1280                   dst_v, dst_stride_v,
1281                   halfwidth, halfheight);
1282       return 0;
1283     case kRotate180:
1284       RotatePlane180(src_y, src_stride_y,
1285                      dst_y, dst_stride_y,
1286                      width, height);
1287       RotateUV180(src_uv, src_stride_uv,
1288                   dst_u, dst_stride_u,
1289                   dst_v, dst_stride_v,
1290                   halfwidth, halfheight);
1291       return 0;
1292     default:
1293       break;
1294   }
1295   return -1;
1296 }
1297
1298 #ifdef __cplusplus
1299 }  // extern "C"
1300 }  // namespace libyuv
1301 #endif