1 /*
2  *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS. All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10
11 #include "libyuv/rotate.h"
12
13 #include "libyuv/cpu_id.h"
14 #include "libyuv/convert.h"
15 #include "libyuv/planar_functions.h"
16 #include "libyuv/row.h"
17
18 #ifdef __cplusplus
19 namespace libyuv {
20 extern "C" {
21 #endif
22
23 #if !defined(LIBYUV_DISABLE_X86) && \
24     (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
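// DECLARE_FUNCTION emits the assembler boilerplate (.text section, alignment
// and a label) for the file-scope asm definition of TransposeUVWx8_SSE2 below.
// Apple i386 and 32-bit MinGW/Cygwin targets need a leading underscore on the
// symbol; Apple additionally marks it .private_extern.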
25 #if defined(__APPLE__) && defined(__i386__)
26 #define DECLARE_FUNCTION(name)                                                 \
27     ".text                                     \n"                             \
28     ".private_extern _" #name "                \n"                             \
29     ".align 4,0x90                             \n"                             \
30 "_" #name ":                                   \n"
31 #elif (defined(__MINGW32__) || defined(__CYGWIN__)) && defined(__i386__)
32 #define DECLARE_FUNCTION(name)                                                 \
33     ".text                                     \n"                             \
34     ".align 4,0x90                             \n"                             \
35 "_" #name ":                                   \n"
36 #else
37 #define DECLARE_FUNCTION(name)                                                 \
38     ".text                                     \n"                             \
39     ".align 4,0x90                             \n"                             \
40 #name ":                                       \n"
41 #endif
42 #endif
43
44 #if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
45     (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
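// Prototypes only; the NEON implementations live in the NEON-specific rotate
// sources of libyuv (e.g. rotate_neon.cc).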
46 #define HAS_TRANSPOSE_WX8_NEON
47 void TransposeWx8_NEON(const uint8* src, int src_stride,
48                        uint8* dst, int dst_stride, int width);
49 #define HAS_TRANSPOSE_UVWX8_NEON
50 void TransposeUVWx8_NEON(const uint8* src, int src_stride,
51                          uint8* dst_a, int dst_stride_a,
52                          uint8* dst_b, int dst_stride_b,
53                          int width);
54 #endif
55
56 #if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
57     defined(__mips__) && \
58     defined(__mips_dsp) && (__mips_dsp_rev >= 2)
59 #define HAS_TRANSPOSE_WX8_MIPS_DSPR2
60 void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
61                              uint8* dst, int dst_stride, int width);
62
63 void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride,
64                                   uint8* dst, int dst_stride, int width);
65 #define HAS_TRANSPOSE_UVWx8_MIPS_DSPR2
66 void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
67                                uint8* dst_a, int dst_stride_a,
68                                uint8* dst_b, int dst_stride_b,
69                                int width);
70 #endif  // defined(__mips__)
71
72 #if !defined(LIBYUV_DISABLE_X86) && \
73     defined(_M_IX86) && defined(_MSC_VER)
74 #define HAS_TRANSPOSE_WX8_SSSE3
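// Transposes an 8x8 block of bytes: eight 8-byte rows are loaded with movq and
// interleaved in three rounds (punpcklbw, punpcklwd, punpckldq) so that each
// output row holds one source column.  palignr (SSSE3) extracts the high
// halves.  Stack offsets are +12 because edi, esi and ebp are pushed on entry.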
75 __declspec(naked) __declspec(align(16))
76 static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
77                                uint8* dst, int dst_stride, int width) {
78   __asm {
79     push      edi
80     push      esi
81     push      ebp
82     mov       eax, [esp + 12 + 4]   // src
83     mov       edi, [esp + 12 + 8]   // src_stride
84     mov       edx, [esp + 12 + 12]  // dst
85     mov       esi, [esp + 12 + 16]  // dst_stride
86     mov       ecx, [esp + 12 + 20]  // width
87
88     // Read in the data from the source pointer.
89     // First round of bit swap.
90     align      4
91  convertloop:
92     movq      xmm0, qword ptr [eax]
93     lea       ebp, [eax + 8]
94     movq      xmm1, qword ptr [eax + edi]
95     lea       eax, [eax + 2 * edi]
96     punpcklbw xmm0, xmm1
97     movq      xmm2, qword ptr [eax]
98     movdqa    xmm1, xmm0
99     palignr   xmm1, xmm1, 8
100     movq      xmm3, qword ptr [eax + edi]
101     lea       eax, [eax + 2 * edi]
102     punpcklbw xmm2, xmm3
103     movdqa    xmm3, xmm2
104     movq      xmm4, qword ptr [eax]
105     palignr   xmm3, xmm3, 8
106     movq      xmm5, qword ptr [eax + edi]
107     punpcklbw xmm4, xmm5
108     lea       eax, [eax + 2 * edi]
109     movdqa    xmm5, xmm4
110     movq      xmm6, qword ptr [eax]
111     palignr   xmm5, xmm5, 8
112     movq      xmm7, qword ptr [eax + edi]
113     punpcklbw xmm6, xmm7
114     mov       eax, ebp
115     movdqa    xmm7, xmm6
116     palignr   xmm7, xmm7, 8
117     // Second round of bit swap.
118     punpcklwd xmm0, xmm2
119     punpcklwd xmm1, xmm3
120     movdqa    xmm2, xmm0
121     movdqa    xmm3, xmm1
122     palignr   xmm2, xmm2, 8
123     palignr   xmm3, xmm3, 8
124     punpcklwd xmm4, xmm6
125     punpcklwd xmm5, xmm7
126     movdqa    xmm6, xmm4
127     movdqa    xmm7, xmm5
128     palignr   xmm6, xmm6, 8
129     palignr   xmm7, xmm7, 8
130     // Third round of bit swap.
131     // Write to the destination pointer.
132     punpckldq xmm0, xmm4
133     movq      qword ptr [edx], xmm0
134     movdqa    xmm4, xmm0
135     palignr   xmm4, xmm4, 8
136     movq      qword ptr [edx + esi], xmm4
137     lea       edx, [edx + 2 * esi]
138     punpckldq xmm2, xmm6
139     movdqa    xmm6, xmm2
140     palignr   xmm6, xmm6, 8
141     movq      qword ptr [edx], xmm2
142     punpckldq xmm1, xmm5
143     movq      qword ptr [edx + esi], xmm6
144     lea       edx, [edx + 2 * esi]
145     movdqa    xmm5, xmm1
146     movq      qword ptr [edx], xmm1
147     palignr   xmm5, xmm5, 8
148     punpckldq xmm3, xmm7
149     movq      qword ptr [edx + esi], xmm5
150     lea       edx, [edx + 2 * esi]
151     movq      qword ptr [edx], xmm3
152     movdqa    xmm7, xmm3
153     palignr   xmm7, xmm7, 8
154     sub       ecx, 8
155     movq      qword ptr [edx + esi], xmm7
156     lea       edx, [edx + 2 * esi]
157     jg        convertloop
158
159     pop       ebp
160     pop       esi
161     pop       edi
162     ret
163   }
164 }
165
166 #define HAS_TRANSPOSE_UVWX8_SSE2
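// Transposes an 8x8 block of interleaved UV pixels and deinterleaves it:
// movlpd writes the U half to dst_a and movhpd writes the V half to dst_b.
// Only 8 xmm registers exist in 32-bit mode, so esp is aligned to 16 bytes and
// a stack slot is used to spill xmm5/xmm6; the original esp is restored from
// [esp + 16] on exit.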
167 __declspec(naked) __declspec(align(16))
168 static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
169                                 uint8* dst_a, int dst_stride_a,
170                                 uint8* dst_b, int dst_stride_b,
171                                 int w) {
172   __asm {
173     push      ebx
174     push      esi
175     push      edi
176     push      ebp
177     mov       eax, [esp + 16 + 4]   // src
178     mov       edi, [esp + 16 + 8]   // src_stride
179     mov       edx, [esp + 16 + 12]  // dst_a
180     mov       esi, [esp + 16 + 16]  // dst_stride_a
181     mov       ebx, [esp + 16 + 20]  // dst_b
182     mov       ebp, [esp + 16 + 24]  // dst_stride_b
183     mov       ecx, esp
184     sub       esp, 4 + 16
185     and       esp, ~15
186     mov       [esp + 16], ecx
187     mov       ecx, [ecx + 16 + 28]  // w
188
189     align      4
190  convertloop:
191     // Read in the data from the source pointer.
192     // First round of bit swap.
193     movdqu    xmm0, [eax]
194     movdqu    xmm1, [eax + edi]
195     lea       eax, [eax + 2 * edi]
196     movdqa    xmm7, xmm0  // use xmm7 as temp register.
197     punpcklbw xmm0, xmm1
198     punpckhbw xmm7, xmm1
199     movdqa    xmm1, xmm7
200     movdqu    xmm2, [eax]
201     movdqu    xmm3, [eax + edi]
202     lea       eax, [eax + 2 * edi]
203     movdqa    xmm7, xmm2
204     punpcklbw xmm2, xmm3
205     punpckhbw xmm7, xmm3
206     movdqa    xmm3, xmm7
207     movdqu    xmm4, [eax]
208     movdqu    xmm5, [eax + edi]
209     lea       eax, [eax + 2 * edi]
210     movdqa    xmm7, xmm4
211     punpcklbw xmm4, xmm5
212     punpckhbw xmm7, xmm5
213     movdqa    xmm5, xmm7
214     movdqu    xmm6, [eax]
215     movdqu    xmm7, [eax + edi]
216     lea       eax, [eax + 2 * edi]
217     movdqu    [esp], xmm5  // backup xmm5
218     neg       edi
219     movdqa    xmm5, xmm6   // use xmm5 as temp register.
220     punpcklbw xmm6, xmm7
221     punpckhbw xmm5, xmm7
222     movdqa    xmm7, xmm5
223     lea       eax, [eax + 8 * edi + 16]
224     neg       edi
225     // Second round of bit swap.
226     movdqa    xmm5, xmm0
227     punpcklwd xmm0, xmm2
228     punpckhwd xmm5, xmm2
229     movdqa    xmm2, xmm5
230     movdqa    xmm5, xmm1
231     punpcklwd xmm1, xmm3
232     punpckhwd xmm5, xmm3
233     movdqa    xmm3, xmm5
234     movdqa    xmm5, xmm4
235     punpcklwd xmm4, xmm6
236     punpckhwd xmm5, xmm6
237     movdqa    xmm6, xmm5
238     movdqu    xmm5, [esp]  // restore xmm5
239     movdqu    [esp], xmm6  // backup xmm6
240     movdqa    xmm6, xmm5    // use xmm6 as temp register.
241     punpcklwd xmm5, xmm7
242     punpckhwd xmm6, xmm7
243     movdqa    xmm7, xmm6
244     // Third round of bit swap.
245     // Write to the destination pointer.
246     movdqa    xmm6, xmm0
247     punpckldq xmm0, xmm4
248     punpckhdq xmm6, xmm4
249     movdqa    xmm4, xmm6
250     movdqu    xmm6, [esp]  // restore xmm6
251     movlpd    qword ptr [edx], xmm0
252     movhpd    qword ptr [ebx], xmm0
253     movlpd    qword ptr [edx + esi], xmm4
254     lea       edx, [edx + 2 * esi]
255     movhpd    qword ptr [ebx + ebp], xmm4
256     lea       ebx, [ebx + 2 * ebp]
257     movdqa    xmm0, xmm2   // use xmm0 as the temp register.
258     punpckldq xmm2, xmm6
259     movlpd    qword ptr [edx], xmm2
260     movhpd    qword ptr [ebx], xmm2
261     punpckhdq xmm0, xmm6
262     movlpd    qword ptr [edx + esi], xmm0
263     lea       edx, [edx + 2 * esi]
264     movhpd    qword ptr [ebx + ebp], xmm0
265     lea       ebx, [ebx + 2 * ebp]
266     movdqa    xmm0, xmm1   // use xmm0 as the temp register.
267     punpckldq xmm1, xmm5
268     movlpd    qword ptr [edx], xmm1
269     movhpd    qword ptr [ebx], xmm1
270     punpckhdq xmm0, xmm5
271     movlpd    qword ptr [edx + esi], xmm0
272     lea       edx, [edx + 2 * esi]
273     movhpd    qword ptr [ebx + ebp], xmm0
274     lea       ebx, [ebx + 2 * ebp]
275     movdqa    xmm0, xmm3   // use xmm0 as the temp register.
276     punpckldq xmm3, xmm7
277     movlpd    qword ptr [edx], xmm3
278     movhpd    qword ptr [ebx], xmm3
279     punpckhdq xmm0, xmm7
280     sub       ecx, 8
281     movlpd    qword ptr [edx + esi], xmm0
282     lea       edx, [edx + 2 * esi]
283     movhpd    qword ptr [ebx + ebp], xmm0
284     lea       ebx, [ebx + 2 * ebp]
285     jg        convertloop
286
287     mov       esp, [esp + 16]
288     pop       ebp
289     pop       edi
290     pop       esi
291     pop       ebx
292     ret
293   }
294 }
295 #endif
296 #if !defined(LIBYUV_DISABLE_X86) && \
297     (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
298 #define HAS_TRANSPOSE_WX8_SSSE3
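// GCC inline-assembly version of the 8x8 byte transpose: %0 = src, %1 = dst,
// %2 = width, %3/%4 = source/destination strides.  The stride is negated
// around "lea 0x8(%0,%3,8),%0" so the source pointer steps back up 8 rows and
// right 8 bytes for the next tile.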
299 static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
300                                uint8* dst, int dst_stride, int width) {
301   asm volatile (
302     // Read in the data from the source pointer.
303     // First round of bit swap.
304     ".p2align  2                                 \n"
305   "1:                                            \n"
306     "movq       (%0),%%xmm0                      \n"
307     "movq       (%0,%3),%%xmm1                   \n"
308     "lea        (%0,%3,2),%0                     \n"
309     "punpcklbw  %%xmm1,%%xmm0                    \n"
310     "movq       (%0),%%xmm2                      \n"
311     "movdqa     %%xmm0,%%xmm1                    \n"
312     "palignr    $0x8,%%xmm1,%%xmm1               \n"
313     "movq       (%0,%3),%%xmm3                   \n"
314     "lea        (%0,%3,2),%0                     \n"
315     "punpcklbw  %%xmm3,%%xmm2                    \n"
316     "movdqa     %%xmm2,%%xmm3                    \n"
317     "movq       (%0),%%xmm4                      \n"
318     "palignr    $0x8,%%xmm3,%%xmm3               \n"
319     "movq       (%0,%3),%%xmm5                   \n"
320     "lea        (%0,%3,2),%0                     \n"
321     "punpcklbw  %%xmm5,%%xmm4                    \n"
322     "movdqa     %%xmm4,%%xmm5                    \n"
323     "movq       (%0),%%xmm6                      \n"
324     "palignr    $0x8,%%xmm5,%%xmm5               \n"
325     "movq       (%0,%3),%%xmm7                   \n"
326     "lea        (%0,%3,2),%0                     \n"
327     "punpcklbw  %%xmm7,%%xmm6                    \n"
328     "neg        %3                               \n"
329     "movdqa     %%xmm6,%%xmm7                    \n"
330     "lea        0x8(%0,%3,8),%0                  \n"
331     "palignr    $0x8,%%xmm7,%%xmm7               \n"
332     "neg        %3                               \n"
333      // Second round of bit swap.
334     "punpcklwd  %%xmm2,%%xmm0                    \n"
335     "punpcklwd  %%xmm3,%%xmm1                    \n"
336     "movdqa     %%xmm0,%%xmm2                    \n"
337     "movdqa     %%xmm1,%%xmm3                    \n"
338     "palignr    $0x8,%%xmm2,%%xmm2               \n"
339     "palignr    $0x8,%%xmm3,%%xmm3               \n"
340     "punpcklwd  %%xmm6,%%xmm4                    \n"
341     "punpcklwd  %%xmm7,%%xmm5                    \n"
342     "movdqa     %%xmm4,%%xmm6                    \n"
343     "movdqa     %%xmm5,%%xmm7                    \n"
344     "palignr    $0x8,%%xmm6,%%xmm6               \n"
345     "palignr    $0x8,%%xmm7,%%xmm7               \n"
346     // Third round of bit swap.
347     // Write to the destination pointer.
348     "punpckldq  %%xmm4,%%xmm0                    \n"
349     "movq       %%xmm0,(%1)                      \n"
350     "movdqa     %%xmm0,%%xmm4                    \n"
351     "palignr    $0x8,%%xmm4,%%xmm4               \n"
352     "movq       %%xmm4,(%1,%4)                   \n"
353     "lea        (%1,%4,2),%1                     \n"
354     "punpckldq  %%xmm6,%%xmm2                    \n"
355     "movdqa     %%xmm2,%%xmm6                    \n"
356     "movq       %%xmm2,(%1)                      \n"
357     "palignr    $0x8,%%xmm6,%%xmm6               \n"
358     "punpckldq  %%xmm5,%%xmm1                    \n"
359     "movq       %%xmm6,(%1,%4)                   \n"
360     "lea        (%1,%4,2),%1                     \n"
361     "movdqa     %%xmm1,%%xmm5                    \n"
362     "movq       %%xmm1,(%1)                      \n"
363     "palignr    $0x8,%%xmm5,%%xmm5               \n"
364     "movq       %%xmm5,(%1,%4)                   \n"
365     "lea        (%1,%4,2),%1                     \n"
366     "punpckldq  %%xmm7,%%xmm3                    \n"
367     "movq       %%xmm3,(%1)                      \n"
368     "movdqa     %%xmm3,%%xmm7                    \n"
369     "palignr    $0x8,%%xmm7,%%xmm7               \n"
370     "sub        $0x8,%2                          \n"
371     "movq       %%xmm7,(%1,%4)                   \n"
372     "lea        (%1,%4,2),%1                     \n"
373     "jg         1b                               \n"
374     : "+r"(src),    // %0
375       "+r"(dst),    // %1
376       "+r"(width)   // %2
377     : "r"((intptr_t)(src_stride)),  // %3
378       "r"((intptr_t)(dst_stride))   // %4
379     : "memory", "cc"
380   #if defined(__SSE2__)
381       , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
382   #endif
383   );
384 }
385
386 #if !defined(LIBYUV_DISABLE_X86) && defined(__i386__)
387 #define HAS_TRANSPOSE_UVWX8_SSE2
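// For 32-bit GCC the UV transpose is declared here and defined in the
// file-scope asm() block below via DECLARE_FUNCTION, because it manages its
// own stack frame (aligning %esp and spilling xmm registers) and uses a
// Native Client compatible return sequence when built for NaCl.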
388 void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
389                          uint8* dst_a, int dst_stride_a,
390                          uint8* dst_b, int dst_stride_b,
391                          int w);
392   asm (
393     DECLARE_FUNCTION(TransposeUVWx8_SSE2)
394     "push   %ebx                               \n"
395     "push   %esi                               \n"
396     "push   %edi                               \n"
397     "push   %ebp                               \n"
398     "mov    0x14(%esp),%eax                    \n"
399     "mov    0x18(%esp),%edi                    \n"
400     "mov    0x1c(%esp),%edx                    \n"
401     "mov    0x20(%esp),%esi                    \n"
402     "mov    0x24(%esp),%ebx                    \n"
403     "mov    0x28(%esp),%ebp                    \n"
404     "mov    %esp,%ecx                          \n"
405     "sub    $0x14,%esp                         \n"
406     "and    $0xfffffff0,%esp                   \n"
407     "mov    %ecx,0x10(%esp)                    \n"
408     "mov    0x2c(%ecx),%ecx                    \n"
409
410 "1:                                            \n"
411     "movdqu (%eax),%xmm0                       \n"
412     "movdqu (%eax,%edi,1),%xmm1                \n"
413     "lea    (%eax,%edi,2),%eax                 \n"
414     "movdqa %xmm0,%xmm7                        \n"
415     "punpcklbw %xmm1,%xmm0                     \n"
416     "punpckhbw %xmm1,%xmm7                     \n"
417     "movdqa %xmm7,%xmm1                        \n"
418     "movdqu (%eax),%xmm2                       \n"
419     "movdqu (%eax,%edi,1),%xmm3                \n"
420     "lea    (%eax,%edi,2),%eax                 \n"
421     "movdqa %xmm2,%xmm7                        \n"
422     "punpcklbw %xmm3,%xmm2                     \n"
423     "punpckhbw %xmm3,%xmm7                     \n"
424     "movdqa %xmm7,%xmm3                        \n"
425     "movdqu (%eax),%xmm4                       \n"
426     "movdqu (%eax,%edi,1),%xmm5                \n"
427     "lea    (%eax,%edi,2),%eax                 \n"
428     "movdqa %xmm4,%xmm7                        \n"
429     "punpcklbw %xmm5,%xmm4                     \n"
430     "punpckhbw %xmm5,%xmm7                     \n"
431     "movdqa %xmm7,%xmm5                        \n"
432     "movdqu (%eax),%xmm6                       \n"
433     "movdqu (%eax,%edi,1),%xmm7                \n"
434     "lea    (%eax,%edi,2),%eax                 \n"
435     "movdqu %xmm5,(%esp)                       \n"
436     "neg    %edi                               \n"
437     "movdqa %xmm6,%xmm5                        \n"
438     "punpcklbw %xmm7,%xmm6                     \n"
439     "punpckhbw %xmm7,%xmm5                     \n"
440     "movdqa %xmm5,%xmm7                        \n"
441     "lea    0x10(%eax,%edi,8),%eax             \n"
442     "neg    %edi                               \n"
443     "movdqa %xmm0,%xmm5                        \n"
444     "punpcklwd %xmm2,%xmm0                     \n"
445     "punpckhwd %xmm2,%xmm5                     \n"
446     "movdqa %xmm5,%xmm2                        \n"
447     "movdqa %xmm1,%xmm5                        \n"
448     "punpcklwd %xmm3,%xmm1                     \n"
449     "punpckhwd %xmm3,%xmm5                     \n"
450     "movdqa %xmm5,%xmm3                        \n"
451     "movdqa %xmm4,%xmm5                        \n"
452     "punpcklwd %xmm6,%xmm4                     \n"
453     "punpckhwd %xmm6,%xmm5                     \n"
454     "movdqa %xmm5,%xmm6                        \n"
455     "movdqu (%esp),%xmm5                       \n"
456     "movdqu %xmm6,(%esp)                       \n"
457     "movdqa %xmm5,%xmm6                        \n"
458     "punpcklwd %xmm7,%xmm5                     \n"
459     "punpckhwd %xmm7,%xmm6                     \n"
460     "movdqa %xmm6,%xmm7                        \n"
461     "movdqa %xmm0,%xmm6                        \n"
462     "punpckldq %xmm4,%xmm0                     \n"
463     "punpckhdq %xmm4,%xmm6                     \n"
464     "movdqa %xmm6,%xmm4                        \n"
465     "movdqu (%esp),%xmm6                       \n"
466     "movlpd %xmm0,(%edx)                       \n"
467     "movhpd %xmm0,(%ebx)                       \n"
468     "movlpd %xmm4,(%edx,%esi,1)                \n"
469     "lea    (%edx,%esi,2),%edx                 \n"
470     "movhpd %xmm4,(%ebx,%ebp,1)                \n"
471     "lea    (%ebx,%ebp,2),%ebx                 \n"
472     "movdqa %xmm2,%xmm0                        \n"
473     "punpckldq %xmm6,%xmm2                     \n"
474     "movlpd %xmm2,(%edx)                       \n"
475     "movhpd %xmm2,(%ebx)                       \n"
476     "punpckhdq %xmm6,%xmm0                     \n"
477     "movlpd %xmm0,(%edx,%esi,1)                \n"
478     "lea    (%edx,%esi,2),%edx                 \n"
479     "movhpd %xmm0,(%ebx,%ebp,1)                \n"
480     "lea    (%ebx,%ebp,2),%ebx                 \n"
481     "movdqa %xmm1,%xmm0                        \n"
482     "punpckldq %xmm5,%xmm1                     \n"
483     "movlpd %xmm1,(%edx)                       \n"
484     "movhpd %xmm1,(%ebx)                       \n"
485     "punpckhdq %xmm5,%xmm0                     \n"
486     "movlpd %xmm0,(%edx,%esi,1)                \n"
487     "lea    (%edx,%esi,2),%edx                 \n"
488     "movhpd %xmm0,(%ebx,%ebp,1)                \n"
489     "lea    (%ebx,%ebp,2),%ebx                 \n"
490     "movdqa %xmm3,%xmm0                        \n"
491     "punpckldq %xmm7,%xmm3                     \n"
492     "movlpd %xmm3,(%edx)                       \n"
493     "movhpd %xmm3,(%ebx)                       \n"
494     "punpckhdq %xmm7,%xmm0                     \n"
495     "sub    $0x8,%ecx                          \n"
496     "movlpd %xmm0,(%edx,%esi,1)                \n"
497     "lea    (%edx,%esi,2),%edx                 \n"
498     "movhpd %xmm0,(%ebx,%ebp,1)                \n"
499     "lea    (%ebx,%ebp,2),%ebx                 \n"
500     "jg     1b                                 \n"
501     "mov    0x10(%esp),%esp                    \n"
502     "pop    %ebp                               \n"
503     "pop    %edi                               \n"
504     "pop    %esi                               \n"
505     "pop    %ebx                               \n"
506 #if defined(__native_client__)
507     "pop    %ecx                               \n"
508     "and    $0xffffffe0,%ecx                   \n"
509     "jmp    *%ecx                              \n"
510 #else
511     "ret                                       \n"
512 #endif
513 );
514 #endif
515 #if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
516     defined(__x86_64__)
517 // The 64-bit version has enough registers to transpose 16x8 blocks to 8x16 at a time.
518 #define HAS_TRANSPOSE_WX8_FAST_SSSE3
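// Uses xmm8-xmm15 (only addressable in 64-bit mode) so a 16-byte-wide block is
// transposed per pass ("sub $0x10,%2"); the caller therefore requires width to
// be a multiple of 16.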
519 static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
520                                     uint8* dst, int dst_stride, int width) {
521   asm volatile (
522   // Read in the data from the source pointer.
523   // First round of bit swap.
524   ".p2align  2                                 \n"
525 "1:                                            \n"
526   "movdqu     (%0),%%xmm0                      \n"
527   "movdqu     (%0,%3),%%xmm1                   \n"
528   "lea        (%0,%3,2),%0                     \n"
529   "movdqa     %%xmm0,%%xmm8                    \n"
530   "punpcklbw  %%xmm1,%%xmm0                    \n"
531   "punpckhbw  %%xmm1,%%xmm8                    \n"
532   "movdqu     (%0),%%xmm2                      \n"
533   "movdqa     %%xmm0,%%xmm1                    \n"
534   "movdqa     %%xmm8,%%xmm9                    \n"
535   "palignr    $0x8,%%xmm1,%%xmm1               \n"
536   "palignr    $0x8,%%xmm9,%%xmm9               \n"
537   "movdqu     (%0,%3),%%xmm3                   \n"
538   "lea        (%0,%3,2),%0                     \n"
539   "movdqa     %%xmm2,%%xmm10                   \n"
540   "punpcklbw  %%xmm3,%%xmm2                    \n"
541   "punpckhbw  %%xmm3,%%xmm10                   \n"
542   "movdqa     %%xmm2,%%xmm3                    \n"
543   "movdqa     %%xmm10,%%xmm11                  \n"
544   "movdqu     (%0),%%xmm4                      \n"
545   "palignr    $0x8,%%xmm3,%%xmm3               \n"
546   "palignr    $0x8,%%xmm11,%%xmm11             \n"
547   "movdqu     (%0,%3),%%xmm5                   \n"
548   "lea        (%0,%3,2),%0                     \n"
549   "movdqa     %%xmm4,%%xmm12                   \n"
550   "punpcklbw  %%xmm5,%%xmm4                    \n"
551   "punpckhbw  %%xmm5,%%xmm12                   \n"
552   "movdqa     %%xmm4,%%xmm5                    \n"
553   "movdqa     %%xmm12,%%xmm13                  \n"
554   "movdqu     (%0),%%xmm6                      \n"
555   "palignr    $0x8,%%xmm5,%%xmm5               \n"
556   "palignr    $0x8,%%xmm13,%%xmm13             \n"
557   "movdqu     (%0,%3),%%xmm7                   \n"
558   "lea        (%0,%3,2),%0                     \n"
559   "movdqa     %%xmm6,%%xmm14                   \n"
560   "punpcklbw  %%xmm7,%%xmm6                    \n"
561   "punpckhbw  %%xmm7,%%xmm14                   \n"
562   "neg        %3                               \n"
563   "movdqa     %%xmm6,%%xmm7                    \n"
564   "movdqa     %%xmm14,%%xmm15                  \n"
565   "lea        0x10(%0,%3,8),%0                 \n"
566   "palignr    $0x8,%%xmm7,%%xmm7               \n"
567   "palignr    $0x8,%%xmm15,%%xmm15             \n"
568   "neg        %3                               \n"
569    // Second round of bit swap.
570   "punpcklwd  %%xmm2,%%xmm0                    \n"
571   "punpcklwd  %%xmm3,%%xmm1                    \n"
572   "movdqa     %%xmm0,%%xmm2                    \n"
573   "movdqa     %%xmm1,%%xmm3                    \n"
574   "palignr    $0x8,%%xmm2,%%xmm2               \n"
575   "palignr    $0x8,%%xmm3,%%xmm3               \n"
576   "punpcklwd  %%xmm6,%%xmm4                    \n"
577   "punpcklwd  %%xmm7,%%xmm5                    \n"
578   "movdqa     %%xmm4,%%xmm6                    \n"
579   "movdqa     %%xmm5,%%xmm7                    \n"
580   "palignr    $0x8,%%xmm6,%%xmm6               \n"
581   "palignr    $0x8,%%xmm7,%%xmm7               \n"
582   "punpcklwd  %%xmm10,%%xmm8                   \n"
583   "punpcklwd  %%xmm11,%%xmm9                   \n"
584   "movdqa     %%xmm8,%%xmm10                   \n"
585   "movdqa     %%xmm9,%%xmm11                   \n"
586   "palignr    $0x8,%%xmm10,%%xmm10             \n"
587   "palignr    $0x8,%%xmm11,%%xmm11             \n"
588   "punpcklwd  %%xmm14,%%xmm12                  \n"
589   "punpcklwd  %%xmm15,%%xmm13                  \n"
590   "movdqa     %%xmm12,%%xmm14                  \n"
591   "movdqa     %%xmm13,%%xmm15                  \n"
592   "palignr    $0x8,%%xmm14,%%xmm14             \n"
593   "palignr    $0x8,%%xmm15,%%xmm15             \n"
594   // Third round of bit swap.
595   // Write to the destination pointer.
596   "punpckldq  %%xmm4,%%xmm0                    \n"
597   "movq       %%xmm0,(%1)                      \n"
598   "movdqa     %%xmm0,%%xmm4                    \n"
599   "palignr    $0x8,%%xmm4,%%xmm4               \n"
600   "movq       %%xmm4,(%1,%4)                   \n"
601   "lea        (%1,%4,2),%1                     \n"
602   "punpckldq  %%xmm6,%%xmm2                    \n"
603   "movdqa     %%xmm2,%%xmm6                    \n"
604   "movq       %%xmm2,(%1)                      \n"
605   "palignr    $0x8,%%xmm6,%%xmm6               \n"
606   "punpckldq  %%xmm5,%%xmm1                    \n"
607   "movq       %%xmm6,(%1,%4)                   \n"
608   "lea        (%1,%4,2),%1                     \n"
609   "movdqa     %%xmm1,%%xmm5                    \n"
610   "movq       %%xmm1,(%1)                      \n"
611   "palignr    $0x8,%%xmm5,%%xmm5               \n"
612   "movq       %%xmm5,(%1,%4)                   \n"
613   "lea        (%1,%4,2),%1                     \n"
614   "punpckldq  %%xmm7,%%xmm3                    \n"
615   "movq       %%xmm3,(%1)                      \n"
616   "movdqa     %%xmm3,%%xmm7                    \n"
617   "palignr    $0x8,%%xmm7,%%xmm7               \n"
618   "movq       %%xmm7,(%1,%4)                   \n"
619   "lea        (%1,%4,2),%1                     \n"
620   "punpckldq  %%xmm12,%%xmm8                   \n"
621   "movq       %%xmm8,(%1)                      \n"
622   "movdqa     %%xmm8,%%xmm12                   \n"
623   "palignr    $0x8,%%xmm12,%%xmm12             \n"
624   "movq       %%xmm12,(%1,%4)                  \n"
625   "lea        (%1,%4,2),%1                     \n"
626   "punpckldq  %%xmm14,%%xmm10                  \n"
627   "movdqa     %%xmm10,%%xmm14                  \n"
628   "movq       %%xmm10,(%1)                     \n"
629   "palignr    $0x8,%%xmm14,%%xmm14             \n"
630   "punpckldq  %%xmm13,%%xmm9                   \n"
631   "movq       %%xmm14,(%1,%4)                  \n"
632   "lea        (%1,%4,2),%1                     \n"
633   "movdqa     %%xmm9,%%xmm13                   \n"
634   "movq       %%xmm9,(%1)                      \n"
635   "palignr    $0x8,%%xmm13,%%xmm13             \n"
636   "movq       %%xmm13,(%1,%4)                  \n"
637   "lea        (%1,%4,2),%1                     \n"
638   "punpckldq  %%xmm15,%%xmm11                  \n"
639   "movq       %%xmm11,(%1)                     \n"
640   "movdqa     %%xmm11,%%xmm15                  \n"
641   "palignr    $0x8,%%xmm15,%%xmm15             \n"
642   "sub        $0x10,%2                         \n"
643   "movq       %%xmm15,(%1,%4)                  \n"
644   "lea        (%1,%4,2),%1                     \n"
645   "jg         1b                               \n"
646   : "+r"(src),    // %0
647     "+r"(dst),    // %1
648     "+r"(width)   // %2
649   : "r"((intptr_t)(src_stride)),  // %3
650     "r"((intptr_t)(dst_stride))   // %4
651   : "memory", "cc",
652     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
653     "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",  "xmm14",  "xmm15"
654 );
655 }
656
657 #define HAS_TRANSPOSE_UVWX8_SSE2
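// 64-bit version of the UV transpose: xmm8/xmm9 serve as scratch registers, so
// no stack spills are needed.  dst_a receives the U bytes (movlpd) and dst_b
// the V bytes (movhpd).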
658 static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
659                                 uint8* dst_a, int dst_stride_a,
660                                 uint8* dst_b, int dst_stride_b,
661                                 int w) {
662   asm volatile (
663   // Read in the data from the source pointer.
664   // First round of bit swap.
665   ".p2align  2                                 \n"
666 "1:                                            \n"
667   "movdqu     (%0),%%xmm0                      \n"
668   "movdqu     (%0,%4),%%xmm1                   \n"
669   "lea        (%0,%4,2),%0                     \n"
670   "movdqa     %%xmm0,%%xmm8                    \n"
671   "punpcklbw  %%xmm1,%%xmm0                    \n"
672   "punpckhbw  %%xmm1,%%xmm8                    \n"
673   "movdqa     %%xmm8,%%xmm1                    \n"
674   "movdqu     (%0),%%xmm2                      \n"
675   "movdqu     (%0,%4),%%xmm3                   \n"
676   "lea        (%0,%4,2),%0                     \n"
677   "movdqa     %%xmm2,%%xmm8                    \n"
678   "punpcklbw  %%xmm3,%%xmm2                    \n"
679   "punpckhbw  %%xmm3,%%xmm8                    \n"
680   "movdqa     %%xmm8,%%xmm3                    \n"
681   "movdqu     (%0),%%xmm4                      \n"
682   "movdqu     (%0,%4),%%xmm5                   \n"
683   "lea        (%0,%4,2),%0                     \n"
684   "movdqa     %%xmm4,%%xmm8                    \n"
685   "punpcklbw  %%xmm5,%%xmm4                    \n"
686   "punpckhbw  %%xmm5,%%xmm8                    \n"
687   "movdqa     %%xmm8,%%xmm5                    \n"
688   "movdqu     (%0),%%xmm6                      \n"
689   "movdqu     (%0,%4),%%xmm7                   \n"
690   "lea        (%0,%4,2),%0                     \n"
691   "movdqa     %%xmm6,%%xmm8                    \n"
692   "punpcklbw  %%xmm7,%%xmm6                    \n"
693   "neg        %4                               \n"
694   "lea        0x10(%0,%4,8),%0                 \n"
695   "punpckhbw  %%xmm7,%%xmm8                    \n"
696   "movdqa     %%xmm8,%%xmm7                    \n"
697   "neg        %4                               \n"
698    // Second round of bit swap.
699   "movdqa     %%xmm0,%%xmm8                    \n"
700   "movdqa     %%xmm1,%%xmm9                    \n"
701   "punpckhwd  %%xmm2,%%xmm8                    \n"
702   "punpckhwd  %%xmm3,%%xmm9                    \n"
703   "punpcklwd  %%xmm2,%%xmm0                    \n"
704   "punpcklwd  %%xmm3,%%xmm1                    \n"
705   "movdqa     %%xmm8,%%xmm2                    \n"
706   "movdqa     %%xmm9,%%xmm3                    \n"
707   "movdqa     %%xmm4,%%xmm8                    \n"
708   "movdqa     %%xmm5,%%xmm9                    \n"
709   "punpckhwd  %%xmm6,%%xmm8                    \n"
710   "punpckhwd  %%xmm7,%%xmm9                    \n"
711   "punpcklwd  %%xmm6,%%xmm4                    \n"
712   "punpcklwd  %%xmm7,%%xmm5                    \n"
713   "movdqa     %%xmm8,%%xmm6                    \n"
714   "movdqa     %%xmm9,%%xmm7                    \n"
715   // Third round of bit swap.
716   // Write to the destination pointer.
717   "movdqa     %%xmm0,%%xmm8                    \n"
718   "punpckldq  %%xmm4,%%xmm0                    \n"
719   "movlpd     %%xmm0,(%1)                      \n"  // Write back U channel
720   "movhpd     %%xmm0,(%2)                      \n"  // Write back V channel
721   "punpckhdq  %%xmm4,%%xmm8                    \n"
722   "movlpd     %%xmm8,(%1,%5)                   \n"
723   "lea        (%1,%5,2),%1                     \n"
724   "movhpd     %%xmm8,(%2,%6)                   \n"
725   "lea        (%2,%6,2),%2                     \n"
726   "movdqa     %%xmm2,%%xmm8                    \n"
727   "punpckldq  %%xmm6,%%xmm2                    \n"
728   "movlpd     %%xmm2,(%1)                      \n"
729   "movhpd     %%xmm2,(%2)                      \n"
730   "punpckhdq  %%xmm6,%%xmm8                    \n"
731   "movlpd     %%xmm8,(%1,%5)                   \n"
732   "lea        (%1,%5,2),%1                     \n"
733   "movhpd     %%xmm8,(%2,%6)                   \n"
734   "lea        (%2,%6,2),%2                     \n"
735   "movdqa     %%xmm1,%%xmm8                    \n"
736   "punpckldq  %%xmm5,%%xmm1                    \n"
737   "movlpd     %%xmm1,(%1)                      \n"
738   "movhpd     %%xmm1,(%2)                      \n"
739   "punpckhdq  %%xmm5,%%xmm8                    \n"
740   "movlpd     %%xmm8,(%1,%5)                   \n"
741   "lea        (%1,%5,2),%1                     \n"
742   "movhpd     %%xmm8,(%2,%6)                   \n"
743   "lea        (%2,%6,2),%2                     \n"
744   "movdqa     %%xmm3,%%xmm8                    \n"
745   "punpckldq  %%xmm7,%%xmm3                    \n"
746   "movlpd     %%xmm3,(%1)                      \n"
747   "movhpd     %%xmm3,(%2)                      \n"
748   "punpckhdq  %%xmm7,%%xmm8                    \n"
749   "sub        $0x8,%3                          \n"
750   "movlpd     %%xmm8,(%1,%5)                   \n"
751   "lea        (%1,%5,2),%1                     \n"
752   "movhpd     %%xmm8,(%2,%6)                   \n"
753   "lea        (%2,%6,2),%2                     \n"
754   "jg         1b                               \n"
755   : "+r"(src),    // %0
756     "+r"(dst_a),  // %1
757     "+r"(dst_b),  // %2
758     "+r"(w)   // %3
759   : "r"((intptr_t)(src_stride)),    // %4
760     "r"((intptr_t)(dst_stride_a)),  // %5
761     "r"((intptr_t)(dst_stride_b))   // %6
762   : "memory", "cc",
763     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
764     "xmm8", "xmm9"
765 );
766 }
767 #endif
768 #endif
769
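// Portable fallback: for each of the 'width' source columns, copy 8 vertically
// adjacent source bytes into one 8-byte destination row.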
770 static void TransposeWx8_C(const uint8* src, int src_stride,
771                            uint8* dst, int dst_stride,
772                            int width) {
773   int i;
774   for (i = 0; i < width; ++i) {
775     dst[0] = src[0 * src_stride];
776     dst[1] = src[1 * src_stride];
777     dst[2] = src[2 * src_stride];
778     dst[3] = src[3 * src_stride];
779     dst[4] = src[4 * src_stride];
780     dst[5] = src[5 * src_stride];
781     dst[6] = src[6 * src_stride];
782     dst[7] = src[7 * src_stride];
783     ++src;
784     dst += dst_stride;
785   }
786 }
787
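// General scalar transpose, used for the final partial tile of fewer than
// 8 rows.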
788 static void TransposeWxH_C(const uint8* src, int src_stride,
789                            uint8* dst, int dst_stride,
790                            int width, int height) {
791   int i;
792   for (i = 0; i < width; ++i) {
793     int j;
794     for (j = 0; j < height; ++j) {
795       dst[i * dst_stride + j] = src[j * src_stride + i];
796     }
797   }
798 }
799
800 LIBYUV_API
801 void TransposePlane(const uint8* src, int src_stride,
802                     uint8* dst, int dst_stride,
803                     int width, int height) {
804   int i = height;
805   void (*TransposeWx8)(const uint8* src, int src_stride,
806                        uint8* dst, int dst_stride,
807                        int width) = TransposeWx8_C;
808 #if defined(HAS_TRANSPOSE_WX8_NEON)
809   if (TestCpuFlag(kCpuHasNEON)) {
810     TransposeWx8 = TransposeWx8_NEON;
811   }
812 #endif
813 #if defined(HAS_TRANSPOSE_WX8_SSSE3)
814   if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
815     TransposeWx8 = TransposeWx8_SSSE3;
816   }
817 #endif
818 #if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3)
819   if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
820     TransposeWx8 = TransposeWx8_FAST_SSSE3;
821   }
822 #endif
823 #if defined(HAS_TRANSPOSE_WX8_MIPS_DSPR2)
824   if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
825     if (IS_ALIGNED(width, 4) &&
826         IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
827       TransposeWx8 = TransposeWx8_FAST_MIPS_DSPR2;
828     } else {
829       TransposeWx8 = TransposeWx8_MIPS_DSPR2;
830     }
831   }
832 #endif
833
834   // Work through the source in 8x8 tiles.
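  // e.g. for height = 13 the loop handles rows 0..7 and the TransposeWxH_C
  // call below handles the remaining 5 rows.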
835   while (i >= 8) {
836     TransposeWx8(src, src_stride, dst, dst_stride, width);
837     src += 8 * src_stride;    // Go down 8 rows.
838     dst += 8;                 // Move over 8 columns.
839     i -= 8;
840   }
841
842   TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
843 }
844
845 LIBYUV_API
846 void RotatePlane90(const uint8* src, int src_stride,
847                    uint8* dst, int dst_stride,
848                    int width, int height) {
849   // Rotate by 90 is a transpose with the source read
850   // from bottom to top. So set the source pointer to the end
851   // of the buffer and flip the sign of the source stride.
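  // Net effect: the pixel at source (row r, col c) lands at destination
  // (row c, col height - 1 - r), i.e. a clockwise rotation.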
852   src += src_stride * (height - 1);
853   src_stride = -src_stride;
854   TransposePlane(src, src_stride, dst, dst_stride, width, height);
855 }
856
857 LIBYUV_API
858 void RotatePlane270(const uint8* src, int src_stride,
859                     uint8* dst, int dst_stride,
860                     int width, int height) {
861   // Rotate by 270 is a transpose with the destination written
862   // from bottom to top. So set the destination pointer to the end
863   // of the buffer and flip the sign of the destination stride.
864   dst += dst_stride * (width - 1);
865   dst_stride = -dst_stride;
866   TransposePlane(src, src_stride, dst, dst_stride, width, height);
867 }
868
869 LIBYUV_API
870 void RotatePlane180(const uint8* src, int src_stride,
871                     uint8* dst, int dst_stride,
872                     int width, int height) {
873   // Swap first and last row and mirror the content. Uses a temporary row.
874   align_buffer_64(row, width);
875   const uint8* src_bot = src + src_stride * (height - 1);
876   uint8* dst_bot = dst + dst_stride * (height - 1);
877   int half_height = (height + 1) >> 1;
878   int y;
879   void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
880   void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
881 #if defined(HAS_MIRRORROW_NEON)
882   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
883     MirrorRow = MirrorRow_NEON;
884   }
885 #endif
886 #if defined(HAS_MIRRORROW_SSE2)
887   if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
888     MirrorRow = MirrorRow_SSE2;
889   }
890 #endif
891 #if defined(HAS_MIRRORROW_SSSE3)
892   if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
893     MirrorRow = MirrorRow_SSSE3;
894   }
895 #endif
896 #if defined(HAS_MIRRORROW_AVX2)
897   if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
898     MirrorRow = MirrorRow_AVX2;
899   }
900 #endif
901 // TODO(fbarchard): Make MirrorRow on MIPS handle unaligned memory.
902 #if defined(HAS_MIRRORROW_MIPS_DSPR2)
903   if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
904       IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) &&
905       IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) {
906     MirrorRow = MirrorRow_MIPS_DSPR2;
907   }
908 #endif
909 #if defined(HAS_COPYROW_NEON)
910   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
911     CopyRow = CopyRow_NEON;
912   }
913 #endif
914 #if defined(HAS_COPYROW_X86)
915   if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
916     CopyRow = CopyRow_X86;
917   }
918 #endif
919 #if defined(HAS_COPYROW_SSE2)
920   if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
921     CopyRow = CopyRow_SSE2;
922   }
923 #endif
924 #if defined(HAS_COPYROW_AVX)
925   if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) {
926     CopyRow = CopyRow_AVX;
927   }
928 #endif
929 #if defined(HAS_COPYROW_ERMS)
930   if (TestCpuFlag(kCpuHasERMS)) {
931     CopyRow = CopyRow_ERMS;
932   }
933 #endif
934 #if defined(HAS_COPYROW_MIPS)
935   if (TestCpuFlag(kCpuHasMIPS)) {
936     CopyRow = CopyRow_MIPS;
937   }
938 #endif
939
940   // Odd height will harmlessly mirror the middle row twice.
941   for (y = 0; y < half_height; ++y) {
942     MirrorRow(src, row, width);  // Mirror first row into a buffer
943     src += src_stride;
944     MirrorRow(src_bot, dst, width);  // Mirror last row into first row
945     dst += dst_stride;
946     CopyRow(row, dst_bot, width);  // Copy first mirrored row into last
947     src_bot -= src_stride;
948     dst_bot -= dst_stride;
949   }
950   free_aligned_buffer_64(row);
951 }
952
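// Scalar fallback for the UV transpose: 'width' counts UV pairs.  Each
// iteration reads one interleaved UV pair from 8 consecutive rows and writes
// 8 U bytes to dst_a and 8 V bytes to dst_b.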
953 static void TransposeUVWx8_C(const uint8* src, int src_stride,
954                              uint8* dst_a, int dst_stride_a,
955                              uint8* dst_b, int dst_stride_b,
956                              int width) {
957   int i;
958   for (i = 0; i < width; ++i) {
959     dst_a[0] = src[0 * src_stride + 0];
960     dst_b[0] = src[0 * src_stride + 1];
961     dst_a[1] = src[1 * src_stride + 0];
962     dst_b[1] = src[1 * src_stride + 1];
963     dst_a[2] = src[2 * src_stride + 0];
964     dst_b[2] = src[2 * src_stride + 1];
965     dst_a[3] = src[3 * src_stride + 0];
966     dst_b[3] = src[3 * src_stride + 1];
967     dst_a[4] = src[4 * src_stride + 0];
968     dst_b[4] = src[4 * src_stride + 1];
969     dst_a[5] = src[5 * src_stride + 0];
970     dst_b[5] = src[5 * src_stride + 1];
971     dst_a[6] = src[6 * src_stride + 0];
972     dst_b[6] = src[6 * src_stride + 1];
973     dst_a[7] = src[7 * src_stride + 0];
974     dst_b[7] = src[7 * src_stride + 1];
975     src += 2;
976     dst_a += dst_stride_a;
977     dst_b += dst_stride_b;
978   }
979 }
980
981 static void TransposeUVWxH_C(const uint8* src, int src_stride,
982                              uint8* dst_a, int dst_stride_a,
983                              uint8* dst_b, int dst_stride_b,
984                              int width, int height) {
985   int i;
986   for (i = 0; i < width * 2; i += 2) {
987     int j;
988     for (j = 0; j < height; ++j) {
989       dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
990       dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
991     }
992   }
993 }
994
995 LIBYUV_API
996 void TransposeUV(const uint8* src, int src_stride,
997                  uint8* dst_a, int dst_stride_a,
998                  uint8* dst_b, int dst_stride_b,
999                  int width, int height) {
1000   int i = height;
1001   void (*TransposeUVWx8)(const uint8* src, int src_stride,
1002                          uint8* dst_a, int dst_stride_a,
1003                          uint8* dst_b, int dst_stride_b,
1004                          int width) = TransposeUVWx8_C;
1005 #if defined(HAS_TRANSPOSE_UVWX8_NEON)
1006   if (TestCpuFlag(kCpuHasNEON)) {
1007     TransposeUVWx8 = TransposeUVWx8_NEON;
1008   }
1009 #endif
1010 #if defined(HAS_TRANSPOSE_UVWX8_SSE2)
1011   if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) {
1012     TransposeUVWx8 = TransposeUVWx8_SSE2;
1013   }
1014 #endif
1015 #if defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2)
1016   if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) &&
1017       IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
1018     TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2;
1019   }
1020 #endif
1021
1022   // Work through the source in 8x8 tiles.
1023   while (i >= 8) {
1024     TransposeUVWx8(src, src_stride,
1025                    dst_a, dst_stride_a,
1026                    dst_b, dst_stride_b,
1027                    width);
1028     src += 8 * src_stride;    // Go down 8 rows.
1029     dst_a += 8;               // Move over 8 columns.
1030     dst_b += 8;               // Move over 8 columns.
1031     i -= 8;
1032   }
1033
1034   TransposeUVWxH_C(src, src_stride,
1035                    dst_a, dst_stride_a,
1036                    dst_b, dst_stride_b,
1037                    width, i);
1038 }
1039
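// Rotate an interleaved UV plane by 90 degrees into separate U and V planes:
// the same bottom-up source trick as RotatePlane90, followed by the UV
// transpose.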
1040 LIBYUV_API
1041 void RotateUV90(const uint8* src, int src_stride,
1042                 uint8* dst_a, int dst_stride_a,
1043                 uint8* dst_b, int dst_stride_b,
1044                 int width, int height) {
1045   src += src_stride * (height - 1);
1046   src_stride = -src_stride;
1047
1048   TransposeUV(src, src_stride,
1049               dst_a, dst_stride_a,
1050               dst_b, dst_stride_b,
1051               width, height);
1052 }
1053
1054 LIBYUV_API
1055 void RotateUV270(const uint8* src, int src_stride,
1056                  uint8* dst_a, int dst_stride_a,
1057                  uint8* dst_b, int dst_stride_b,
1058                  int width, int height) {
1059   dst_a += dst_stride_a * (width - 1);
1060   dst_b += dst_stride_b * (width - 1);
1061   dst_stride_a = -dst_stride_a;
1062   dst_stride_b = -dst_stride_b;
1063
1064   TransposeUV(src, src_stride,
1065               dst_a, dst_stride_a,
1066               dst_b, dst_stride_b,
1067               width, height);
1068 }
1069
1070 // Rotate 180 is a horizontal and vertical flip.
1071 LIBYUV_API
1072 void RotateUV180(const uint8* src, int src_stride,
1073                  uint8* dst_a, int dst_stride_a,
1074                  uint8* dst_b, int dst_stride_b,
1075                  int width, int height) {
1076   int i;
1077   void (*MirrorRowUV)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
1078       MirrorUVRow_C;
1079 #if defined(HAS_MIRRORUVROW_NEON)
1080   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
1081     MirrorRowUV = MirrorUVRow_NEON;
1082   }
1083 #endif
1084 #if defined(HAS_MIRRORROW_UV_SSSE3)
1085   if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
1086     MirrorRowUV = MirrorUVRow_SSSE3;
1087   }
1088 #endif
1089 #if defined(HAS_MIRRORUVROW_MIPS_DSPR2)
1090   if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
1091       IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
1092     MirrorRowUV = MirrorUVRow_MIPS_DSPR2;
1093   }
1094 #endif
1095
1096   dst_a += dst_stride_a * (height - 1);
1097   dst_b += dst_stride_b * (height - 1);
1098
1099   for (i = 0; i < height; ++i) {
1100     MirrorRowUV(src, dst_a, dst_b, width);
1101     src += src_stride;
1102     dst_a -= dst_stride_a;
1103     dst_b -= dst_stride_b;
1104   }
1105 }
1106
1107 LIBYUV_API
1108 int RotatePlane(const uint8* src, int src_stride,
1109                 uint8* dst, int dst_stride,
1110                 int width, int height,
1111                 enum RotationMode mode) {
1112   if (!src || width <= 0 || height == 0 || !dst) {
1113     return -1;
1114   }
1115
1116   // Negative height means invert the image.
1117   if (height < 0) {
1118     height = -height;
1119     src = src + (height - 1) * src_stride;
1120     src_stride = -src_stride;
1121   }
1122
1123   switch (mode) {
1124     case kRotate0:
1125       // copy frame
1126       CopyPlane(src, src_stride,
1127                 dst, dst_stride,
1128                 width, height);
1129       return 0;
1130     case kRotate90:
1131       RotatePlane90(src, src_stride,
1132                     dst, dst_stride,
1133                     width, height);
1134       return 0;
1135     case kRotate270:
1136       RotatePlane270(src, src_stride,
1137                      dst, dst_stride,
1138                      width, height);
1139       return 0;
1140     case kRotate180:
1141       RotatePlane180(src, src_stride,
1142                      dst, dst_stride,
1143                      width, height);
1144       return 0;
1145     default:
1146       break;
1147   }
1148   return -1;
1149 }
1150
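// Illustrative call (not part of the library; buffer names are assumptions):
// rotating a w x h I420 frame by 90 degrees into destination planes sized for
// an h x w frame, e.g.
//   I420Rotate(src_y, w, src_u, (w + 1) / 2, src_v, (w + 1) / 2,
//              dst_y, h, dst_u, (h + 1) / 2, dst_v, (h + 1) / 2,
//              w, h, kRotate90);
// where dst_y holds h * w bytes and dst_u/dst_v each hold
// ((h + 1) / 2) * ((w + 1) / 2) bytes.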
1151 LIBYUV_API
1152 int I420Rotate(const uint8* src_y, int src_stride_y,
1153                const uint8* src_u, int src_stride_u,
1154                const uint8* src_v, int src_stride_v,
1155                uint8* dst_y, int dst_stride_y,
1156                uint8* dst_u, int dst_stride_u,
1157                uint8* dst_v, int dst_stride_v,
1158                int width, int height,
1159                enum RotationMode mode) {
1160   int halfwidth = (width + 1) >> 1;
1161   int halfheight = (height + 1) >> 1;
1162   if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
1163       !dst_y || !dst_u || !dst_v) {
1164     return -1;
1165   }
1166
1167   // Negative height means invert the image.
1168   if (height < 0) {
1169     height = -height;
1170     halfheight = (height + 1) >> 1;
1171     src_y = src_y + (height - 1) * src_stride_y;
1172     src_u = src_u + (halfheight - 1) * src_stride_u;
1173     src_v = src_v + (halfheight - 1) * src_stride_v;
1174     src_stride_y = -src_stride_y;
1175     src_stride_u = -src_stride_u;
1176     src_stride_v = -src_stride_v;
1177   }
1178
1179   switch (mode) {
1180     case kRotate0:
1181       // copy frame
1182       return I420Copy(src_y, src_stride_y,
1183                       src_u, src_stride_u,
1184                       src_v, src_stride_v,
1185                       dst_y, dst_stride_y,
1186                       dst_u, dst_stride_u,
1187                       dst_v, dst_stride_v,
1188                       width, height);
1189     case kRotate90:
1190       RotatePlane90(src_y, src_stride_y,
1191                     dst_y, dst_stride_y,
1192                     width, height);
1193       RotatePlane90(src_u, src_stride_u,
1194                     dst_u, dst_stride_u,
1195                     halfwidth, halfheight);
1196       RotatePlane90(src_v, src_stride_v,
1197                     dst_v, dst_stride_v,
1198                     halfwidth, halfheight);
1199       return 0;
1200     case kRotate270:
1201       RotatePlane270(src_y, src_stride_y,
1202                      dst_y, dst_stride_y,
1203                      width, height);
1204       RotatePlane270(src_u, src_stride_u,
1205                      dst_u, dst_stride_u,
1206                      halfwidth, halfheight);
1207       RotatePlane270(src_v, src_stride_v,
1208                      dst_v, dst_stride_v,
1209                      halfwidth, halfheight);
1210       return 0;
1211     case kRotate180:
1212       RotatePlane180(src_y, src_stride_y,
1213                      dst_y, dst_stride_y,
1214                      width, height);
1215       RotatePlane180(src_u, src_stride_u,
1216                      dst_u, dst_stride_u,
1217                      halfwidth, halfheight);
1218       RotatePlane180(src_v, src_stride_v,
1219                      dst_v, dst_stride_v,
1220                      halfwidth, halfheight);
1221       return 0;
1222     default:
1223       break;
1224   }
1225   return -1;
1226 }
1227
1228 LIBYUV_API
1229 int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
1230                      const uint8* src_uv, int src_stride_uv,
1231                      uint8* dst_y, int dst_stride_y,
1232                      uint8* dst_u, int dst_stride_u,
1233                      uint8* dst_v, int dst_stride_v,
1234                      int width, int height,
1235                      enum RotationMode mode) {
1236   int halfwidth = (width + 1) >> 1;
1237   int halfheight = (height + 1) >> 1;
1238   if (!src_y || !src_uv || width <= 0 || height == 0 ||
1239       !dst_y || !dst_u || !dst_v) {
1240     return -1;
1241   }
1242
1243   // Negative height means invert the image.
1244   if (height < 0) {
1245     height = -height;
1246     halfheight = (height + 1) >> 1;
1247     src_y = src_y + (height - 1) * src_stride_y;
1248     src_uv = src_uv + (halfheight - 1) * src_stride_uv;
1249     src_stride_y = -src_stride_y;
1250     src_stride_uv = -src_stride_uv;
1251   }
1252
1253   switch (mode) {
1254     case kRotate0:
1255       // copy frame
1256       return NV12ToI420(src_y, src_stride_y,
1257                         src_uv, src_stride_uv,
1258                         dst_y, dst_stride_y,
1259                         dst_u, dst_stride_u,
1260                         dst_v, dst_stride_v,
1261                         width, height);
1262     case kRotate90:
1263       RotatePlane90(src_y, src_stride_y,
1264                     dst_y, dst_stride_y,
1265                     width, height);
1266       RotateUV90(src_uv, src_stride_uv,
1267                  dst_u, dst_stride_u,
1268                  dst_v, dst_stride_v,
1269                  halfwidth, halfheight);
1270       return 0;
1271     case kRotate270:
1272       RotatePlane270(src_y, src_stride_y,
1273                      dst_y, dst_stride_y,
1274                      width, height);
1275       RotateUV270(src_uv, src_stride_uv,
1276                   dst_u, dst_stride_u,
1277                   dst_v, dst_stride_v,
1278                   halfwidth, halfheight);
1279       return 0;
1280     case kRotate180:
1281       RotatePlane180(src_y, src_stride_y,
1282                      dst_y, dst_stride_y,
1283                      width, height);
1284       RotateUV180(src_uv, src_stride_uv,
1285                   dst_u, dst_stride_u,
1286                   dst_v, dst_stride_v,
1287                   halfwidth, halfheight);
1288       return 0;
1289     default:
1290       break;
1291   }
1292   return -1;
1293 }
1294
1295 #ifdef __cplusplus
1296 }  // extern "C"
1297 }  // namespace libyuv
1298 #endif