1 /*
2  *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS. All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10
11 #include "libyuv/rotate.h"
12
13 #include "libyuv/cpu_id.h"
14 #include "libyuv/convert.h"
15 #include "libyuv/planar_functions.h"
16 #include "libyuv/row.h"
17
18 #ifdef __cplusplus
19 namespace libyuv {
20 extern "C" {
21 #endif
22
23 #if !defined(LIBYUV_DISABLE_X86) && \
24     (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
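// DECLARE_FUNCTION emits the assembler boilerplate (.text section, alignment
// and a label) for the file-scope asm definition of TransposeUVWx8_SSE2 below.
// Apple i386 and 32-bit MinGW/Cygwin targets need a leading underscore on the
// symbol; Apple additionally marks it .private_extern.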
25 #if defined(__APPLE__) && defined(__i386__)
26 #define DECLARE_FUNCTION(name)                                                 \
27     ".text                                     \n"                             \
28     ".private_extern _" #name "                \n"                             \
29     ".align 4,0x90                             \n"                             \
30 "_" #name ":                                   \n"
31 #elif (defined(__MINGW32__) || defined(__CYGWIN__)) && defined(__i386__)
32 #define DECLARE_FUNCTION(name)                                                 \
33     ".text                                     \n"                             \
34     ".align 4,0x90                             \n"                             \
35 "_" #name ":                                   \n"
36 #else
37 #define DECLARE_FUNCTION(name)                                                 \
38     ".text                                     \n"                             \
39     ".align 4,0x90                             \n"                             \
40 #name ":                                       \n"
41 #endif
42 #endif
43
44 #if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
45     (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
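// Prototypes only; the NEON implementations live in the NEON-specific rotate
// sources of libyuv (e.g. rotate_neon.cc).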
46 #define HAS_TRANSPOSE_WX8_NEON
47 void TransposeWx8_NEON(const uint8* src, int src_stride,
48                        uint8* dst, int dst_stride, int width);
49 #define HAS_TRANSPOSE_UVWX8_NEON
50 void TransposeUVWx8_NEON(const uint8* src, int src_stride,
51                          uint8* dst_a, int dst_stride_a,
52                          uint8* dst_b, int dst_stride_b,
53                          int width);
54 #endif
55
56 #if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
57     defined(__mips__) && \
58     defined(__mips_dsp) && (__mips_dsp_rev >= 2)
59 #define HAS_TRANSPOSE_WX8_MIPS_DSPR2
60 void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
61                              uint8* dst, int dst_stride, int width);
62
63 void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride,
64                                   uint8* dst, int dst_stride, int width);
65 #define HAS_TRANSPOSE_UVWx8_MIPS_DSPR2
66 void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
67                                uint8* dst_a, int dst_stride_a,
68                                uint8* dst_b, int dst_stride_b,
69                                int width);
70 #endif  // defined(__mips__)
71
72 #if !defined(LIBYUV_DISABLE_X86) && \
73     defined(_M_IX86) && defined(_MSC_VER)
74 #define HAS_TRANSPOSE_WX8_SSSE3
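// Transposes an 8x8 block of bytes: eight 8-byte rows are loaded with movq and
// interleaved in three rounds (punpcklbw, punpcklwd, punpckldq) so that each
// output row holds one source column.  palignr (SSSE3) extracts the high
// halves.  Stack offsets are +12 because edi, esi and ebp are pushed on entry.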
75 __declspec(naked) __declspec(align(16))
76 static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
77                                uint8* dst, int dst_stride, int width) {
78   __asm {
79     push      edi
80     push      esi
81     push      ebp
82     mov       eax, [esp + 12 + 4]   // src
83     mov       edi, [esp + 12 + 8]   // src_stride
84     mov       edx, [esp + 12 + 12]  // dst
85     mov       esi, [esp + 12 + 16]  // dst_stride
86     mov       ecx, [esp + 12 + 20]  // width
87
88     // Read in the data from the source pointer.
89     // First round of bit swap.
90     align      4
91  convertloop:
92     movq      xmm0, qword ptr [eax]
93     lea       ebp, [eax + 8]
94     movq      xmm1, qword ptr [eax + edi]
95     lea       eax, [eax + 2 * edi]
96     punpcklbw xmm0, xmm1
97     movq      xmm2, qword ptr [eax]
98     movdqa    xmm1, xmm0
99     palignr   xmm1, xmm1, 8
100     movq      xmm3, qword ptr [eax + edi]
101     lea       eax, [eax + 2 * edi]
102     punpcklbw xmm2, xmm3
103     movdqa    xmm3, xmm2
104     movq      xmm4, qword ptr [eax]
105     palignr   xmm3, xmm3, 8
106     movq      xmm5, qword ptr [eax + edi]
107     punpcklbw xmm4, xmm5
108     lea       eax, [eax + 2 * edi]
109     movdqa    xmm5, xmm4
110     movq      xmm6, qword ptr [eax]
111     palignr   xmm5, xmm5, 8
112     movq      xmm7, qword ptr [eax + edi]
113     punpcklbw xmm6, xmm7
114     mov       eax, ebp
115     movdqa    xmm7, xmm6
116     palignr   xmm7, xmm7, 8
117     // Second round of bit swap.
118     punpcklwd xmm0, xmm2
119     punpcklwd xmm1, xmm3
120     movdqa    xmm2, xmm0
121     movdqa    xmm3, xmm1
122     palignr   xmm2, xmm2, 8
123     palignr   xmm3, xmm3, 8
124     punpcklwd xmm4, xmm6
125     punpcklwd xmm5, xmm7
126     movdqa    xmm6, xmm4
127     movdqa    xmm7, xmm5
128     palignr   xmm6, xmm6, 8
129     palignr   xmm7, xmm7, 8
130     // Third round of bit swap.
131     // Write to the destination pointer.
132     punpckldq xmm0, xmm4
133     movq      qword ptr [edx], xmm0
134     movdqa    xmm4, xmm0
135     palignr   xmm4, xmm4, 8
136     movq      qword ptr [edx + esi], xmm4
137     lea       edx, [edx + 2 * esi]
138     punpckldq xmm2, xmm6
139     movdqa    xmm6, xmm2
140     palignr   xmm6, xmm6, 8
141     movq      qword ptr [edx], xmm2
142     punpckldq xmm1, xmm5
143     movq      qword ptr [edx + esi], xmm6
144     lea       edx, [edx + 2 * esi]
145     movdqa    xmm5, xmm1
146     movq      qword ptr [edx], xmm1
147     palignr   xmm5, xmm5, 8
148     punpckldq xmm3, xmm7
149     movq      qword ptr [edx + esi], xmm5
150     lea       edx, [edx + 2 * esi]
151     movq      qword ptr [edx], xmm3
152     movdqa    xmm7, xmm3
153     palignr   xmm7, xmm7, 8
154     sub       ecx, 8
155     movq      qword ptr [edx + esi], xmm7
156     lea       edx, [edx + 2 * esi]
157     jg        convertloop
158
159     pop       ebp
160     pop       esi
161     pop       edi
162     ret
163   }
164 }
165
166 #define HAS_TRANSPOSE_UVWX8_SSE2
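// Transposes an 8x8 block of interleaved UV pixels and deinterleaves it:
// movlpd writes the U half to dst_a and movhpd writes the V half to dst_b.
// Only 8 xmm registers exist in 32-bit mode, so esp is aligned to 16 bytes and
// a stack slot is used to spill xmm5/xmm6; the original esp is restored from
// [esp + 16] on exit.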
167 __declspec(naked) __declspec(align(16))
168 static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
169                                 uint8* dst_a, int dst_stride_a,
170                                 uint8* dst_b, int dst_stride_b,
171                                 int w) {
172   __asm {
173     push      ebx
174     push      esi
175     push      edi
176     push      ebp
177     mov       eax, [esp + 16 + 4]   // src
178     mov       edi, [esp + 16 + 8]   // src_stride
179     mov       edx, [esp + 16 + 12]  // dst_a
180     mov       esi, [esp + 16 + 16]  // dst_stride_a
181     mov       ebx, [esp + 16 + 20]  // dst_b
182     mov       ebp, [esp + 16 + 24]  // dst_stride_b
183     mov       ecx, esp
184     sub       esp, 4 + 16
185     and       esp, ~15
186     mov       [esp + 16], ecx
187     mov       ecx, [ecx + 16 + 28]  // w
188
189     align      4
190  convertloop:
191     // Read in the data from the source pointer.
192     // First round of bit swap.
193     movdqu    xmm0, [eax]
194     movdqu    xmm1, [eax + edi]
195     lea       eax, [eax + 2 * edi]
196     movdqa    xmm7, xmm0  // use xmm7 as temp register.
197     punpcklbw xmm0, xmm1
198     punpckhbw xmm7, xmm1
199     movdqa    xmm1, xmm7
200     movdqu    xmm2, [eax]
201     movdqu    xmm3, [eax + edi]
202     lea       eax, [eax + 2 * edi]
203     movdqa    xmm7, xmm2
204     punpcklbw xmm2, xmm3
205     punpckhbw xmm7, xmm3
206     movdqa    xmm3, xmm7
207     movdqu    xmm4, [eax]
208     movdqu    xmm5, [eax + edi]
209     lea       eax, [eax + 2 * edi]
210     movdqa    xmm7, xmm4
211     punpcklbw xmm4, xmm5
212     punpckhbw xmm7, xmm5
213     movdqa    xmm5, xmm7
214     movdqu    xmm6, [eax]
215     movdqu    xmm7, [eax + edi]
216     lea       eax, [eax + 2 * edi]
217     movdqu    [esp], xmm5  // backup xmm5
218     neg       edi
219     movdqa    xmm5, xmm6   // use xmm5 as temp register.
220     punpcklbw xmm6, xmm7
221     punpckhbw xmm5, xmm7
222     movdqa    xmm7, xmm5
223     lea       eax, [eax + 8 * edi + 16]
224     neg       edi
225     // Second round of bit swap.
226     movdqa    xmm5, xmm0
227     punpcklwd xmm0, xmm2
228     punpckhwd xmm5, xmm2
229     movdqa    xmm2, xmm5
230     movdqa    xmm5, xmm1
231     punpcklwd xmm1, xmm3
232     punpckhwd xmm5, xmm3
233     movdqa    xmm3, xmm5
234     movdqa    xmm5, xmm4
235     punpcklwd xmm4, xmm6
236     punpckhwd xmm5, xmm6
237     movdqa    xmm6, xmm5
238     movdqu    xmm5, [esp]  // restore xmm5
239     movdqu    [esp], xmm6  // backup xmm6
240     movdqa    xmm6, xmm5    // use xmm6 as temp register.
241     punpcklwd xmm5, xmm7
242     punpckhwd xmm6, xmm7
243     movdqa    xmm7, xmm6
244     // Third round of bit swap.
245     // Write to the destination pointer.
246     movdqa    xmm6, xmm0
247     punpckldq xmm0, xmm4
248     punpckhdq xmm6, xmm4
249     movdqa    xmm4, xmm6
250     movdqu    xmm6, [esp]  // restore xmm6
251     movlpd    qword ptr [edx], xmm0
252     movhpd    qword ptr [ebx], xmm0
253     movlpd    qword ptr [edx + esi], xmm4
254     lea       edx, [edx + 2 * esi]
255     movhpd    qword ptr [ebx + ebp], xmm4
256     lea       ebx, [ebx + 2 * ebp]
257     movdqa    xmm0, xmm2   // use xmm0 as the temp register.
258     punpckldq xmm2, xmm6
259     movlpd    qword ptr [edx], xmm2
260     movhpd    qword ptr [ebx], xmm2
261     punpckhdq xmm0, xmm6
262     movlpd    qword ptr [edx + esi], xmm0
263     lea       edx, [edx + 2 * esi]
264     movhpd    qword ptr [ebx + ebp], xmm0
265     lea       ebx, [ebx + 2 * ebp]
266     movdqa    xmm0, xmm1   // use xmm0 as the temp register.
267     punpckldq xmm1, xmm5
268     movlpd    qword ptr [edx], xmm1
269     movhpd    qword ptr [ebx], xmm1
270     punpckhdq xmm0, xmm5
271     movlpd    qword ptr [edx + esi], xmm0
272     lea       edx, [edx + 2 * esi]
273     movhpd    qword ptr [ebx + ebp], xmm0
274     lea       ebx, [ebx + 2 * ebp]
275     movdqa    xmm0, xmm3   // use xmm0 as the temp register.
276     punpckldq xmm3, xmm7
277     movlpd    qword ptr [edx], xmm3
278     movhpd    qword ptr [ebx], xmm3
279     punpckhdq xmm0, xmm7
280     sub       ecx, 8
281     movlpd    qword ptr [edx + esi], xmm0
282     lea       edx, [edx + 2 * esi]
283     movhpd    qword ptr [ebx + ebp], xmm0
284     lea       ebx, [ebx + 2 * ebp]
285     jg        convertloop
286
287     mov       esp, [esp + 16]
288     pop       ebp
289     pop       edi
290     pop       esi
291     pop       ebx
292     ret
293   }
294 }
295 #endif
296 #if !defined(LIBYUV_DISABLE_X86) && \
297     (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
298 #define HAS_TRANSPOSE_WX8_SSSE3
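// GCC inline-assembly version of the 8x8 byte transpose: %0 = src, %1 = dst,
// %2 = width, %3/%4 = source/destination strides.  The stride is negated
// around "lea 0x8(%0,%3,8),%0" so the source pointer steps back up 8 rows and
// right 8 bytes for the next tile.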
299 static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
300                                uint8* dst, int dst_stride, int width) {
301   asm volatile (
302     // Read in the data from the source pointer.
303     // First round of bit swap.
304     ".p2align  2                                 \n"
305   "1:                                            \n"
306     "movq       (%0),%%xmm0                      \n"
307     "movq       (%0,%3),%%xmm1                   \n"
308     "lea        (%0,%3,2),%0                     \n"
309     "punpcklbw  %%xmm1,%%xmm0                    \n"
310     "movq       (%0),%%xmm2                      \n"
311     "movdqa     %%xmm0,%%xmm1                    \n"
312     "palignr    $0x8,%%xmm1,%%xmm1               \n"
313     "movq       (%0,%3),%%xmm3                   \n"
314     "lea        (%0,%3,2),%0                     \n"
315     "punpcklbw  %%xmm3,%%xmm2                    \n"
316     "movdqa     %%xmm2,%%xmm3                    \n"
317     "movq       (%0),%%xmm4                      \n"
318     "palignr    $0x8,%%xmm3,%%xmm3               \n"
319     "movq       (%0,%3),%%xmm5                   \n"
320     "lea        (%0,%3,2),%0                     \n"
321     "punpcklbw  %%xmm5,%%xmm4                    \n"
322     "movdqa     %%xmm4,%%xmm5                    \n"
323     "movq       (%0),%%xmm6                      \n"
324     "palignr    $0x8,%%xmm5,%%xmm5               \n"
325     "movq       (%0,%3),%%xmm7                   \n"
326     "lea        (%0,%3,2),%0                     \n"
327     "punpcklbw  %%xmm7,%%xmm6                    \n"
328     "neg        %3                               \n"
329     "movdqa     %%xmm6,%%xmm7                    \n"
330     "lea        0x8(%0,%3,8),%0                  \n"
331     "palignr    $0x8,%%xmm7,%%xmm7               \n"
332     "neg        %3                               \n"
333      // Second round of bit swap.
334     "punpcklwd  %%xmm2,%%xmm0                    \n"
335     "punpcklwd  %%xmm3,%%xmm1                    \n"
336     "movdqa     %%xmm0,%%xmm2                    \n"
337     "movdqa     %%xmm1,%%xmm3                    \n"
338     "palignr    $0x8,%%xmm2,%%xmm2               \n"
339     "palignr    $0x8,%%xmm3,%%xmm3               \n"
340     "punpcklwd  %%xmm6,%%xmm4                    \n"
341     "punpcklwd  %%xmm7,%%xmm5                    \n"
342     "movdqa     %%xmm4,%%xmm6                    \n"
343     "movdqa     %%xmm5,%%xmm7                    \n"
344     "palignr    $0x8,%%xmm6,%%xmm6               \n"
345     "palignr    $0x8,%%xmm7,%%xmm7               \n"
346     // Third round of bit swap.
347     // Write to the destination pointer.
348     "punpckldq  %%xmm4,%%xmm0                    \n"
349     "movq       %%xmm0,(%1)                      \n"
350     "movdqa     %%xmm0,%%xmm4                    \n"
351     "palignr    $0x8,%%xmm4,%%xmm4               \n"
352     "movq       %%xmm4,(%1,%4)                   \n"
353     "lea        (%1,%4,2),%1                     \n"
354     "punpckldq  %%xmm6,%%xmm2                    \n"
355     "movdqa     %%xmm2,%%xmm6                    \n"
356     "movq       %%xmm2,(%1)                      \n"
357     "palignr    $0x8,%%xmm6,%%xmm6               \n"
358     "punpckldq  %%xmm5,%%xmm1                    \n"
359     "movq       %%xmm6,(%1,%4)                   \n"
360     "lea        (%1,%4,2),%1                     \n"
361     "movdqa     %%xmm1,%%xmm5                    \n"
362     "movq       %%xmm1,(%1)                      \n"
363     "palignr    $0x8,%%xmm5,%%xmm5               \n"
364     "movq       %%xmm5,(%1,%4)                   \n"
365     "lea        (%1,%4,2),%1                     \n"
366     "punpckldq  %%xmm7,%%xmm3                    \n"
367     "movq       %%xmm3,(%1)                      \n"
368     "movdqa     %%xmm3,%%xmm7                    \n"
369     "palignr    $0x8,%%xmm7,%%xmm7               \n"
370     "sub        $0x8,%2                          \n"
371     "movq       %%xmm7,(%1,%4)                   \n"
372     "lea        (%1,%4,2),%1                     \n"
373     "jg         1b                               \n"
374     : "+r"(src),    // %0
375       "+r"(dst),    // %1
376       "+r"(width)   // %2
377     : "r"((intptr_t)(src_stride)),  // %3
378       "r"((intptr_t)(dst_stride))   // %4
379     : "memory", "cc"
380   #if defined(__SSE2__)
381       , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
382   #endif
383   );
384 }
385
386 #if !defined(LIBYUV_DISABLE_X86) && defined(__i386__)
387 #define HAS_TRANSPOSE_UVWX8_SSE2
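// For 32-bit GCC the UV transpose is declared here and defined in the
// file-scope asm() block below via DECLARE_FUNCTION, because it manages its
// own stack frame (aligning %esp and spilling xmm registers) and uses a
// Native Client compatible return sequence when built for NaCl.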
388 void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
389                          uint8* dst_a, int dst_stride_a,
390                          uint8* dst_b, int dst_stride_b,
391                          int w);
392   asm (
393     DECLARE_FUNCTION(TransposeUVWx8_SSE2)
394     "push   %ebx                               \n"
395     "push   %esi                               \n"
396     "push   %edi                               \n"
397     "push   %ebp                               \n"
398     "mov    0x14(%esp),%eax                    \n"
399     "mov    0x18(%esp),%edi                    \n"
400     "mov    0x1c(%esp),%edx                    \n"
401     "mov    0x20(%esp),%esi                    \n"
402     "mov    0x24(%esp),%ebx                    \n"
403     "mov    0x28(%esp),%ebp                    \n"
404     "mov    %esp,%ecx                          \n"
405     "sub    $0x14,%esp                         \n"
406     "and    $0xfffffff0,%esp                   \n"
407     "mov    %ecx,0x10(%esp)                    \n"
408     "mov    0x2c(%ecx),%ecx                    \n"
409
410 "1:                                            \n"
411     "movdqu (%eax),%xmm0                       \n"
412     "movdqu (%eax,%edi,1),%xmm1                \n"
413     "lea    (%eax,%edi,2),%eax                 \n"
414     "movdqa %xmm0,%xmm7                        \n"
415     "punpcklbw %xmm1,%xmm0                     \n"
416     "punpckhbw %xmm1,%xmm7                     \n"
417     "movdqa %xmm7,%xmm1                        \n"
418     "movdqu (%eax),%xmm2                       \n"
419     "movdqu (%eax,%edi,1),%xmm3                \n"
420     "lea    (%eax,%edi,2),%eax                 \n"
421     "movdqa %xmm2,%xmm7                        \n"
422     "punpcklbw %xmm3,%xmm2                     \n"
423     "punpckhbw %xmm3,%xmm7                     \n"
424     "movdqa %xmm7,%xmm3                        \n"
425     "movdqu (%eax),%xmm4                       \n"
426     "movdqu (%eax,%edi,1),%xmm5                \n"
427     "lea    (%eax,%edi,2),%eax                 \n"
428     "movdqa %xmm4,%xmm7                        \n"
429     "punpcklbw %xmm5,%xmm4                     \n"
430     "punpckhbw %xmm5,%xmm7                     \n"
431     "movdqa %xmm7,%xmm5                        \n"
432     "movdqu (%eax),%xmm6                       \n"
433     "movdqu (%eax,%edi,1),%xmm7                \n"
434     "lea    (%eax,%edi,2),%eax                 \n"
435     "movdqu %xmm5,(%esp)                       \n"
436     "neg    %edi                               \n"
437     "movdqa %xmm6,%xmm5                        \n"
438     "punpcklbw %xmm7,%xmm6                     \n"
439     "punpckhbw %xmm7,%xmm5                     \n"
440     "movdqa %xmm5,%xmm7                        \n"
441     "lea    0x10(%eax,%edi,8),%eax             \n"
442     "neg    %edi                               \n"
443     "movdqa %xmm0,%xmm5                        \n"
444     "punpcklwd %xmm2,%xmm0                     \n"
445     "punpckhwd %xmm2,%xmm5                     \n"
446     "movdqa %xmm5,%xmm2                        \n"
447     "movdqa %xmm1,%xmm5                        \n"
448     "punpcklwd %xmm3,%xmm1                     \n"
449     "punpckhwd %xmm3,%xmm5                     \n"
450     "movdqa %xmm5,%xmm3                        \n"
451     "movdqa %xmm4,%xmm5                        \n"
452     "punpcklwd %xmm6,%xmm4                     \n"
453     "punpckhwd %xmm6,%xmm5                     \n"
454     "movdqa %xmm5,%xmm6                        \n"
455     "movdqu (%esp),%xmm5                       \n"
456     "movdqu %xmm6,(%esp)                       \n"
457     "movdqa %xmm5,%xmm6                        \n"
458     "punpcklwd %xmm7,%xmm5                     \n"
459     "punpckhwd %xmm7,%xmm6                     \n"
460     "movdqa %xmm6,%xmm7                        \n"
461     "movdqa %xmm0,%xmm6                        \n"
462     "punpckldq %xmm4,%xmm0                     \n"
463     "punpckhdq %xmm4,%xmm6                     \n"
464     "movdqa %xmm6,%xmm4                        \n"
465     "movdqu (%esp),%xmm6                       \n"
466     "movlpd %xmm0,(%edx)                       \n"
467     "movhpd %xmm0,(%ebx)                       \n"
468     "movlpd %xmm4,(%edx,%esi,1)                \n"
469     "lea    (%edx,%esi,2),%edx                 \n"
470     "movhpd %xmm4,(%ebx,%ebp,1)                \n"
471     "lea    (%ebx,%ebp,2),%ebx                 \n"
472     "movdqa %xmm2,%xmm0                        \n"
473     "punpckldq %xmm6,%xmm2                     \n"
474     "movlpd %xmm2,(%edx)                       \n"
475     "movhpd %xmm2,(%ebx)                       \n"
476     "punpckhdq %xmm6,%xmm0                     \n"
477     "movlpd %xmm0,(%edx,%esi,1)                \n"
478     "lea    (%edx,%esi,2),%edx                 \n"
479     "movhpd %xmm0,(%ebx,%ebp,1)                \n"
480     "lea    (%ebx,%ebp,2),%ebx                 \n"
481     "movdqa %xmm1,%xmm0                        \n"
482     "punpckldq %xmm5,%xmm1                     \n"
483     "movlpd %xmm1,(%edx)                       \n"
484     "movhpd %xmm1,(%ebx)                       \n"
485     "punpckhdq %xmm5,%xmm0                     \n"
486     "movlpd %xmm0,(%edx,%esi,1)                \n"
487     "lea    (%edx,%esi,2),%edx                 \n"
488     "movhpd %xmm0,(%ebx,%ebp,1)                \n"
489     "lea    (%ebx,%ebp,2),%ebx                 \n"
490     "movdqa %xmm3,%xmm0                        \n"
491     "punpckldq %xmm7,%xmm3                     \n"
492     "movlpd %xmm3,(%edx)                       \n"
493     "movhpd %xmm3,(%ebx)                       \n"
494     "punpckhdq %xmm7,%xmm0                     \n"
495     "sub    $0x8,%ecx                          \n"
496     "movlpd %xmm0,(%edx,%esi,1)                \n"
497     "lea    (%edx,%esi,2),%edx                 \n"
498     "movhpd %xmm0,(%ebx,%ebp,1)                \n"
499     "lea    (%ebx,%ebp,2),%ebx                 \n"
500     "jg     1b                                 \n"
501     "mov    0x10(%esp),%esp                    \n"
502     "pop    %ebp                               \n"
503     "pop    %edi                               \n"
504     "pop    %esi                               \n"
505     "pop    %ebx                               \n"
506 #if defined(__native_client__)
507     "pop    %ecx                               \n"
508     "and    $0xffffffe0,%ecx                   \n"
509     "jmp    *%ecx                              \n"
510 #else
511     "ret                                       \n"
512 #endif
513 );
514 #endif
515 #if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
516     defined(__x86_64__)
517 // The 64-bit version has enough registers to transpose 16x8 blocks to 8x16 at a time.
518 #define HAS_TRANSPOSE_WX8_FAST_SSSE3
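// Uses xmm8-xmm15 (only addressable in 64-bit mode) so a 16-byte-wide block is
// transposed per pass ("sub $0x10,%2"); the caller therefore requires width to
// be a multiple of 16.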
519 static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
520                                     uint8* dst, int dst_stride, int width) {
521   asm volatile (
522   // Read in the data from the source pointer.
523   // First round of bit swap.
524   ".p2align  2                                 \n"
525 "1:                                            \n"
526   "movdqu     (%0),%%xmm0                      \n"
527   "movdqu     (%0,%3),%%xmm1                   \n"
528   "lea        (%0,%3,2),%0                     \n"
529   "movdqa     %%xmm0,%%xmm8                    \n"
530   "punpcklbw  %%xmm1,%%xmm0                    \n"
531   "punpckhbw  %%xmm1,%%xmm8                    \n"
532   "movdqu     (%0),%%xmm2                      \n"
533   "movdqa     %%xmm0,%%xmm1                    \n"
534   "movdqa     %%xmm8,%%xmm9                    \n"
535   "palignr    $0x8,%%xmm1,%%xmm1               \n"
536   "palignr    $0x8,%%xmm9,%%xmm9               \n"
537   "movdqu     (%0,%3),%%xmm3                   \n"
538   "lea        (%0,%3,2),%0                     \n"
539   "movdqa     %%xmm2,%%xmm10                   \n"
540   "punpcklbw  %%xmm3,%%xmm2                    \n"
541   "punpckhbw  %%xmm3,%%xmm10                   \n"
542   "movdqa     %%xmm2,%%xmm3                    \n"
543   "movdqa     %%xmm10,%%xmm11                  \n"
544   "movdqu     (%0),%%xmm4                      \n"
545   "palignr    $0x8,%%xmm3,%%xmm3               \n"
546   "palignr    $0x8,%%xmm11,%%xmm11             \n"
547   "movdqu     (%0,%3),%%xmm5                   \n"
548   "lea        (%0,%3,2),%0                     \n"
549   "movdqa     %%xmm4,%%xmm12                   \n"
550   "punpcklbw  %%xmm5,%%xmm4                    \n"
551   "punpckhbw  %%xmm5,%%xmm12                   \n"
552   "movdqa     %%xmm4,%%xmm5                    \n"
553   "movdqa     %%xmm12,%%xmm13                  \n"
554   "movdqu     (%0),%%xmm6                      \n"
555   "palignr    $0x8,%%xmm5,%%xmm5               \n"
556   "palignr    $0x8,%%xmm13,%%xmm13             \n"
557   "movdqu     (%0,%3),%%xmm7                   \n"
558   "lea        (%0,%3,2),%0                     \n"
559   "movdqa     %%xmm6,%%xmm14                   \n"
560   "punpcklbw  %%xmm7,%%xmm6                    \n"
561   "punpckhbw  %%xmm7,%%xmm14                   \n"
562   "neg        %3                               \n"
563   "movdqa     %%xmm6,%%xmm7                    \n"
564   "movdqa     %%xmm14,%%xmm15                  \n"
565   "lea        0x10(%0,%3,8),%0                 \n"
566   "palignr    $0x8,%%xmm7,%%xmm7               \n"
567   "palignr    $0x8,%%xmm15,%%xmm15             \n"
568   "neg        %3                               \n"
569    // Second round of bit swap.
570   "punpcklwd  %%xmm2,%%xmm0                    \n"
571   "punpcklwd  %%xmm3,%%xmm1                    \n"
572   "movdqa     %%xmm0,%%xmm2                    \n"
573   "movdqa     %%xmm1,%%xmm3                    \n"
574   "palignr    $0x8,%%xmm2,%%xmm2               \n"
575   "palignr    $0x8,%%xmm3,%%xmm3               \n"
576   "punpcklwd  %%xmm6,%%xmm4                    \n"
577   "punpcklwd  %%xmm7,%%xmm5                    \n"
578   "movdqa     %%xmm4,%%xmm6                    \n"
579   "movdqa     %%xmm5,%%xmm7                    \n"
580   "palignr    $0x8,%%xmm6,%%xmm6               \n"
581   "palignr    $0x8,%%xmm7,%%xmm7               \n"
582   "punpcklwd  %%xmm10,%%xmm8                   \n"
583   "punpcklwd  %%xmm11,%%xmm9                   \n"
584   "movdqa     %%xmm8,%%xmm10                   \n"
585   "movdqa     %%xmm9,%%xmm11                   \n"
586   "palignr    $0x8,%%xmm10,%%xmm10             \n"
587   "palignr    $0x8,%%xmm11,%%xmm11             \n"
588   "punpcklwd  %%xmm14,%%xmm12                  \n"
589   "punpcklwd  %%xmm15,%%xmm13                  \n"
590   "movdqa     %%xmm12,%%xmm14                  \n"
591   "movdqa     %%xmm13,%%xmm15                  \n"
592   "palignr    $0x8,%%xmm14,%%xmm14             \n"
593   "palignr    $0x8,%%xmm15,%%xmm15             \n"
594   // Third round of bit swap.
595   // Write to the destination pointer.
596   "punpckldq  %%xmm4,%%xmm0                    \n"
597   "movq       %%xmm0,(%1)                      \n"
598   "movdqa     %%xmm0,%%xmm4                    \n"
599   "palignr    $0x8,%%xmm4,%%xmm4               \n"
600   "movq       %%xmm4,(%1,%4)                   \n"
601   "lea        (%1,%4,2),%1                     \n"
602   "punpckldq  %%xmm6,%%xmm2                    \n"
603   "movdqa     %%xmm2,%%xmm6                    \n"
604   "movq       %%xmm2,(%1)                      \n"
605   "palignr    $0x8,%%xmm6,%%xmm6               \n"
606   "punpckldq  %%xmm5,%%xmm1                    \n"
607   "movq       %%xmm6,(%1,%4)                   \n"
608   "lea        (%1,%4,2),%1                     \n"
609   "movdqa     %%xmm1,%%xmm5                    \n"
610   "movq       %%xmm1,(%1)                      \n"
611   "palignr    $0x8,%%xmm5,%%xmm5               \n"
612   "movq       %%xmm5,(%1,%4)                   \n"
613   "lea        (%1,%4,2),%1                     \n"
614   "punpckldq  %%xmm7,%%xmm3                    \n"
615   "movq       %%xmm3,(%1)                      \n"
616   "movdqa     %%xmm3,%%xmm7                    \n"
617   "palignr    $0x8,%%xmm7,%%xmm7               \n"
618   "movq       %%xmm7,(%1,%4)                   \n"
619   "lea        (%1,%4,2),%1                     \n"
620   "punpckldq  %%xmm12,%%xmm8                   \n"
621   "movq       %%xmm8,(%1)                      \n"
622   "movdqa     %%xmm8,%%xmm12                   \n"
623   "palignr    $0x8,%%xmm12,%%xmm12             \n"
624   "movq       %%xmm12,(%1,%4)                  \n"
625   "lea        (%1,%4,2),%1                     \n"
626   "punpckldq  %%xmm14,%%xmm10                  \n"
627   "movdqa     %%xmm10,%%xmm14                  \n"
628   "movq       %%xmm10,(%1)                     \n"
629   "palignr    $0x8,%%xmm14,%%xmm14             \n"
630   "punpckldq  %%xmm13,%%xmm9                   \n"
631   "movq       %%xmm14,(%1,%4)                  \n"
632   "lea        (%1,%4,2),%1                     \n"
633   "movdqa     %%xmm9,%%xmm13                   \n"
634   "movq       %%xmm9,(%1)                      \n"
635   "palignr    $0x8,%%xmm13,%%xmm13             \n"
636   "movq       %%xmm13,(%1,%4)                  \n"
637   "lea        (%1,%4,2),%1                     \n"
638   "punpckldq  %%xmm15,%%xmm11                  \n"
639   "movq       %%xmm11,(%1)                     \n"
640   "movdqa     %%xmm11,%%xmm15                  \n"
641   "palignr    $0x8,%%xmm15,%%xmm15             \n"
642   "sub        $0x10,%2                         \n"
643   "movq       %%xmm15,(%1,%4)                  \n"
644   "lea        (%1,%4,2),%1                     \n"
645   "jg         1b                               \n"
646   : "+r"(src),    // %0
647     "+r"(dst),    // %1
648     "+r"(width)   // %2
649   : "r"((intptr_t)(src_stride)),  // %3
650     "r"((intptr_t)(dst_stride))   // %4
651   : "memory", "cc",
652     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
653     "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",  "xmm14",  "xmm15"
654 );
655 }
656
657 #define HAS_TRANSPOSE_UVWX8_SSE2
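// 64-bit version of the UV transpose: xmm8/xmm9 serve as scratch registers, so
// no stack spills are needed.  dst_a receives the U bytes (movlpd) and dst_b
// the V bytes (movhpd).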
658 static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
659                                 uint8* dst_a, int dst_stride_a,
660                                 uint8* dst_b, int dst_stride_b,
661                                 int w) {
662   asm volatile (
663   // Read in the data from the source pointer.
664   // First round of bit swap.
665   ".p2align  2                                 \n"
666 "1:                                            \n"
667   "movdqu     (%0),%%xmm0                      \n"
668   "movdqu     (%0,%4),%%xmm1                   \n"
669   "lea        (%0,%4,2),%0                     \n"
670   "movdqa     %%xmm0,%%xmm8                    \n"
671   "punpcklbw  %%xmm1,%%xmm0                    \n"
672   "punpckhbw  %%xmm1,%%xmm8                    \n"
673   "movdqa     %%xmm8,%%xmm1                    \n"
674   "movdqu     (%0),%%xmm2                      \n"
675   "movdqu     (%0,%4),%%xmm3                   \n"
676   "lea        (%0,%4,2),%0                     \n"
677   "movdqa     %%xmm2,%%xmm8                    \n"
678   "punpcklbw  %%xmm3,%%xmm2                    \n"
679   "punpckhbw  %%xmm3,%%xmm8                    \n"
680   "movdqa     %%xmm8,%%xmm3                    \n"
681   "movdqu     (%0),%%xmm4                      \n"
682   "movdqu     (%0,%4),%%xmm5                   \n"
683   "lea        (%0,%4,2),%0                     \n"
684   "movdqa     %%xmm4,%%xmm8                    \n"
685   "punpcklbw  %%xmm5,%%xmm4                    \n"
686   "punpckhbw  %%xmm5,%%xmm8                    \n"
687   "movdqa     %%xmm8,%%xmm5                    \n"
688   "movdqu     (%0),%%xmm6                      \n"
689   "movdqu     (%0,%4),%%xmm7                   \n"
690   "lea        (%0,%4,2),%0                     \n"
691   "movdqa     %%xmm6,%%xmm8                    \n"
692   "punpcklbw  %%xmm7,%%xmm6                    \n"
693   "neg        %4                               \n"
694   "lea        0x10(%0,%4,8),%0                 \n"
695   "punpckhbw  %%xmm7,%%xmm8                    \n"
696   "movdqa     %%xmm8,%%xmm7                    \n"
697   "neg        %4                               \n"
698    // Second round of bit swap.
699   "movdqa     %%xmm0,%%xmm8                    \n"
700   "movdqa     %%xmm1,%%xmm9                    \n"
701   "punpckhwd  %%xmm2,%%xmm8                    \n"
702   "punpckhwd  %%xmm3,%%xmm9                    \n"
703   "punpcklwd  %%xmm2,%%xmm0                    \n"
704   "punpcklwd  %%xmm3,%%xmm1                    \n"
705   "movdqa     %%xmm8,%%xmm2                    \n"
706   "movdqa     %%xmm9,%%xmm3                    \n"
707   "movdqa     %%xmm4,%%xmm8                    \n"
708   "movdqa     %%xmm5,%%xmm9                    \n"
709   "punpckhwd  %%xmm6,%%xmm8                    \n"
710   "punpckhwd  %%xmm7,%%xmm9                    \n"
711   "punpcklwd  %%xmm6,%%xmm4                    \n"
712   "punpcklwd  %%xmm7,%%xmm5                    \n"
713   "movdqa     %%xmm8,%%xmm6                    \n"
714   "movdqa     %%xmm9,%%xmm7                    \n"
715   // Third round of bit swap.
716   // Write to the destination pointer.
717   "movdqa     %%xmm0,%%xmm8                    \n"
718   "punpckldq  %%xmm4,%%xmm0                    \n"
719   "movlpd     %%xmm0,(%1)                      \n"  // Write back U channel
720   "movhpd     %%xmm0,(%2)                      \n"  // Write back V channel
721   "punpckhdq  %%xmm4,%%xmm8                    \n"
722   "movlpd     %%xmm8,(%1,%5)                   \n"
723   "lea        (%1,%5,2),%1                     \n"
724   "movhpd     %%xmm8,(%2,%6)                   \n"
725   "lea        (%2,%6,2),%2                     \n"
726   "movdqa     %%xmm2,%%xmm8                    \n"
727   "punpckldq  %%xmm6,%%xmm2                    \n"
728   "movlpd     %%xmm2,(%1)                      \n"
729   "movhpd     %%xmm2,(%2)                      \n"
730   "punpckhdq  %%xmm6,%%xmm8                    \n"
731   "movlpd     %%xmm8,(%1,%5)                   \n"
732   "lea        (%1,%5,2),%1                     \n"
733   "movhpd     %%xmm8,(%2,%6)                   \n"
734   "lea        (%2,%6,2),%2                     \n"
735   "movdqa     %%xmm1,%%xmm8                    \n"
736   "punpckldq  %%xmm5,%%xmm1                    \n"
737   "movlpd     %%xmm1,(%1)                      \n"
738   "movhpd     %%xmm1,(%2)                      \n"
739   "punpckhdq  %%xmm5,%%xmm8                    \n"
740   "movlpd     %%xmm8,(%1,%5)                   \n"
741   "lea        (%1,%5,2),%1                     \n"
742   "movhpd     %%xmm8,(%2,%6)                   \n"
743   "lea        (%2,%6,2),%2                     \n"
744   "movdqa     %%xmm3,%%xmm8                    \n"
745   "punpckldq  %%xmm7,%%xmm3                    \n"
746   "movlpd     %%xmm3,(%1)                      \n"
747   "movhpd     %%xmm3,(%2)                      \n"
748   "punpckhdq  %%xmm7,%%xmm8                    \n"
749   "sub        $0x8,%3                          \n"
750   "movlpd     %%xmm8,(%1,%5)                   \n"
751   "lea        (%1,%5,2),%1                     \n"
752   "movhpd     %%xmm8,(%2,%6)                   \n"
753   "lea        (%2,%6,2),%2                     \n"
754   "jg         1b                               \n"
755   : "+r"(src),    // %0
756     "+r"(dst_a),  // %1
757     "+r"(dst_b),  // %2
758     "+r"(w)   // %3
759   : "r"((intptr_t)(src_stride)),    // %4
760     "r"((intptr_t)(dst_stride_a)),  // %5
761     "r"((intptr_t)(dst_stride_b))   // %6
762   : "memory", "cc",
763     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
764     "xmm8", "xmm9"
765 );
766 }
767 #endif
768 #endif
769
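// Portable fallback: for each of the 'width' source columns, copy 8 vertically
// adjacent source bytes into one 8-byte destination row.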
770 static void TransposeWx8_C(const uint8* src, int src_stride,
771                            uint8* dst, int dst_stride,
772                            int width) {
773   int i;
774   for (i = 0; i < width; ++i) {
775     dst[0] = src[0 * src_stride];
776     dst[1] = src[1 * src_stride];
777     dst[2] = src[2 * src_stride];
778     dst[3] = src[3 * src_stride];
779     dst[4] = src[4 * src_stride];
780     dst[5] = src[5 * src_stride];
781     dst[6] = src[6 * src_stride];
782     dst[7] = src[7 * src_stride];
783     ++src;
784     dst += dst_stride;
785   }
786 }
787
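// General scalar transpose, used for the final partial tile of fewer than
// 8 rows.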
788 static void TransposeWxH_C(const uint8* src, int src_stride,
789                            uint8* dst, int dst_stride,
790                            int width, int height) {
791   int i;
792   for (i = 0; i < width; ++i) {
793     int j;
794     for (j = 0; j < height; ++j) {
795       dst[i * dst_stride + j] = src[j * src_stride + i];
796     }
797   }
798 }
799
800 LIBYUV_API
801 void TransposePlane(const uint8* src, int src_stride,
802                     uint8* dst, int dst_stride,
803                     int width, int height) {
804   int i = height;
805   void (*TransposeWx8)(const uint8* src, int src_stride,
806                        uint8* dst, int dst_stride,
807                        int width) = TransposeWx8_C;
808 #if defined(HAS_TRANSPOSE_WX8_NEON)
809   if (TestCpuFlag(kCpuHasNEON)) {
810     TransposeWx8 = TransposeWx8_NEON;
811   }
812 #endif
813 #if defined(HAS_TRANSPOSE_WX8_SSSE3)
814   if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
815     TransposeWx8 = TransposeWx8_SSSE3;
816   }
817 #endif
818 #if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3)
819   if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
820     TransposeWx8 = TransposeWx8_FAST_SSSE3;
821   }
822 #endif
823 #if defined(HAS_TRANSPOSE_WX8_MIPS_DSPR2)
824   if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
825     if (IS_ALIGNED(width, 4) &&
826         IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
827       TransposeWx8 = TransposeWx8_FAST_MIPS_DSPR2;
828     } else {
829       TransposeWx8 = TransposeWx8_MIPS_DSPR2;
830     }
831   }
832 #endif
833
834   // Work through the source in 8x8 tiles.
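  // e.g. for height = 13 the loop handles rows 0..7 and the TransposeWxH_C
  // call below handles the remaining 5 rows.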
835   while (i >= 8) {
836     TransposeWx8(src, src_stride, dst, dst_stride, width);
837     src += 8 * src_stride;    // Go down 8 rows.
838     dst += 8;                 // Move over 8 columns.
839     i -= 8;
840   }
841
842   TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
843 }
844
845 LIBYUV_API
846 void RotatePlane90(const uint8* src, int src_stride,
847                    uint8* dst, int dst_stride,
848                    int width, int height) {
849   // Rotate by 90 is a transpose with the source read
850   // from bottom to top. So set the source pointer to the end
851   // of the buffer and flip the sign of the source stride.
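  // Net effect: the pixel at source (row r, col c) lands at destination
  // (row c, col height - 1 - r), i.e. a clockwise rotation.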
852   src += src_stride * (height - 1);
853   src_stride = -src_stride;
854   TransposePlane(src, src_stride, dst, dst_stride, width, height);
855 }
856
857 LIBYUV_API
858 void RotatePlane270(const uint8* src, int src_stride,
859                     uint8* dst, int dst_stride,
860                     int width, int height) {
861   // Rotate by 270 is a transpose with the destination written
862   // from bottom to top. So set the destination pointer to the end
863   // of the buffer and flip the sign of the destination stride.
864   dst += dst_stride * (width - 1);
865   dst_stride = -dst_stride;
866   TransposePlane(src, src_stride, dst, dst_stride, width, height);
867 }
868
869 LIBYUV_API
870 void RotatePlane180(const uint8* src, int src_stride,
871                     uint8* dst, int dst_stride,
872                     int width, int height) {
873   // Swap first and last row and mirror the content. Uses a temporary row.
874   align_buffer_64(row, width);
875   const uint8* src_bot = src + src_stride * (height - 1);
876   uint8* dst_bot = dst + dst_stride * (height - 1);
877   int half_height = (height + 1) >> 1;
878   int y;
879   void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
880   void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
881 #if defined(HAS_MIRRORROW_NEON)
882   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
883     MirrorRow = MirrorRow_NEON;
884   }
885 #endif
886 #if defined(HAS_MIRRORROW_SSE2)
887   if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
888     MirrorRow = MirrorRow_SSE2;
889   }
890 #endif
891 #if defined(HAS_MIRRORROW_SSSE3)
892   if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
893     MirrorRow = MirrorRow_SSSE3;
894   }
895 #endif
896 #if defined(HAS_MIRRORROW_AVX2)
897   if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
898     MirrorRow = MirrorRow_AVX2;
899   }
900 #endif
901 // TODO(fbarchard): Make MirrorRow on MIPS handle unaligned memory.
902 #if defined(HAS_MIRRORROW_MIPS_DSPR2)
903   if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
904       IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) &&
905       IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) {
906     MirrorRow = MirrorRow_MIPS_DSPR2;
907   }
908 #endif
909 #if defined(HAS_COPYROW_NEON)
910   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
911     CopyRow = CopyRow_NEON;
912   }
913 #endif
914 #if defined(HAS_COPYROW_X86)
915   if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
916     CopyRow = CopyRow_X86;
917   }
918 #endif
919 #if defined(HAS_COPYROW_SSE2)
920   if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
921     CopyRow = CopyRow_SSE2;
922   }
923 #endif
924 #if defined(HAS_COPYROW_AVX)
925   if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) {
926     CopyRow = CopyRow_AVX;
927   }
928 #endif
929 #if defined(HAS_COPYROW_ERMS)
930   if (TestCpuFlag(kCpuHasERMS)) {
931     CopyRow = CopyRow_ERMS;
932   }
933 #endif
934 #if defined(HAS_COPYROW_MIPS)
935   if (TestCpuFlag(kCpuHasMIPS)) {
936     CopyRow = CopyRow_MIPS;
937   }
938 #endif
939
940   // Odd height will harmlessly mirror the middle row twice.
941   for (y = 0; y < half_height; ++y) {
942     MirrorRow(src, row, width);  // Mirror first row into a buffer
943     src += src_stride;
944     MirrorRow(src_bot, dst, width);  // Mirror last row into first row
945     dst += dst_stride;
946     CopyRow(row, dst_bot, width);  // Copy first mirrored row into last
947     src_bot -= src_stride;
948     dst_bot -= dst_stride;
949   }
950   free_aligned_buffer_64(row);
951 }
952
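// Scalar fallback for the UV transpose: 'width' counts UV pairs.  Each
// iteration reads one interleaved UV pair from 8 consecutive rows and writes
// 8 U bytes to dst_a and 8 V bytes to dst_b.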
953 static void TransposeUVWx8_C(const uint8* src, int src_stride,
954                              uint8* dst_a, int dst_stride_a,
955                              uint8* dst_b, int dst_stride_b,
956                              int width) {
957   int i;
958   for (i = 0; i < width; ++i) {
959     dst_a[0] = src[0 * src_stride + 0];
960     dst_b[0] = src[0 * src_stride + 1];
961     dst_a[1] = src[1 * src_stride + 0];
962     dst_b[1] = src[1 * src_stride + 1];
963     dst_a[2] = src[2 * src_stride + 0];
964     dst_b[2] = src[2 * src_stride + 1];
965     dst_a[3] = src[3 * src_stride + 0];
966     dst_b[3] = src[3 * src_stride + 1];
967     dst_a[4] = src[4 * src_stride + 0];
968     dst_b[4] = src[4 * src_stride + 1];
969     dst_a[5] = src[5 * src_stride + 0];
970     dst_b[5] = src[5 * src_stride + 1];
971     dst_a[6] = src[6 * src_stride + 0];
972     dst_b[6] = src[6 * src_stride + 1];
973     dst_a[7] = src[7 * src_stride + 0];
974     dst_b[7] = src[7 * src_stride + 1];
975     src += 2;
976     dst_a += dst_stride_a;
977     dst_b += dst_stride_b;
978   }
979 }
980
981 static void TransposeUVWxH_C(const uint8* src, int src_stride,
982                              uint8* dst_a, int dst_stride_a,
983                              uint8* dst_b, int dst_stride_b,
984                              int width, int height) {
985   int i;
986   for (i = 0; i < width * 2; i += 2) {
987     int j;
988     for (j = 0; j < height; ++j) {
989       dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
990       dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
991     }
992   }
993 }
994
995 LIBYUV_API
996 void TransposeUV(const uint8* src, int src_stride,
997                  uint8* dst_a, int dst_stride_a,
998                  uint8* dst_b, int dst_stride_b,
999                  int width, int height) {
1000   int i = height;
1001   void (*TransposeUVWx8)(const uint8* src, int src_stride,
1002                          uint8* dst_a, int dst_stride_a,
1003                          uint8* dst_b, int dst_stride_b,
1004                          int width) = TransposeUVWx8_C;
1005 #if defined(HAS_TRANSPOSE_UVWX8_NEON)
1006   if (TestCpuFlag(kCpuHasNEON)) {
1007     TransposeUVWx8 = TransposeUVWx8_NEON;
1008   }
1009 #endif
1010 #if defined(HAS_TRANSPOSE_UVWX8_SSE2)
1011   if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) {
1012     TransposeUVWx8 = TransposeUVWx8_SSE2;
1013   }
1014 #endif
1015 #if defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2)
1016   if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) &&
1017       IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
1018     TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2;
1019   }
1020 #endif
1021
1022   // Work through the source in 8x8 tiles.
1023   while (i >= 8) {
1024     TransposeUVWx8(src, src_stride,
1025                    dst_a, dst_stride_a,
1026                    dst_b, dst_stride_b,
1027                    width);
1028     src += 8 * src_stride;    // Go down 8 rows.
1029     dst_a += 8;               // Move over 8 columns.
1030     dst_b += 8;               // Move over 8 columns.
1031     i -= 8;
1032   }
1033
1034   TransposeUVWxH_C(src, src_stride,
1035                    dst_a, dst_stride_a,
1036                    dst_b, dst_stride_b,
1037                    width, i);
1038 }
1039
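// Rotate an interleaved UV plane by 90 degrees into separate U and V planes:
// the same bottom-up source trick as RotatePlane90, followed by the UV
// transpose.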
1040 LIBYUV_API
1041 void RotateUV90(const uint8* src, int src_stride,
1042                 uint8* dst_a, int dst_stride_a,
1043                 uint8* dst_b, int dst_stride_b,
1044                 int width, int height) {
1045   src += src_stride * (height - 1);
1046   src_stride = -src_stride;
1047
1048   TransposeUV(src, src_stride,
1049               dst_a, dst_stride_a,
1050               dst_b, dst_stride_b,
1051               width, height);
1052 }
1053
1054 LIBYUV_API
1055 void RotateUV270(const uint8* src, int src_stride,
1056                  uint8* dst_a, int dst_stride_a,
1057                  uint8* dst_b, int dst_stride_b,
1058                  int width, int height) {
1059   dst_a += dst_stride_a * (width - 1);
1060   dst_b += dst_stride_b * (width - 1);
1061   dst_stride_a = -dst_stride_a;
1062   dst_stride_b = -dst_stride_b;
1063
1064   TransposeUV(src, src_stride,
1065               dst_a, dst_stride_a,
1066               dst_b, dst_stride_b,
1067               width, height);
1068 }
1069
1070 // Rotate 180 is a horizontal and vertical flip.
1071 LIBYUV_API
1072 void RotateUV180(const uint8* src, int src_stride,
1073                  uint8* dst_a, int dst_stride_a,
1074                  uint8* dst_b, int dst_stride_b,
1075                  int width, int height) {
1076   int i;
1077   void (*MirrorRowUV)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
1078       MirrorUVRow_C;
1079 #if defined(HAS_MIRRORUVROW_NEON)
1080   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
1081     MirrorRowUV = MirrorUVRow_NEON;
1082   }
1083 #endif
1084 #if defined(HAS_MIRRORROW_UV_SSSE3)
1085   if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
1086     MirrorRowUV = MirrorUVRow_SSSE3;
1087   }
1088 #endif
1089 #if defined(HAS_MIRRORUVROW_MIPS_DSPR2)
1090   if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
1091       IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
1092     MirrorRowUV = MirrorUVRow_MIPS_DSPR2;
1093   }
1094 #endif
1095
1096   dst_a += dst_stride_a * (height - 1);
1097   dst_b += dst_stride_b * (height - 1);
1098
1099   for (i = 0; i < height; ++i) {
1100     MirrorRowUV(src, dst_a, dst_b, width);
1101     src += src_stride;
1102     dst_a -= dst_stride_a;
1103     dst_b -= dst_stride_b;
1104   }
1105 }
1106
1107 LIBYUV_API
1108 int RotatePlane(const uint8* src, int src_stride,
1109                 uint8* dst, int dst_stride,
1110                 int width, int height,
1111                 enum RotationMode mode) {
1112   if (!src || width <= 0 || height == 0 || !dst) {
1113     return -1;
1114   }
1115
1116   // Negative height means invert the image.
1117   if (height < 0) {
1118     height = -height;
1119     src = src + (height - 1) * src_stride;
1120     src_stride = -src_stride;
1121   }
1122
1123   switch (mode) {
1124     case kRotate0:
1125       // copy frame
1126       CopyPlane(src, src_stride,
1127                 dst, dst_stride,
1128                 width, height);
1129       return 0;
1130     case kRotate90:
1131       RotatePlane90(src, src_stride,
1132                     dst, dst_stride,
1133                     width, height);
1134       return 0;
1135     case kRotate270:
1136       RotatePlane270(src, src_stride,
1137                      dst, dst_stride,
1138                      width, height);
1139       return 0;
1140     case kRotate180:
1141       RotatePlane180(src, src_stride,
1142                      dst, dst_stride,
1143                      width, height);
1144       return 0;
1145     default:
1146       break;
1147   }
1148   return -1;
1149 }
1150
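// Illustrative call (not part of the library; buffer names are assumptions):
// rotating a w x h I420 frame by 90 degrees into destination planes sized for
// an h x w frame, e.g.
//   I420Rotate(src_y, w, src_u, (w + 1) / 2, src_v, (w + 1) / 2,
//              dst_y, h, dst_u, (h + 1) / 2, dst_v, (h + 1) / 2,
//              w, h, kRotate90);
// where dst_y holds h * w bytes and dst_u/dst_v each hold
// ((h + 1) / 2) * ((w + 1) / 2) bytes.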
1151 LIBYUV_API
1152 int I420Rotate(const uint8* src_y, int src_stride_y,
1153                const uint8* src_u, int src_stride_u,
1154                const uint8* src_v, int src_stride_v,
1155                uint8* dst_y, int dst_stride_y,
1156                uint8* dst_u, int dst_stride_u,
1157                uint8* dst_v, int dst_stride_v,
1158                int width, int height,
1159                enum RotationMode mode) {
1160   int halfwidth = (width + 1) >> 1;
1161   int halfheight = (height + 1) >> 1;
1162   if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
1163       !dst_y || !dst_u || !dst_v) {
1164     return -1;
1165   }
1166
1167   // Negative height means invert the image.
1168   if (height < 0) {
1169     height = -height;
1170     halfheight = (height + 1) >> 1;
1171     src_y = src_y + (height - 1) * src_stride_y;
1172     src_u = src_u + (halfheight - 1) * src_stride_u;
1173     src_v = src_v + (halfheight - 1) * src_stride_v;
1174     src_stride_y = -src_stride_y;
1175     src_stride_u = -src_stride_u;
1176     src_stride_v = -src_stride_v;
1177   }
1178
1179   switch (mode) {
1180     case kRotate0:
1181       // copy frame
1182       return I420Copy(src_y, src_stride_y,
1183                       src_u, src_stride_u,
1184                       src_v, src_stride_v,
1185                       dst_y, dst_stride_y,
1186                       dst_u, dst_stride_u,
1187                       dst_v, dst_stride_v,
1188                       width, height);
1189     case kRotate90:
1190       RotatePlane90(src_y, src_stride_y,
1191                     dst_y, dst_stride_y,
1192                     width, height);
1193       RotatePlane90(src_u, src_stride_u,
1194                     dst_u, dst_stride_u,
1195                     halfwidth, halfheight);
1196       RotatePlane90(src_v, src_stride_v,
1197                     dst_v, dst_stride_v,
1198                     halfwidth, halfheight);
1199       return 0;
1200     case kRotate270:
1201       RotatePlane270(src_y, src_stride_y,
1202                      dst_y, dst_stride_y,
1203                      width, height);
1204       RotatePlane270(src_u, src_stride_u,
1205                      dst_u, dst_stride_u,
1206                      halfwidth, halfheight);
1207       RotatePlane270(src_v, src_stride_v,
1208                      dst_v, dst_stride_v,
1209                      halfwidth, halfheight);
1210       return 0;
1211     case kRotate180:
1212       RotatePlane180(src_y, src_stride_y,
1213                      dst_y, dst_stride_y,
1214                      width, height);
1215       RotatePlane180(src_u, src_stride_u,
1216                      dst_u, dst_stride_u,
1217                      halfwidth, halfheight);
1218       RotatePlane180(src_v, src_stride_v,
1219                      dst_v, dst_stride_v,
1220                      halfwidth, halfheight);
1221       return 0;
1222     default:
1223       break;
1224   }
1225   return -1;
1226 }
1227
1228 LIBYUV_API
1229 int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
1230                      const uint8* src_uv, int src_stride_uv,
1231                      uint8* dst_y, int dst_stride_y,
1232                      uint8* dst_u, int dst_stride_u,
1233                      uint8* dst_v, int dst_stride_v,
1234                      int width, int height,
1235                      enum RotationMode mode) {
1236   int halfwidth = (width + 1) >> 1;
1237   int halfheight = (height + 1) >> 1;
1238   if (!src_y || !src_uv || width <= 0 || height == 0 ||
1239       !dst_y || !dst_u || !dst_v) {
1240     return -1;
1241   }
1242
1243   // Negative height means invert the image.
1244   if (height < 0) {
1245     height = -height;
1246     halfheight = (height + 1) >> 1;
1247     src_y = src_y + (height - 1) * src_stride_y;
1248     src_uv = src_uv + (halfheight - 1) * src_stride_uv;
1249     src_stride_y = -src_stride_y;
1250     src_stride_uv = -src_stride_uv;
1251   }
1252
1253   switch (mode) {
1254     case kRotate0:
1255       // copy frame
1256       return NV12ToI420(src_y, src_stride_y,
1257                         src_uv, src_stride_uv,
1258                         dst_y, dst_stride_y,
1259                         dst_u, dst_stride_u,
1260                         dst_v, dst_stride_v,
1261                         width, height);
1262     case kRotate90:
1263       RotatePlane90(src_y, src_stride_y,
1264                     dst_y, dst_stride_y,
1265                     width, height);
1266       RotateUV90(src_uv, src_stride_uv,
1267                  dst_u, dst_stride_u,
1268                  dst_v, dst_stride_v,
1269                  halfwidth, halfheight);
1270       return 0;
1271     case kRotate270:
1272       RotatePlane270(src_y, src_stride_y,
1273                      dst_y, dst_stride_y,
1274                      width, height);
1275       RotateUV270(src_uv, src_stride_uv,
1276                   dst_u, dst_stride_u,
1277                   dst_v, dst_stride_v,
1278                   halfwidth, halfheight);
1279       return 0;
1280     case kRotate180:
1281       RotatePlane180(src_y, src_stride_y,
1282                      dst_y, dst_stride_y,
1283                      width, height);
1284       RotateUV180(src_uv, src_stride_uv,
1285                   dst_u, dst_stride_u,
1286                   dst_v, dst_stride_v,
1287                   halfwidth, halfheight);
1288       return 0;
1289     default:
1290       break;
1291   }
1292   return -1;
1293 }
1294
1295 #ifdef __cplusplus
1296 }  // extern "C"
1297 }  // namespace libyuv
1298 #endif