1 /*
2  *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS. All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10
11 #include "libyuv/rotate.h"
12
13 #include "libyuv/cpu_id.h"
14 #include "libyuv/convert.h"
15 #include "libyuv/planar_functions.h"
16 #include "libyuv/row.h"
17
18 #ifdef __cplusplus
19 namespace libyuv {
20 extern "C" {
21 #endif
22
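// DECLARE_FUNCTION emits the label for a file scope asm function. On 32 bit
// OS X and on MinGW/Cygwin, C symbols carry a leading underscore, which is why
// those variants emit "_" #name rather than the plain name.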
23 #if !defined(LIBYUV_DISABLE_X86) && \
24     (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
25 #if defined(__APPLE__) && defined(__i386__)
26 #define DECLARE_FUNCTION(name)                                                 \
27     ".text                                     \n"                             \
28     ".private_extern _" #name "                \n"                             \
29     ".align 4,0x90                             \n"                             \
30 "_" #name ":                                   \n"
31 #elif (defined(__MINGW32__) || defined(__CYGWIN__)) && defined(__i386__)
32 #define DECLARE_FUNCTION(name)                                                 \
33     ".text                                     \n"                             \
34     ".align 4,0x90                             \n"                             \
35 "_" #name ":                                   \n"
36 #else
37 #define DECLARE_FUNCTION(name)                                                 \
38     ".text                                     \n"                             \
39     ".align 4,0x90                             \n"                             \
40 #name ":                                       \n"
41 #endif
42 #endif
43
44 #if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
45     (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
46 #define HAS_MIRRORROW_NEON
47 void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
48 #define HAS_MIRRORROW_UV_NEON
49 void MirrorUVRow_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width);
50 #define HAS_TRANSPOSE_WX8_NEON
51 void TransposeWx8_NEON(const uint8* src, int src_stride,
52                        uint8* dst, int dst_stride, int width);
53 #define HAS_TRANSPOSE_UVWX8_NEON
54 void TransposeUVWx8_NEON(const uint8* src, int src_stride,
55                          uint8* dst_a, int dst_stride_a,
56                          uint8* dst_b, int dst_stride_b,
57                          int width);
58 #endif  // defined(__ARM_NEON__)
59
60 #if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
61     defined(__mips__) && \
62     defined(__mips_dsp) && (__mips_dsp_rev >= 2)
63 #define HAS_TRANSPOSE_WX8_MIPS_DSPR2
64 void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
65                              uint8* dst, int dst_stride, int width);
66
67 void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride,
68                                   uint8* dst, int dst_stride, int width);
69 #define HAS_TRANSPOSE_UVWx8_MIPS_DSPR2
70 void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
71                                uint8* dst_a, int dst_stride_a,
72                                uint8* dst_b, int dst_stride_b,
73                                int width);
74 #endif  // defined(__mips__)
75
76 #if !defined(LIBYUV_DISABLE_X86) && \
77     defined(_M_IX86) && defined(_MSC_VER)
78 #define HAS_TRANSPOSE_WX8_SSSE3
79 __declspec(naked) __declspec(align(16))
80 static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
81                                uint8* dst, int dst_stride, int width) {
82   __asm {
83     push      edi
84     push      esi
85     push      ebp
86     mov       eax, [esp + 12 + 4]   // src
87     mov       edi, [esp + 12 + 8]   // src_stride
88     mov       edx, [esp + 12 + 12]  // dst
89     mov       esi, [esp + 12 + 16]  // dst_stride
90     mov       ecx, [esp + 12 + 20]  // width
91
92     // Read in the data from the source pointer.
93     // First round of bit swap.
94     align      4
95  convertloop:
96     movq      xmm0, qword ptr [eax]
97     lea       ebp, [eax + 8]
98     movq      xmm1, qword ptr [eax + edi]
99     lea       eax, [eax + 2 * edi]
100     punpcklbw xmm0, xmm1
101     movq      xmm2, qword ptr [eax]
102     movdqa    xmm1, xmm0
103     palignr   xmm1, xmm1, 8
104     movq      xmm3, qword ptr [eax + edi]
105     lea       eax, [eax + 2 * edi]
106     punpcklbw xmm2, xmm3
107     movdqa    xmm3, xmm2
108     movq      xmm4, qword ptr [eax]
109     palignr   xmm3, xmm3, 8
110     movq      xmm5, qword ptr [eax + edi]
111     punpcklbw xmm4, xmm5
112     lea       eax, [eax + 2 * edi]
113     movdqa    xmm5, xmm4
114     movq      xmm6, qword ptr [eax]
115     palignr   xmm5, xmm5, 8
116     movq      xmm7, qword ptr [eax + edi]
117     punpcklbw xmm6, xmm7
118     mov       eax, ebp
119     movdqa    xmm7, xmm6
120     palignr   xmm7, xmm7, 8
121     // Second round of bit swap.
122     punpcklwd xmm0, xmm2
123     punpcklwd xmm1, xmm3
124     movdqa    xmm2, xmm0
125     movdqa    xmm3, xmm1
126     palignr   xmm2, xmm2, 8
127     palignr   xmm3, xmm3, 8
128     punpcklwd xmm4, xmm6
129     punpcklwd xmm5, xmm7
130     movdqa    xmm6, xmm4
131     movdqa    xmm7, xmm5
132     palignr   xmm6, xmm6, 8
133     palignr   xmm7, xmm7, 8
134     // Third round of bit swap.
135     // Write to the destination pointer.
136     punpckldq xmm0, xmm4
137     movq      qword ptr [edx], xmm0
138     movdqa    xmm4, xmm0
139     palignr   xmm4, xmm4, 8
140     movq      qword ptr [edx + esi], xmm4
141     lea       edx, [edx + 2 * esi]
142     punpckldq xmm2, xmm6
143     movdqa    xmm6, xmm2
144     palignr   xmm6, xmm6, 8
145     movq      qword ptr [edx], xmm2
146     punpckldq xmm1, xmm5
147     movq      qword ptr [edx + esi], xmm6
148     lea       edx, [edx + 2 * esi]
149     movdqa    xmm5, xmm1
150     movq      qword ptr [edx], xmm1
151     palignr   xmm5, xmm5, 8
152     punpckldq xmm3, xmm7
153     movq      qword ptr [edx + esi], xmm5
154     lea       edx, [edx + 2 * esi]
155     movq      qword ptr [edx], xmm3
156     movdqa    xmm7, xmm3
157     palignr   xmm7, xmm7, 8
158     sub       ecx, 8
159     movq      qword ptr [edx + esi], xmm7
160     lea       edx, [edx + 2 * esi]
161     jg        convertloop
162
163     pop       ebp
164     pop       esi
165     pop       edi
166     ret
167   }
168 }
169
170 #define HAS_TRANSPOSE_UVWX8_SSE2
171 __declspec(naked) __declspec(align(16))
172 static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
173                                 uint8* dst_a, int dst_stride_a,
174                                 uint8* dst_b, int dst_stride_b,
175                                 int w) {
176   __asm {
177     push      ebx
178     push      esi
179     push      edi
180     push      ebp
181     mov       eax, [esp + 16 + 4]   // src
182     mov       edi, [esp + 16 + 8]   // src_stride
183     mov       edx, [esp + 16 + 12]  // dst_a
184     mov       esi, [esp + 16 + 16]  // dst_stride_a
185     mov       ebx, [esp + 16 + 20]  // dst_b
186     mov       ebp, [esp + 16 + 24]  // dst_stride_b
187     mov       ecx, esp
188     sub       esp, 4 + 16
189     and       esp, ~15
190     mov       [esp + 16], ecx
191     mov       ecx, [ecx + 16 + 28]  // w
192
193     align      4
194  convertloop:
195     // Read in the data from the source pointer.
196     // First round of bit swap.
197     movdqa    xmm0, [eax]
198     movdqa    xmm1, [eax + edi]
199     lea       eax, [eax + 2 * edi]
200     movdqa    xmm7, xmm0  // use xmm7 as temp register.
201     punpcklbw xmm0, xmm1
202     punpckhbw xmm7, xmm1
203     movdqa    xmm1, xmm7
204     movdqa    xmm2, [eax]
205     movdqa    xmm3, [eax + edi]
206     lea       eax, [eax + 2 * edi]
207     movdqa    xmm7, xmm2
208     punpcklbw xmm2, xmm3
209     punpckhbw xmm7, xmm3
210     movdqa    xmm3, xmm7
211     movdqa    xmm4, [eax]
212     movdqa    xmm5, [eax + edi]
213     lea       eax, [eax + 2 * edi]
214     movdqa    xmm7, xmm4
215     punpcklbw xmm4, xmm5
216     punpckhbw xmm7, xmm5
217     movdqa    xmm5, xmm7
218     movdqa    xmm6, [eax]
219     movdqa    xmm7, [eax + edi]
220     lea       eax, [eax + 2 * edi]
221     movdqa    [esp], xmm5  // backup xmm5
222     neg       edi
223     movdqa    xmm5, xmm6   // use xmm5 as temp register.
224     punpcklbw xmm6, xmm7
225     punpckhbw xmm5, xmm7
226     movdqa    xmm7, xmm5
227     lea       eax, [eax + 8 * edi + 16]
228     neg       edi
229     // Second round of bit swap.
230     movdqa    xmm5, xmm0
231     punpcklwd xmm0, xmm2
232     punpckhwd xmm5, xmm2
233     movdqa    xmm2, xmm5
234     movdqa    xmm5, xmm1
235     punpcklwd xmm1, xmm3
236     punpckhwd xmm5, xmm3
237     movdqa    xmm3, xmm5
238     movdqa    xmm5, xmm4
239     punpcklwd xmm4, xmm6
240     punpckhwd xmm5, xmm6
241     movdqa    xmm6, xmm5
242     movdqa    xmm5, [esp]  // restore xmm5
243     movdqa    [esp], xmm6  // backup xmm6
244     movdqa    xmm6, xmm5    // use xmm6 as temp register.
245     punpcklwd xmm5, xmm7
246     punpckhwd xmm6, xmm7
247     movdqa    xmm7, xmm6
248     // Third round of bit swap.
249     // Write to the destination pointer.
250     movdqa    xmm6, xmm0
251     punpckldq xmm0, xmm4
252     punpckhdq xmm6, xmm4
253     movdqa    xmm4, xmm6
254     movdqa    xmm6, [esp]  // restore xmm6
255     movlpd    qword ptr [edx], xmm0
256     movhpd    qword ptr [ebx], xmm0
257     movlpd    qword ptr [edx + esi], xmm4
258     lea       edx, [edx + 2 * esi]
259     movhpd    qword ptr [ebx + ebp], xmm4
260     lea       ebx, [ebx + 2 * ebp]
261     movdqa    xmm0, xmm2   // use xmm0 as the temp register.
262     punpckldq xmm2, xmm6
263     movlpd    qword ptr [edx], xmm2
264     movhpd    qword ptr [ebx], xmm2
265     punpckhdq xmm0, xmm6
266     movlpd    qword ptr [edx + esi], xmm0
267     lea       edx, [edx + 2 * esi]
268     movhpd    qword ptr [ebx + ebp], xmm0
269     lea       ebx, [ebx + 2 * ebp]
270     movdqa    xmm0, xmm1   // use xmm0 as the temp register.
271     punpckldq xmm1, xmm5
272     movlpd    qword ptr [edx], xmm1
273     movhpd    qword ptr [ebx], xmm1
274     punpckhdq xmm0, xmm5
275     movlpd    qword ptr [edx + esi], xmm0
276     lea       edx, [edx + 2 * esi]
277     movhpd    qword ptr [ebx + ebp], xmm0
278     lea       ebx, [ebx + 2 * ebp]
279     movdqa    xmm0, xmm3   // use xmm0 as the temp register.
280     punpckldq xmm3, xmm7
281     movlpd    qword ptr [edx], xmm3
282     movhpd    qword ptr [ebx], xmm3
283     punpckhdq xmm0, xmm7
284     sub       ecx, 8
285     movlpd    qword ptr [edx + esi], xmm0
286     lea       edx, [edx + 2 * esi]
287     movhpd    qword ptr [ebx + ebp], xmm0
288     lea       ebx, [ebx + 2 * ebp]
289     jg        convertloop
290
291     mov       esp, [esp + 16]
292     pop       ebp
293     pop       edi
294     pop       esi
295     pop       ebx
296     ret
297   }
298 }
299 #elif !defined(LIBYUV_DISABLE_X86) && \
300     (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
301 #define HAS_TRANSPOSE_WX8_SSSE3
302 static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
303                                uint8* dst, int dst_stride, int width) {
304   asm volatile (
305     // Read in the data from the source pointer.
306     // First round of bit swap.
307     ".p2align  2                                 \n"
308   "1:                                            \n"
309     "movq       (%0),%%xmm0                      \n"
310     "movq       (%0,%3),%%xmm1                   \n"
311     "lea        (%0,%3,2),%0                     \n"
312     "punpcklbw  %%xmm1,%%xmm0                    \n"
313     "movq       (%0),%%xmm2                      \n"
314     "movdqa     %%xmm0,%%xmm1                    \n"
315     "palignr    $0x8,%%xmm1,%%xmm1               \n"
316     "movq       (%0,%3),%%xmm3                   \n"
317     "lea        (%0,%3,2),%0                     \n"
318     "punpcklbw  %%xmm3,%%xmm2                    \n"
319     "movdqa     %%xmm2,%%xmm3                    \n"
320     "movq       (%0),%%xmm4                      \n"
321     "palignr    $0x8,%%xmm3,%%xmm3               \n"
322     "movq       (%0,%3),%%xmm5                   \n"
323     "lea        (%0,%3,2),%0                     \n"
324     "punpcklbw  %%xmm5,%%xmm4                    \n"
325     "movdqa     %%xmm4,%%xmm5                    \n"
326     "movq       (%0),%%xmm6                      \n"
327     "palignr    $0x8,%%xmm5,%%xmm5               \n"
328     "movq       (%0,%3),%%xmm7                   \n"
329     "lea        (%0,%3,2),%0                     \n"
330     "punpcklbw  %%xmm7,%%xmm6                    \n"
331     "neg        %3                               \n"
332     "movdqa     %%xmm6,%%xmm7                    \n"
333     "lea        0x8(%0,%3,8),%0                  \n"
334     "palignr    $0x8,%%xmm7,%%xmm7               \n"
335     "neg        %3                               \n"
336      // Second round of bit swap.
337     "punpcklwd  %%xmm2,%%xmm0                    \n"
338     "punpcklwd  %%xmm3,%%xmm1                    \n"
339     "movdqa     %%xmm0,%%xmm2                    \n"
340     "movdqa     %%xmm1,%%xmm3                    \n"
341     "palignr    $0x8,%%xmm2,%%xmm2               \n"
342     "palignr    $0x8,%%xmm3,%%xmm3               \n"
343     "punpcklwd  %%xmm6,%%xmm4                    \n"
344     "punpcklwd  %%xmm7,%%xmm5                    \n"
345     "movdqa     %%xmm4,%%xmm6                    \n"
346     "movdqa     %%xmm5,%%xmm7                    \n"
347     "palignr    $0x8,%%xmm6,%%xmm6               \n"
348     "palignr    $0x8,%%xmm7,%%xmm7               \n"
349     // Third round of bit swap.
350     // Write to the destination pointer.
351     "punpckldq  %%xmm4,%%xmm0                    \n"
352     "movq       %%xmm0,(%1)                      \n"
353     "movdqa     %%xmm0,%%xmm4                    \n"
354     "palignr    $0x8,%%xmm4,%%xmm4               \n"
355     "movq       %%xmm4,(%1,%4)                   \n"
356     "lea        (%1,%4,2),%1                     \n"
357     "punpckldq  %%xmm6,%%xmm2                    \n"
358     "movdqa     %%xmm2,%%xmm6                    \n"
359     "movq       %%xmm2,(%1)                      \n"
360     "palignr    $0x8,%%xmm6,%%xmm6               \n"
361     "punpckldq  %%xmm5,%%xmm1                    \n"
362     "movq       %%xmm6,(%1,%4)                   \n"
363     "lea        (%1,%4,2),%1                     \n"
364     "movdqa     %%xmm1,%%xmm5                    \n"
365     "movq       %%xmm1,(%1)                      \n"
366     "palignr    $0x8,%%xmm5,%%xmm5               \n"
367     "movq       %%xmm5,(%1,%4)                   \n"
368     "lea        (%1,%4,2),%1                     \n"
369     "punpckldq  %%xmm7,%%xmm3                    \n"
370     "movq       %%xmm3,(%1)                      \n"
371     "movdqa     %%xmm3,%%xmm7                    \n"
372     "palignr    $0x8,%%xmm7,%%xmm7               \n"
373     "sub        $0x8,%2                          \n"
374     "movq       %%xmm7,(%1,%4)                   \n"
375     "lea        (%1,%4,2),%1                     \n"
376     "jg         1b                               \n"
377     : "+r"(src),    // %0
378       "+r"(dst),    // %1
379       "+r"(width)   // %2
380     : "r"((intptr_t)(src_stride)),  // %3
381       "r"((intptr_t)(dst_stride))   // %4
382     : "memory", "cc"
383   #if defined(__SSE2__)
384       , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
385   #endif
386   );
387 }
388
389 #if !defined(LIBYUV_DISABLE_X86) && defined(__i386__)
390 #define HAS_TRANSPOSE_UVWX8_SSE2
391 void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
392                          uint8* dst_a, int dst_stride_a,
393                          uint8* dst_b, int dst_stride_b,
394                          int w);
395   asm (
396     DECLARE_FUNCTION(TransposeUVWx8_SSE2)
397     "push   %ebx                               \n"
398     "push   %esi                               \n"
399     "push   %edi                               \n"
400     "push   %ebp                               \n"
401     "mov    0x14(%esp),%eax                    \n"
402     "mov    0x18(%esp),%edi                    \n"
403     "mov    0x1c(%esp),%edx                    \n"
404     "mov    0x20(%esp),%esi                    \n"
405     "mov    0x24(%esp),%ebx                    \n"
406     "mov    0x28(%esp),%ebp                    \n"
407     "mov    %esp,%ecx                          \n"
408     "sub    $0x14,%esp                         \n"
409     "and    $0xfffffff0,%esp                   \n"
410     "mov    %ecx,0x10(%esp)                    \n"
411     "mov    0x2c(%ecx),%ecx                    \n"
412
413 "1:                                            \n"
414     "movdqa (%eax),%xmm0                       \n"
415     "movdqa (%eax,%edi,1),%xmm1                \n"
416     "lea    (%eax,%edi,2),%eax                 \n"
417     "movdqa %xmm0,%xmm7                        \n"
418     "punpcklbw %xmm1,%xmm0                     \n"
419     "punpckhbw %xmm1,%xmm7                     \n"
420     "movdqa %xmm7,%xmm1                        \n"
421     "movdqa (%eax),%xmm2                       \n"
422     "movdqa (%eax,%edi,1),%xmm3                \n"
423     "lea    (%eax,%edi,2),%eax                 \n"
424     "movdqa %xmm2,%xmm7                        \n"
425     "punpcklbw %xmm3,%xmm2                     \n"
426     "punpckhbw %xmm3,%xmm7                     \n"
427     "movdqa %xmm7,%xmm3                        \n"
428     "movdqa (%eax),%xmm4                       \n"
429     "movdqa (%eax,%edi,1),%xmm5                \n"
430     "lea    (%eax,%edi,2),%eax                 \n"
431     "movdqa %xmm4,%xmm7                        \n"
432     "punpcklbw %xmm5,%xmm4                     \n"
433     "punpckhbw %xmm5,%xmm7                     \n"
434     "movdqa %xmm7,%xmm5                        \n"
435     "movdqa (%eax),%xmm6                       \n"
436     "movdqa (%eax,%edi,1),%xmm7                \n"
437     "lea    (%eax,%edi,2),%eax                 \n"
438     "movdqa %xmm5,(%esp)                       \n"
439     "neg    %edi                               \n"
440     "movdqa %xmm6,%xmm5                        \n"
441     "punpcklbw %xmm7,%xmm6                     \n"
442     "punpckhbw %xmm7,%xmm5                     \n"
443     "movdqa %xmm5,%xmm7                        \n"
444     "lea    0x10(%eax,%edi,8),%eax             \n"
445     "neg    %edi                               \n"
446     "movdqa %xmm0,%xmm5                        \n"
447     "punpcklwd %xmm2,%xmm0                     \n"
448     "punpckhwd %xmm2,%xmm5                     \n"
449     "movdqa %xmm5,%xmm2                        \n"
450     "movdqa %xmm1,%xmm5                        \n"
451     "punpcklwd %xmm3,%xmm1                     \n"
452     "punpckhwd %xmm3,%xmm5                     \n"
453     "movdqa %xmm5,%xmm3                        \n"
454     "movdqa %xmm4,%xmm5                        \n"
455     "punpcklwd %xmm6,%xmm4                     \n"
456     "punpckhwd %xmm6,%xmm5                     \n"
457     "movdqa %xmm5,%xmm6                        \n"
458     "movdqa (%esp),%xmm5                       \n"
459     "movdqa %xmm6,(%esp)                       \n"
460     "movdqa %xmm5,%xmm6                        \n"
461     "punpcklwd %xmm7,%xmm5                     \n"
462     "punpckhwd %xmm7,%xmm6                     \n"
463     "movdqa %xmm6,%xmm7                        \n"
464     "movdqa %xmm0,%xmm6                        \n"
465     "punpckldq %xmm4,%xmm0                     \n"
466     "punpckhdq %xmm4,%xmm6                     \n"
467     "movdqa %xmm6,%xmm4                        \n"
468     "movdqa (%esp),%xmm6                       \n"
469     "movlpd %xmm0,(%edx)                       \n"
470     "movhpd %xmm0,(%ebx)                       \n"
471     "movlpd %xmm4,(%edx,%esi,1)                \n"
472     "lea    (%edx,%esi,2),%edx                 \n"
473     "movhpd %xmm4,(%ebx,%ebp,1)                \n"
474     "lea    (%ebx,%ebp,2),%ebx                 \n"
475     "movdqa %xmm2,%xmm0                        \n"
476     "punpckldq %xmm6,%xmm2                     \n"
477     "movlpd %xmm2,(%edx)                       \n"
478     "movhpd %xmm2,(%ebx)                       \n"
479     "punpckhdq %xmm6,%xmm0                     \n"
480     "movlpd %xmm0,(%edx,%esi,1)                \n"
481     "lea    (%edx,%esi,2),%edx                 \n"
482     "movhpd %xmm0,(%ebx,%ebp,1)                \n"
483     "lea    (%ebx,%ebp,2),%ebx                 \n"
484     "movdqa %xmm1,%xmm0                        \n"
485     "punpckldq %xmm5,%xmm1                     \n"
486     "movlpd %xmm1,(%edx)                       \n"
487     "movhpd %xmm1,(%ebx)                       \n"
488     "punpckhdq %xmm5,%xmm0                     \n"
489     "movlpd %xmm0,(%edx,%esi,1)                \n"
490     "lea    (%edx,%esi,2),%edx                 \n"
491     "movhpd %xmm0,(%ebx,%ebp,1)                \n"
492     "lea    (%ebx,%ebp,2),%ebx                 \n"
493     "movdqa %xmm3,%xmm0                        \n"
494     "punpckldq %xmm7,%xmm3                     \n"
495     "movlpd %xmm3,(%edx)                       \n"
496     "movhpd %xmm3,(%ebx)                       \n"
497     "punpckhdq %xmm7,%xmm0                     \n"
498     "sub    $0x8,%ecx                          \n"
499     "movlpd %xmm0,(%edx,%esi,1)                \n"
500     "lea    (%edx,%esi,2),%edx                 \n"
501     "movhpd %xmm0,(%ebx,%ebp,1)                \n"
502     "lea    (%ebx,%ebp,2),%ebx                 \n"
503     "jg     1b                                 \n"
504     "mov    0x10(%esp),%esp                    \n"
505     "pop    %ebp                               \n"
506     "pop    %edi                               \n"
507     "pop    %esi                               \n"
508     "pop    %ebx                               \n"
509 #if defined(__native_client__)
510     "pop    %ecx                               \n"
511     "and    $0xffffffe0,%ecx                   \n"
512     "jmp    *%ecx                              \n"
513 #else
514     "ret                                       \n"
515 #endif
516 );
517 #elif !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
518     defined(__x86_64__)
519 // 64 bit version has enough registers to do 16x8 to 8x16 at a time.
520 #define HAS_TRANSPOSE_WX8_FAST_SSSE3
521 static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
522                                     uint8* dst, int dst_stride, int width) {
523   asm volatile (
524   // Read in the data from the source pointer.
525   // First round of bit swap.
526   ".p2align  2                                 \n"
527 "1:                                            \n"
528   "movdqa     (%0),%%xmm0                      \n"
529   "movdqa     (%0,%3),%%xmm1                   \n"
530   "lea        (%0,%3,2),%0                     \n"
531   "movdqa     %%xmm0,%%xmm8                    \n"
532   "punpcklbw  %%xmm1,%%xmm0                    \n"
533   "punpckhbw  %%xmm1,%%xmm8                    \n"
534   "movdqa     (%0),%%xmm2                      \n"
535   "movdqa     %%xmm0,%%xmm1                    \n"
536   "movdqa     %%xmm8,%%xmm9                    \n"
537   "palignr    $0x8,%%xmm1,%%xmm1               \n"
538   "palignr    $0x8,%%xmm9,%%xmm9               \n"
539   "movdqa     (%0,%3),%%xmm3                   \n"
540   "lea        (%0,%3,2),%0                     \n"
541   "movdqa     %%xmm2,%%xmm10                   \n"
542   "punpcklbw  %%xmm3,%%xmm2                    \n"
543   "punpckhbw  %%xmm3,%%xmm10                   \n"
544   "movdqa     %%xmm2,%%xmm3                    \n"
545   "movdqa     %%xmm10,%%xmm11                  \n"
546   "movdqa     (%0),%%xmm4                      \n"
547   "palignr    $0x8,%%xmm3,%%xmm3               \n"
548   "palignr    $0x8,%%xmm11,%%xmm11             \n"
549   "movdqa     (%0,%3),%%xmm5                   \n"
550   "lea        (%0,%3,2),%0                     \n"
551   "movdqa     %%xmm4,%%xmm12                   \n"
552   "punpcklbw  %%xmm5,%%xmm4                    \n"
553   "punpckhbw  %%xmm5,%%xmm12                   \n"
554   "movdqa     %%xmm4,%%xmm5                    \n"
555   "movdqa     %%xmm12,%%xmm13                  \n"
556   "movdqa     (%0),%%xmm6                      \n"
557   "palignr    $0x8,%%xmm5,%%xmm5               \n"
558   "palignr    $0x8,%%xmm13,%%xmm13             \n"
559   "movdqa     (%0,%3),%%xmm7                   \n"
560   "lea        (%0,%3,2),%0                     \n"
561   "movdqa     %%xmm6,%%xmm14                   \n"
562   "punpcklbw  %%xmm7,%%xmm6                    \n"
563   "punpckhbw  %%xmm7,%%xmm14                   \n"
564   "neg        %3                               \n"
565   "movdqa     %%xmm6,%%xmm7                    \n"
566   "movdqa     %%xmm14,%%xmm15                  \n"
567   "lea        0x10(%0,%3,8),%0                 \n"
568   "palignr    $0x8,%%xmm7,%%xmm7               \n"
569   "palignr    $0x8,%%xmm15,%%xmm15             \n"
570   "neg        %3                               \n"
571    // Second round of bit swap.
572   "punpcklwd  %%xmm2,%%xmm0                    \n"
573   "punpcklwd  %%xmm3,%%xmm1                    \n"
574   "movdqa     %%xmm0,%%xmm2                    \n"
575   "movdqa     %%xmm1,%%xmm3                    \n"
576   "palignr    $0x8,%%xmm2,%%xmm2               \n"
577   "palignr    $0x8,%%xmm3,%%xmm3               \n"
578   "punpcklwd  %%xmm6,%%xmm4                    \n"
579   "punpcklwd  %%xmm7,%%xmm5                    \n"
580   "movdqa     %%xmm4,%%xmm6                    \n"
581   "movdqa     %%xmm5,%%xmm7                    \n"
582   "palignr    $0x8,%%xmm6,%%xmm6               \n"
583   "palignr    $0x8,%%xmm7,%%xmm7               \n"
584   "punpcklwd  %%xmm10,%%xmm8                   \n"
585   "punpcklwd  %%xmm11,%%xmm9                   \n"
586   "movdqa     %%xmm8,%%xmm10                   \n"
587   "movdqa     %%xmm9,%%xmm11                   \n"
588   "palignr    $0x8,%%xmm10,%%xmm10             \n"
589   "palignr    $0x8,%%xmm11,%%xmm11             \n"
590   "punpcklwd  %%xmm14,%%xmm12                  \n"
591   "punpcklwd  %%xmm15,%%xmm13                  \n"
592   "movdqa     %%xmm12,%%xmm14                  \n"
593   "movdqa     %%xmm13,%%xmm15                  \n"
594   "palignr    $0x8,%%xmm14,%%xmm14             \n"
595   "palignr    $0x8,%%xmm15,%%xmm15             \n"
596   // Third round of bit swap.
597   // Write to the destination pointer.
598   "punpckldq  %%xmm4,%%xmm0                    \n"
599   "movq       %%xmm0,(%1)                      \n"
600   "movdqa     %%xmm0,%%xmm4                    \n"
601   "palignr    $0x8,%%xmm4,%%xmm4               \n"
602   "movq       %%xmm4,(%1,%4)                   \n"
603   "lea        (%1,%4,2),%1                     \n"
604   "punpckldq  %%xmm6,%%xmm2                    \n"
605   "movdqa     %%xmm2,%%xmm6                    \n"
606   "movq       %%xmm2,(%1)                      \n"
607   "palignr    $0x8,%%xmm6,%%xmm6               \n"
608   "punpckldq  %%xmm5,%%xmm1                    \n"
609   "movq       %%xmm6,(%1,%4)                   \n"
610   "lea        (%1,%4,2),%1                     \n"
611   "movdqa     %%xmm1,%%xmm5                    \n"
612   "movq       %%xmm1,(%1)                      \n"
613   "palignr    $0x8,%%xmm5,%%xmm5               \n"
614   "movq       %%xmm5,(%1,%4)                   \n"
615   "lea        (%1,%4,2),%1                     \n"
616   "punpckldq  %%xmm7,%%xmm3                    \n"
617   "movq       %%xmm3,(%1)                      \n"
618   "movdqa     %%xmm3,%%xmm7                    \n"
619   "palignr    $0x8,%%xmm7,%%xmm7               \n"
620   "movq       %%xmm7,(%1,%4)                   \n"
621   "lea        (%1,%4,2),%1                     \n"
622   "punpckldq  %%xmm12,%%xmm8                   \n"
623   "movq       %%xmm8,(%1)                      \n"
624   "movdqa     %%xmm8,%%xmm12                   \n"
625   "palignr    $0x8,%%xmm12,%%xmm12             \n"
626   "movq       %%xmm12,(%1,%4)                  \n"
627   "lea        (%1,%4,2),%1                     \n"
628   "punpckldq  %%xmm14,%%xmm10                  \n"
629   "movdqa     %%xmm10,%%xmm14                  \n"
630   "movq       %%xmm10,(%1)                     \n"
631   "palignr    $0x8,%%xmm14,%%xmm14             \n"
632   "punpckldq  %%xmm13,%%xmm9                   \n"
633   "movq       %%xmm14,(%1,%4)                  \n"
634   "lea        (%1,%4,2),%1                     \n"
635   "movdqa     %%xmm9,%%xmm13                   \n"
636   "movq       %%xmm9,(%1)                      \n"
637   "palignr    $0x8,%%xmm13,%%xmm13             \n"
638   "movq       %%xmm13,(%1,%4)                  \n"
639   "lea        (%1,%4,2),%1                     \n"
640   "punpckldq  %%xmm15,%%xmm11                  \n"
641   "movq       %%xmm11,(%1)                     \n"
642   "movdqa     %%xmm11,%%xmm15                  \n"
643   "palignr    $0x8,%%xmm15,%%xmm15             \n"
644   "sub        $0x10,%2                         \n"
645   "movq       %%xmm15,(%1,%4)                  \n"
646   "lea        (%1,%4,2),%1                     \n"
647   "jg         1b                               \n"
648   : "+r"(src),    // %0
649     "+r"(dst),    // %1
650     "+r"(width)   // %2
651   : "r"((intptr_t)(src_stride)),  // %3
652     "r"((intptr_t)(dst_stride))   // %4
653   : "memory", "cc",
654     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
655     "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",  "xmm14",  "xmm15"
656 );
657 }
658
659 #define HAS_TRANSPOSE_UVWX8_SSE2
660 static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
661                                 uint8* dst_a, int dst_stride_a,
662                                 uint8* dst_b, int dst_stride_b,
663                                 int w) {
664   asm volatile (
665   // Read in the data from the source pointer.
666   // First round of bit swap.
667   ".p2align  2                                 \n"
668 "1:                                            \n"
669   "movdqa     (%0),%%xmm0                      \n"
670   "movdqa     (%0,%4),%%xmm1                   \n"
671   "lea        (%0,%4,2),%0                     \n"
672   "movdqa     %%xmm0,%%xmm8                    \n"
673   "punpcklbw  %%xmm1,%%xmm0                    \n"
674   "punpckhbw  %%xmm1,%%xmm8                    \n"
675   "movdqa     %%xmm8,%%xmm1                    \n"
676   "movdqa     (%0),%%xmm2                      \n"
677   "movdqa     (%0,%4),%%xmm3                   \n"
678   "lea        (%0,%4,2),%0                     \n"
679   "movdqa     %%xmm2,%%xmm8                    \n"
680   "punpcklbw  %%xmm3,%%xmm2                    \n"
681   "punpckhbw  %%xmm3,%%xmm8                    \n"
682   "movdqa     %%xmm8,%%xmm3                    \n"
683   "movdqa     (%0),%%xmm4                      \n"
684   "movdqa     (%0,%4),%%xmm5                   \n"
685   "lea        (%0,%4,2),%0                     \n"
686   "movdqa     %%xmm4,%%xmm8                    \n"
687   "punpcklbw  %%xmm5,%%xmm4                    \n"
688   "punpckhbw  %%xmm5,%%xmm8                    \n"
689   "movdqa     %%xmm8,%%xmm5                    \n"
690   "movdqa     (%0),%%xmm6                      \n"
691   "movdqa     (%0,%4),%%xmm7                   \n"
692   "lea        (%0,%4,2),%0                     \n"
693   "movdqa     %%xmm6,%%xmm8                    \n"
694   "punpcklbw  %%xmm7,%%xmm6                    \n"
695   "neg        %4                               \n"
696   "lea        0x10(%0,%4,8),%0                 \n"
697   "punpckhbw  %%xmm7,%%xmm8                    \n"
698   "movdqa     %%xmm8,%%xmm7                    \n"
699   "neg        %4                               \n"
700    // Second round of bit swap.
701   "movdqa     %%xmm0,%%xmm8                    \n"
702   "movdqa     %%xmm1,%%xmm9                    \n"
703   "punpckhwd  %%xmm2,%%xmm8                    \n"
704   "punpckhwd  %%xmm3,%%xmm9                    \n"
705   "punpcklwd  %%xmm2,%%xmm0                    \n"
706   "punpcklwd  %%xmm3,%%xmm1                    \n"
707   "movdqa     %%xmm8,%%xmm2                    \n"
708   "movdqa     %%xmm9,%%xmm3                    \n"
709   "movdqa     %%xmm4,%%xmm8                    \n"
710   "movdqa     %%xmm5,%%xmm9                    \n"
711   "punpckhwd  %%xmm6,%%xmm8                    \n"
712   "punpckhwd  %%xmm7,%%xmm9                    \n"
713   "punpcklwd  %%xmm6,%%xmm4                    \n"
714   "punpcklwd  %%xmm7,%%xmm5                    \n"
715   "movdqa     %%xmm8,%%xmm6                    \n"
716   "movdqa     %%xmm9,%%xmm7                    \n"
717   // Third round of bit swap.
718   // Write to the destination pointer.
719   "movdqa     %%xmm0,%%xmm8                    \n"
720   "punpckldq  %%xmm4,%%xmm0                    \n"
721   "movlpd     %%xmm0,(%1)                      \n"  // Write back U channel
722   "movhpd     %%xmm0,(%2)                      \n"  // Write back V channel
723   "punpckhdq  %%xmm4,%%xmm8                    \n"
724   "movlpd     %%xmm8,(%1,%5)                   \n"
725   "lea        (%1,%5,2),%1                     \n"
726   "movhpd     %%xmm8,(%2,%6)                   \n"
727   "lea        (%2,%6,2),%2                     \n"
728   "movdqa     %%xmm2,%%xmm8                    \n"
729   "punpckldq  %%xmm6,%%xmm2                    \n"
730   "movlpd     %%xmm2,(%1)                      \n"
731   "movhpd     %%xmm2,(%2)                      \n"
732   "punpckhdq  %%xmm6,%%xmm8                    \n"
733   "movlpd     %%xmm8,(%1,%5)                   \n"
734   "lea        (%1,%5,2),%1                     \n"
735   "movhpd     %%xmm8,(%2,%6)                   \n"
736   "lea        (%2,%6,2),%2                     \n"
737   "movdqa     %%xmm1,%%xmm8                    \n"
738   "punpckldq  %%xmm5,%%xmm1                    \n"
739   "movlpd     %%xmm1,(%1)                      \n"
740   "movhpd     %%xmm1,(%2)                      \n"
741   "punpckhdq  %%xmm5,%%xmm8                    \n"
742   "movlpd     %%xmm8,(%1,%5)                   \n"
743   "lea        (%1,%5,2),%1                     \n"
744   "movhpd     %%xmm8,(%2,%6)                   \n"
745   "lea        (%2,%6,2),%2                     \n"
746   "movdqa     %%xmm3,%%xmm8                    \n"
747   "punpckldq  %%xmm7,%%xmm3                    \n"
748   "movlpd     %%xmm3,(%1)                      \n"
749   "movhpd     %%xmm3,(%2)                      \n"
750   "punpckhdq  %%xmm7,%%xmm8                    \n"
751   "sub        $0x8,%3                          \n"
752   "movlpd     %%xmm8,(%1,%5)                   \n"
753   "lea        (%1,%5,2),%1                     \n"
754   "movhpd     %%xmm8,(%2,%6)                   \n"
755   "lea        (%2,%6,2),%2                     \n"
756   "jg         1b                               \n"
757   : "+r"(src),    // %0
758     "+r"(dst_a),  // %1
759     "+r"(dst_b),  // %2
760     "+r"(w)   // %3
761   : "r"((intptr_t)(src_stride)),    // %4
762     "r"((intptr_t)(dst_stride_a)),  // %5
763     "r"((intptr_t)(dst_stride_b))   // %6
764   : "memory", "cc",
765     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
766     "xmm8", "xmm9"
767 );
768 }
769 #endif
770 #endif
771
772 static void TransposeWx8_C(const uint8* src, int src_stride,
773                            uint8* dst, int dst_stride,
774                            int width) {
775   int i;
776   for (i = 0; i < width; ++i) {
777     dst[0] = src[0 * src_stride];
778     dst[1] = src[1 * src_stride];
779     dst[2] = src[2 * src_stride];
780     dst[3] = src[3 * src_stride];
781     dst[4] = src[4 * src_stride];
782     dst[5] = src[5 * src_stride];
783     dst[6] = src[6 * src_stride];
784     dst[7] = src[7 * src_stride];
785     ++src;
786     dst += dst_stride;
787   }
788 }
789
790 static void TransposeWxH_C(const uint8* src, int src_stride,
791                            uint8* dst, int dst_stride,
792                            int width, int height) {
793   int i;
794   for (i = 0; i < width; ++i) {
795     int j;
796     for (j = 0; j < height; ++j) {
797       dst[i * dst_stride + j] = src[j * src_stride + i];
798     }
799   }
800 }
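
// Both C transpose helpers implement dst[i][j] = src[j][i]. As a small
// illustration (values are arbitrary), a 3 wide x 2 tall source
//   | 1 2 3 |              | 1 4 |
//   | 4 5 6 |   becomes    | 2 5 |
//                          | 3 6 |
// so the destination needs width rows of height bytes each.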
801
802 LIBYUV_API
803 void TransposePlane(const uint8* src, int src_stride,
804                     uint8* dst, int dst_stride,
805                     int width, int height) {
806   int i = height;
807   void (*TransposeWx8)(const uint8* src, int src_stride,
808                        uint8* dst, int dst_stride,
809                        int width) = TransposeWx8_C;
810 #if defined(HAS_TRANSPOSE_WX8_NEON)
811   if (TestCpuFlag(kCpuHasNEON)) {
812     TransposeWx8 = TransposeWx8_NEON;
813   }
814 #endif
815 #if defined(HAS_TRANSPOSE_WX8_SSSE3)
816   if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
817     TransposeWx8 = TransposeWx8_SSSE3;
818   }
819 #endif
820 #if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3)
821   if (TestCpuFlag(kCpuHasSSSE3) &&
822       IS_ALIGNED(width, 16) &&
823       IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
824     TransposeWx8 = TransposeWx8_FAST_SSSE3;
825   }
826 #endif
827 #if defined(HAS_TRANSPOSE_WX8_MIPS_DSPR2)
828   if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
829     if (IS_ALIGNED(width, 4) &&
830         IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
831       TransposeWx8 = TransposeWx8_FAST_MIPS_DSPR2;
832     } else {
833       TransposeWx8 = TransposeWx8_MIPS_DSPR2;
834     }
835   }
836 #endif
837
838   // Work across the source in 8x8 tiles
839   while (i >= 8) {
840     TransposeWx8(src, src_stride, dst, dst_stride, width);
841     src += 8 * src_stride;    // Go down 8 rows.
842     dst += 8;                 // Move over 8 columns.
843     i -= 8;
844   }
845
846   TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
847 }
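
// Note: the transposed output of a width x height plane occupies width rows
// of height bytes, so dst_stride must be at least the source height. A minimal
// sketch of a call (buffer names and sizes are hypothetical):
//   uint8 in[64 * 48];
//   uint8 out[48 * 64];
//   TransposePlane(in, 64, out, 48, 64, 48);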
848
849 LIBYUV_API
850 void RotatePlane90(const uint8* src, int src_stride,
851                    uint8* dst, int dst_stride,
852                    int width, int height) {
853   // Rotate by 90 is a transpose with the source read
854   // from bottom to top. So set the source pointer to the end
855   // of the buffer and flip the sign of the source stride.
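  // In pixel terms this is a clockwise rotation: for a width x height source,
  // dst(x, y) = src(y, height - 1 - x), and the destination is height pixels
  // wide and width pixels tall.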
856   src += src_stride * (height - 1);
857   src_stride = -src_stride;
858   TransposePlane(src, src_stride, dst, dst_stride, width, height);
859 }
860
861 LIBYUV_API
862 void RotatePlane270(const uint8* src, int src_stride,
863                     uint8* dst, int dst_stride,
864                     int width, int height) {
865   // Rotate by 270 is a transpose with the destination written
866   // from bottom to top. So set the destination pointer to the end
867   // of the buffer and flip the sign of the destination stride.
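  // In pixel terms this is a counter-clockwise rotation: for a width x height
  // source, dst(x, y) = src(width - 1 - y, x), again giving a destination that
  // is height pixels wide and width pixels tall.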
868   dst += dst_stride * (width - 1);
869   dst_stride = -dst_stride;
870   TransposePlane(src, src_stride, dst, dst_stride, width, height);
871 }
872
873 LIBYUV_API
874 void RotatePlane180(const uint8* src, int src_stride,
875                     uint8* dst, int dst_stride,
876                     int width, int height) {
877   // Swap first and last row and mirror the content. Uses a temporary row.
878   align_buffer_64(row, width);
879   const uint8* src_bot = src + src_stride * (height - 1);
880   uint8* dst_bot = dst + dst_stride * (height - 1);
881   int half_height = (height + 1) >> 1;
882   int y;
883   void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
884   void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
885 #if defined(HAS_MIRRORROW_NEON)
886   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
887     MirrorRow = MirrorRow_NEON;
888   }
889 #endif
890 #if defined(HAS_MIRRORROW_SSE2)
891   if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
892       IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
893       IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
894     MirrorRow = MirrorRow_SSE2;
895   }
896 #endif
897 #if defined(HAS_MIRRORROW_SSSE3)
898   if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
899       IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
900       IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
901     MirrorRow = MirrorRow_SSSE3;
902   }
903 #endif
904 #if defined(HAS_MIRRORROW_AVX2)
905   if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
906     MirrorRow = MirrorRow_AVX2;
907   }
908 #endif
909 #if defined(HAS_MIRRORROW_MIPS_DSPR2)
910   if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
911       IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) &&
912       IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) {
913     MirrorRow = MirrorRow_MIPS_DSPR2;
914   }
915 #endif
916 #if defined(HAS_COPYROW_NEON)
917   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
918     CopyRow = CopyRow_NEON;
919   }
920 #endif
921 #if defined(HAS_COPYROW_X86)
922   if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
923     CopyRow = CopyRow_X86;
924   }
925 #endif
926 #if defined(HAS_COPYROW_SSE2)
927   if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
928       IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
929       IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
930     CopyRow = CopyRow_SSE2;
931   }
932 #endif
933 #if defined(HAS_COPYROW_ERMS)
934   if (TestCpuFlag(kCpuHasERMS)) {
935     CopyRow = CopyRow_ERMS;
936   }
937 #endif
938 #if defined(HAS_COPYROW_MIPS)
939   if (TestCpuFlag(kCpuHasMIPS)) {
940     CopyRow = CopyRow_MIPS;
941   }
942 #endif
943
944   // Odd height will harmlessly mirror the middle row twice.
945   for (y = 0; y < half_height; ++y) {
946     MirrorRow(src, row, width);  // Mirror first row into a buffer
947     src += src_stride;
948     MirrorRow(src_bot, dst, width);  // Mirror last row into first row
949     dst += dst_stride;
950     CopyRow(row, dst_bot, width);  // Copy first mirrored row into last
951     src_bot -= src_stride;
952     dst_bot -= dst_stride;
953   }
954   free_aligned_buffer_64(row);
955 }
956
957 static void TransposeUVWx8_C(const uint8* src, int src_stride,
958                              uint8* dst_a, int dst_stride_a,
959                              uint8* dst_b, int dst_stride_b,
960                              int width) {
961   int i;
962   for (i = 0; i < width; ++i) {
963     dst_a[0] = src[0 * src_stride + 0];
964     dst_b[0] = src[0 * src_stride + 1];
965     dst_a[1] = src[1 * src_stride + 0];
966     dst_b[1] = src[1 * src_stride + 1];
967     dst_a[2] = src[2 * src_stride + 0];
968     dst_b[2] = src[2 * src_stride + 1];
969     dst_a[3] = src[3 * src_stride + 0];
970     dst_b[3] = src[3 * src_stride + 1];
971     dst_a[4] = src[4 * src_stride + 0];
972     dst_b[4] = src[4 * src_stride + 1];
973     dst_a[5] = src[5 * src_stride + 0];
974     dst_b[5] = src[5 * src_stride + 1];
975     dst_a[6] = src[6 * src_stride + 0];
976     dst_b[6] = src[6 * src_stride + 1];
977     dst_a[7] = src[7 * src_stride + 0];
978     dst_b[7] = src[7 * src_stride + 1];
979     src += 2;
980     dst_a += dst_stride_a;
981     dst_b += dst_stride_b;
982   }
983 }
984
985 static void TransposeUVWxH_C(const uint8* src, int src_stride,
986                              uint8* dst_a, int dst_stride_a,
987                              uint8* dst_b, int dst_stride_b,
988                              int width, int height) {
989   int i;
990   for (i = 0; i < width * 2; i += 2) {
991     int j;
992     for (j = 0; j < height; ++j) {
993       dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
994       dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
995     }
996   }
997 }
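
// The UV helpers read interleaved UVUV... rows and write the U and V samples
// to separate transposed planes: dst_a[i][j] = src[j][2 * i] and
// dst_b[i][j] = src[j][2 * i + 1], so each output plane is height bytes wide
// and width rows tall.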
998
999 LIBYUV_API
1000 void TransposeUV(const uint8* src, int src_stride,
1001                  uint8* dst_a, int dst_stride_a,
1002                  uint8* dst_b, int dst_stride_b,
1003                  int width, int height) {
1004   int i = height;
1005   void (*TransposeUVWx8)(const uint8* src, int src_stride,
1006                          uint8* dst_a, int dst_stride_a,
1007                          uint8* dst_b, int dst_stride_b,
1008                          int width) = TransposeUVWx8_C;
1009 #if defined(HAS_TRANSPOSE_UVWX8_NEON)
1010   if (TestCpuFlag(kCpuHasNEON)) {
1011     TransposeUVWx8 = TransposeUVWx8_NEON;
1012   }
1013 #elif defined(HAS_TRANSPOSE_UVWX8_SSE2)
1014   if (TestCpuFlag(kCpuHasSSE2) &&
1015       IS_ALIGNED(width, 8) &&
1016       IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
1017     TransposeUVWx8 = TransposeUVWx8_SSE2;
1018   }
1019 #elif defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2)
1020   if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) &&
1021       IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
1022     TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2;
1023   }
1024 #endif
1025
1026   // Work through the source in 8x8 tiles.
1027   while (i >= 8) {
1028     TransposeUVWx8(src, src_stride,
1029                    dst_a, dst_stride_a,
1030                    dst_b, dst_stride_b,
1031                    width);
1032     src += 8 * src_stride;    // Go down 8 rows.
1033     dst_a += 8;               // Move over 8 columns.
1034     dst_b += 8;               // Move over 8 columns.
1035     i -= 8;
1036   }
1037
1038   TransposeUVWxH_C(src, src_stride,
1039                    dst_a, dst_stride_a,
1040                    dst_b, dst_stride_b,
1041                    width, i);
1042 }
1043
1044 LIBYUV_API
1045 void RotateUV90(const uint8* src, int src_stride,
1046                 uint8* dst_a, int dst_stride_a,
1047                 uint8* dst_b, int dst_stride_b,
1048                 int width, int height) {
1049   src += src_stride * (height - 1);
1050   src_stride = -src_stride;
1051
1052   TransposeUV(src, src_stride,
1053               dst_a, dst_stride_a,
1054               dst_b, dst_stride_b,
1055               width, height);
1056 }
1057
1058 LIBYUV_API
1059 void RotateUV270(const uint8* src, int src_stride,
1060                  uint8* dst_a, int dst_stride_a,
1061                  uint8* dst_b, int dst_stride_b,
1062                  int width, int height) {
1063   dst_a += dst_stride_a * (width - 1);
1064   dst_b += dst_stride_b * (width - 1);
1065   dst_stride_a = -dst_stride_a;
1066   dst_stride_b = -dst_stride_b;
1067
1068   TransposeUV(src, src_stride,
1069               dst_a, dst_stride_a,
1070               dst_b, dst_stride_b,
1071               width, height);
1072 }
1073
1074 // Rotate 180 is a horizontal and vertical flip.
1075 LIBYUV_API
1076 void RotateUV180(const uint8* src, int src_stride,
1077                  uint8* dst_a, int dst_stride_a,
1078                  uint8* dst_b, int dst_stride_b,
1079                  int width, int height) {
1080   int i;
1081   void (*MirrorRowUV)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
1082       MirrorUVRow_C;
1083 #if defined(HAS_MIRRORUVROW_NEON)
1084   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
1085     MirrorRowUV = MirrorUVRow_NEON;
1086   }
1087 #elif defined(HAS_MIRRORROW_UV_SSSE3)
1088   if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
1089       IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
1090     MirrorRowUV = MirrorUVRow_SSSE3;
1091   }
1092 #elif defined(HAS_MIRRORUVROW_MIPS_DSPR2)
1093   if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
1094       IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
1095     MirrorRowUV = MirrorUVRow_MIPS_DSPR2;
1096   }
1097 #endif
1098
1099   dst_a += dst_stride_a * (height - 1);
1100   dst_b += dst_stride_b * (height - 1);
1101
1102   for (i = 0; i < height; ++i) {
1103     MirrorRowUV(src, dst_a, dst_b, width);
1104     src += src_stride;
1105     dst_a -= dst_stride_a;
1106     dst_b -= dst_stride_b;
1107   }
1108 }
1109
1110 LIBYUV_API
1111 int RotatePlane(const uint8* src, int src_stride,
1112                 uint8* dst, int dst_stride,
1113                 int width, int height,
1114                 enum RotationMode mode) {
1115   if (!src || width <= 0 || height == 0 || !dst) {
1116     return -1;
1117   }
1118
1119   // Negative height means invert the image.
1120   if (height < 0) {
1121     height = -height;
1122     src = src + (height - 1) * src_stride;
1123     src_stride = -src_stride;
1124   }
1125
1126   switch (mode) {
1127     case kRotate0:
1128       // copy frame
1129       CopyPlane(src, src_stride,
1130                 dst, dst_stride,
1131                 width, height);
1132       return 0;
1133     case kRotate90:
1134       RotatePlane90(src, src_stride,
1135                     dst, dst_stride,
1136                     width, height);
1137       return 0;
1138     case kRotate270:
1139       RotatePlane270(src, src_stride,
1140                      dst, dst_stride,
1141                      width, height);
1142       return 0;
1143     case kRotate180:
1144       RotatePlane180(src, src_stride,
1145                      dst, dst_stride,
1146                      width, height);
1147       return 0;
1148     default:
1149       break;
1150   }
1151   return -1;
1152 }
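
// Minimal usage sketch (buffer names and dimensions are illustrative only):
// rotating a 640 x 480 plane by 90 or 270 degrees produces a 480 x 640 plane,
// so the destination stride below is the source height.
//   uint8 src_plane[640 * 480];
//   uint8 dst_plane[480 * 640];
//   RotatePlane(src_plane, 640, dst_plane, 480, 640, 480, kRotate90);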
1153
1154 LIBYUV_API
1155 int I420Rotate(const uint8* src_y, int src_stride_y,
1156                const uint8* src_u, int src_stride_u,
1157                const uint8* src_v, int src_stride_v,
1158                uint8* dst_y, int dst_stride_y,
1159                uint8* dst_u, int dst_stride_u,
1160                uint8* dst_v, int dst_stride_v,
1161                int width, int height,
1162                enum RotationMode mode) {
1163   int halfwidth = (width + 1) >> 1;
1164   int halfheight = (height + 1) >> 1;
1165   if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
1166       !dst_y || !dst_u || !dst_v) {
1167     return -1;
1168   }
1169
1170   // Negative height means invert the image.
1171   if (height < 0) {
1172     height = -height;
1173     halfheight = (height + 1) >> 1;
1174     src_y = src_y + (height - 1) * src_stride_y;
1175     src_u = src_u + (halfheight - 1) * src_stride_u;
1176     src_v = src_v + (halfheight - 1) * src_stride_v;
1177     src_stride_y = -src_stride_y;
1178     src_stride_u = -src_stride_u;
1179     src_stride_v = -src_stride_v;
1180   }
1181
1182   switch (mode) {
1183     case kRotate0:
1184       // copy frame
1185       return I420Copy(src_y, src_stride_y,
1186                       src_u, src_stride_u,
1187                       src_v, src_stride_v,
1188                       dst_y, dst_stride_y,
1189                       dst_u, dst_stride_u,
1190                       dst_v, dst_stride_v,
1191                       width, height);
1192     case kRotate90:
1193       RotatePlane90(src_y, src_stride_y,
1194                     dst_y, dst_stride_y,
1195                     width, height);
1196       RotatePlane90(src_u, src_stride_u,
1197                     dst_u, dst_stride_u,
1198                     halfwidth, halfheight);
1199       RotatePlane90(src_v, src_stride_v,
1200                     dst_v, dst_stride_v,
1201                     halfwidth, halfheight);
1202       return 0;
1203     case kRotate270:
1204       RotatePlane270(src_y, src_stride_y,
1205                      dst_y, dst_stride_y,
1206                      width, height);
1207       RotatePlane270(src_u, src_stride_u,
1208                      dst_u, dst_stride_u,
1209                      halfwidth, halfheight);
1210       RotatePlane270(src_v, src_stride_v,
1211                      dst_v, dst_stride_v,
1212                      halfwidth, halfheight);
1213       return 0;
1214     case kRotate180:
1215       RotatePlane180(src_y, src_stride_y,
1216                      dst_y, dst_stride_y,
1217                      width, height);
1218       RotatePlane180(src_u, src_stride_u,
1219                      dst_u, dst_stride_u,
1220                      halfwidth, halfheight);
1221       RotatePlane180(src_v, src_stride_v,
1222                      dst_v, dst_stride_v,
1223                      halfwidth, halfheight);
1224       return 0;
1225     default:
1226       break;
1227   }
1228   return -1;
1229 }
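
// For kRotate90 and kRotate270 the destination planes have swapped dimensions:
// dst_stride_y must hold at least height bytes per row, and dst_stride_u /
// dst_stride_v at least halfheight bytes per row, since the rotated chroma
// planes are halfheight wide and halfwidth tall.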
1230
1231 LIBYUV_API
1232 int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
1233                      const uint8* src_uv, int src_stride_uv,
1234                      uint8* dst_y, int dst_stride_y,
1235                      uint8* dst_u, int dst_stride_u,
1236                      uint8* dst_v, int dst_stride_v,
1237                      int width, int height,
1238                      enum RotationMode mode) {
1239   int halfwidth = (width + 1) >> 1;
1240   int halfheight = (height + 1) >> 1;
1241   if (!src_y || !src_uv || width <= 0 || height == 0 ||
1242       !dst_y || !dst_u || !dst_v) {
1243     return -1;
1244   }
1245
1246   // Negative height means invert the image.
1247   if (height < 0) {
1248     height = -height;
1249     halfheight = (height + 1) >> 1;
1250     src_y = src_y + (height - 1) * src_stride_y;
1251     src_uv = src_uv + (halfheight - 1) * src_stride_uv;
1252     src_stride_y = -src_stride_y;
1253     src_stride_uv = -src_stride_uv;
1254   }
1255
1256   switch (mode) {
1257     case kRotate0:
1258       // copy frame
1259       return NV12ToI420(src_y, src_stride_y,
1260                         src_uv, src_stride_uv,
1261                         dst_y, dst_stride_y,
1262                         dst_u, dst_stride_u,
1263                         dst_v, dst_stride_v,
1264                         width, height);
1265     case kRotate90:
1266       RotatePlane90(src_y, src_stride_y,
1267                     dst_y, dst_stride_y,
1268                     width, height);
1269       RotateUV90(src_uv, src_stride_uv,
1270                  dst_u, dst_stride_u,
1271                  dst_v, dst_stride_v,
1272                  halfwidth, halfheight);
1273       return 0;
1274     case kRotate270:
1275       RotatePlane270(src_y, src_stride_y,
1276                      dst_y, dst_stride_y,
1277                      width, height);
1278       RotateUV270(src_uv, src_stride_uv,
1279                   dst_u, dst_stride_u,
1280                   dst_v, dst_stride_v,
1281                   halfwidth, halfheight);
1282       return 0;
1283     case kRotate180:
1284       RotatePlane180(src_y, src_stride_y,
1285                      dst_y, dst_stride_y,
1286                      width, height);
1287       RotateUV180(src_uv, src_stride_uv,
1288                   dst_u, dst_stride_u,
1289                   dst_v, dst_stride_v,
1290                   halfwidth, halfheight);
1291       return 0;
1292     default:
1293       break;
1294   }
1295   return -1;
1296 }
1297
1298 #ifdef __cplusplus
1299 }  // extern "C"
1300 }  // namespace libyuv
1301 #endif