src/dsp/dec_neon.c

   1 // Copyright 2012 Google Inc. All Rights Reserved.
   2 //
   3 // Use of this source code is governed by a BSD-style license
   4 // that can be found in the COPYING file in the root of the source
   5 // tree. An additional intellectual property rights grant can be found
   6 // in the file PATENTS. All contributing project authors may
   7 // be found in the AUTHORS file in the root of the source tree.
   8 // -----------------------------------------------------------------------------
   9 //
  10 // ARM NEON version of dsp functions and loop filtering.
  11 //
  12 // Authors: Somnath Banerjee (somnath@google.com)
  13 //          Johann Koenig (johannkoenig@google.com)
  14
  15 #include <unistd.h>
  16
  17 #include "src/dsp/dsp.h"
  18
  19 #if defined(WEBP_USE_NEON)
  20
  21 #include "src/dsp/neon.h"
  22 #include "src/dec/vp8i_dec.h"
  23
  24 //------------------------------------------------------------------------------
  25 // NxM Loading functions
  26
  27 #if !defined(WORK_AROUND_GCC)
  28
  29 // This intrinsics version makes gcc-4.6.3 crash during Load4x??() compilation
  30 // (register alloc, probably). The variants somewhat mitigate the problem, but
  31 // not quite. HFilter16i() remains problematic.
  32 static WEBP_INLINE uint8x8x4_t Load4x8_NEON(const uint8_t* const src,
  33                                             int stride) {
  34   const uint8x8_t zero = vdup_n_u8(0);
  35   uint8x8x4_t out;
  36   INIT_VECTOR4(out, zero, zero, zero, zero);
  37   out = vld4_lane_u8(src + 0 * stride, out, 0);
  38   out = vld4_lane_u8(src + 1 * stride, out, 1);
  39   out = vld4_lane_u8(src + 2 * stride, out, 2);
  40   out = vld4_lane_u8(src + 3 * stride, out, 3);
  41   out = vld4_lane_u8(src + 4 * stride, out, 4);
  42   out = vld4_lane_u8(src + 5 * stride, out, 5);
  43   out = vld4_lane_u8(src + 6 * stride, out, 6);
  44   out = vld4_lane_u8(src + 7 * stride, out, 7);
  45   return out;
  46 }
  47
  48 static WEBP_INLINE void Load4x16_NEON(const uint8_t* const src, int stride,
  49                                       uint8x16_t* const p1,
  50                                       uint8x16_t* const p0,
  51                                       uint8x16_t* const q0,
  52                                       uint8x16_t* const q1) {
  53   // row0 = p1[0..7]|p0[0..7]|q0[0..7]|q1[0..7]
  54   // row8 = p1[8..15]|p0[8..15]|q0[8..15]|q1[8..15]
  55   const uint8x8x4_t row0 = Load4x8_NEON(src - 2 + 0 * stride, stride);
  56   const uint8x8x4_t row8 = Load4x8_NEON(src - 2 + 8 * stride, stride);
  57   *p1 = vcombine_u8(row0.val[0], row8.val[0]);
  58   *p0 = vcombine_u8(row0.val[1], row8.val[1]);
  59   *q0 = vcombine_u8(row0.val[2], row8.val[2]);
  60   *q1 = vcombine_u8(row0.val[3], row8.val[3]);
  61 }
  62
  63 #else  // WORK_AROUND_GCC
  64
  65 #define LOADQ_LANE_32b(VALUE, LANE) do {                             \
  66   (VALUE) = vld1q_lane_u32((const uint32_t*)src, (VALUE), (LANE));   \
  67   src += stride;                                                     \
  68 } while (0)
  69
  70 static WEBP_INLINE void Load4x16_NEON(const uint8_t* src, int stride,
  71                                       uint8x16_t* const p1,
  72                                       uint8x16_t* const p0,
  73                                       uint8x16_t* const q0,
  74                                       uint8x16_t* const q1) {
  75   const uint32x4_t zero = vdupq_n_u32(0);
  76   uint32x4x4_t in;
  77   INIT_VECTOR4(in, zero, zero, zero, zero);
  78   src -= 2;
  79   LOADQ_LANE_32b(in.val[0], 0);
  80   LOADQ_LANE_32b(in.val[1], 0);
  81   LOADQ_LANE_32b(in.val[2], 0);
  82   LOADQ_LANE_32b(in.val[3], 0);
  83   LOADQ_LANE_32b(in.val[0], 1);
  84   LOADQ_LANE_32b(in.val[1], 1);
  85   LOADQ_LANE_32b(in.val[2], 1);
  86   LOADQ_LANE_32b(in.val[3], 1);
  87   LOADQ_LANE_32b(in.val[0], 2);
  88   LOADQ_LANE_32b(in.val[1], 2);
  89   LOADQ_LANE_32b(in.val[2], 2);
  90   LOADQ_LANE_32b(in.val[3], 2);
  91   LOADQ_LANE_32b(in.val[0], 3);
  92   LOADQ_LANE_32b(in.val[1], 3);
  93   LOADQ_LANE_32b(in.val[2], 3);
  94   LOADQ_LANE_32b(in.val[3], 3);
  95   // Transpose four 4x4 parts:
  96   {
  97     const uint8x16x2_t row01 = vtrnq_u8(vreinterpretq_u8_u32(in.val[0]),
  98                                         vreinterpretq_u8_u32(in.val[1]));
  99     const uint8x16x2_t row23 = vtrnq_u8(vreinterpretq_u8_u32(in.val[2]),
 100                                         vreinterpretq_u8_u32(in.val[3]));
 101     const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
 102                                          vreinterpretq_u16_u8(row23.val[0]));
 103     const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
 104                                          vreinterpretq_u16_u8(row23.val[1]));
 105     *p1 = vreinterpretq_u8_u16(row02.val[0]);
 106     *p0 = vreinterpretq_u8_u16(row13.val[0]);
 107     *q0 = vreinterpretq_u8_u16(row02.val[1]);
 108     *q1 = vreinterpretq_u8_u16(row13.val[1]);
 109   }
 110 }
 111 #undef LOADQ_LANE_32b
 112
 113 #endif  // !WORK_AROUND_GCC
 114
 115 static WEBP_INLINE void Load8x16_NEON(
 116     const uint8_t* const src, int stride,
 117     uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
 118     uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
 119     uint8x16_t* const q2, uint8x16_t* const q3) {
 120   Load4x16_NEON(src - 2, stride, p3, p2, p1, p0);
 121   Load4x16_NEON(src + 2, stride, q0, q1, q2, q3);
 122 }
 123
 124 static WEBP_INLINE void Load16x4_NEON(const uint8_t* const src, int stride,
 125                                       uint8x16_t* const p1,
 126                                       uint8x16_t* const p0,
 127                                       uint8x16_t* const q0,
 128                                       uint8x16_t* const q1) {
 129   *p1 = vld1q_u8(src - 2 * stride);
 130   *p0 = vld1q_u8(src - 1 * stride);
 131   *q0 = vld1q_u8(src + 0 * stride);
 132   *q1 = vld1q_u8(src + 1 * stride);
 133 }
 134
 135 static WEBP_INLINE void Load16x8_NEON(
 136     const uint8_t* const src, int stride,
 137     uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
 138     uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
 139     uint8x16_t* const q2, uint8x16_t* const q3) {
 140   Load16x4_NEON(src - 2  * stride, stride, p3, p2, p1, p0);
 141   Load16x4_NEON(src + 2  * stride, stride, q0, q1, q2, q3);
 142 }
 143
 144 static WEBP_INLINE void Load8x8x2_NEON(
 145     const uint8_t* const u, const uint8_t* const v, int stride,
 146     uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
 147     uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
 148     uint8x16_t* const q2, uint8x16_t* const q3) {
 149   // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
 150   // and the v-samples on the higher half.
 151   *p3 = vcombine_u8(vld1_u8(u - 4 * stride), vld1_u8(v - 4 * stride));
 152   *p2 = vcombine_u8(vld1_u8(u - 3 * stride), vld1_u8(v - 3 * stride));
 153   *p1 = vcombine_u8(vld1_u8(u - 2 * stride), vld1_u8(v - 2 * stride));
 154   *p0 = vcombine_u8(vld1_u8(u - 1 * stride), vld1_u8(v - 1 * stride));
 155   *q0 = vcombine_u8(vld1_u8(u + 0 * stride), vld1_u8(v + 0 * stride));
 156   *q1 = vcombine_u8(vld1_u8(u + 1 * stride), vld1_u8(v + 1 * stride));
 157   *q2 = vcombine_u8(vld1_u8(u + 2 * stride), vld1_u8(v + 2 * stride));
 158   *q3 = vcombine_u8(vld1_u8(u + 3 * stride), vld1_u8(v + 3 * stride));
 159 }
 160
 161 #if !defined(WORK_AROUND_GCC)
 162
 163 #define LOAD_UV_8(ROW) \
 164   vcombine_u8(vld1_u8(u - 4 + (ROW) * stride), vld1_u8(v - 4 + (ROW) * stride))
 165
 166 static WEBP_INLINE void Load8x8x2T_NEON(
 167     const uint8_t* const u, const uint8_t* const v, int stride,
 168     uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
 169     uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
 170     uint8x16_t* const q2, uint8x16_t* const q3) {
 171   // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
 172   // and the v-samples on the higher half.
 173   const uint8x16_t row0 = LOAD_UV_8(0);
 174   const uint8x16_t row1 = LOAD_UV_8(1);
 175   const uint8x16_t row2 = LOAD_UV_8(2);
 176   const uint8x16_t row3 = LOAD_UV_8(3);
 177   const uint8x16_t row4 = LOAD_UV_8(4);
 178   const uint8x16_t row5 = LOAD_UV_8(5);
 179   const uint8x16_t row6 = LOAD_UV_8(6);
 180   const uint8x16_t row7 = LOAD_UV_8(7);
 181   // Perform two side-by-side 8x8 transposes
 182   // u00 u01 u02 u03 u04 u05 u06 u07 | v00 v01 v02 v03 v04 v05 v06 v07
 183   // u10 u11 u12 u13 u14 u15 u16 u17 | v10 v11 v12 ...
 184   // u20 u21 u22 u23 u24 u25 u26 u27 | v20 v21 ...
 185   // u30 u31 u32 u33 u34 u35 u36 u37 | ...
 186   // u40 u41 u42 u43 u44 u45 u46 u47 | ...
 187   // u50 u51 u52 u53 u54 u55 u56 u57 | ...
 188   // u60 u61 u62 u63 u64 u65 u66 u67 | v60 ...
 189   // u70 u71 u72 u73 u74 u75 u76 u77 | v70 v71 v72 ...
 190   const uint8x16x2_t row01 = vtrnq_u8(row0, row1);  // u00 u10 u02 u12 ...
 191                                                     // u01 u11 u03 u13 ...
 192   const uint8x16x2_t row23 = vtrnq_u8(row2, row3);  // u20 u30 u22 u32 ...
 193                                                     // u21 u31 u23 u33 ...
 194   const uint8x16x2_t row45 = vtrnq_u8(row4, row5);  // ...
 195   const uint8x16x2_t row67 = vtrnq_u8(row6, row7);  // ...
 196   const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
 197                                        vreinterpretq_u16_u8(row23.val[0]));
 198   const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
 199                                        vreinterpretq_u16_u8(row23.val[1]));
 200   const uint16x8x2_t row46 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[0]),
 201                                        vreinterpretq_u16_u8(row67.val[0]));
 202   const uint16x8x2_t row57 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[1]),
 203                                        vreinterpretq_u16_u8(row67.val[1]));
 204   const uint32x4x2_t row04 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[0]),
 205                                        vreinterpretq_u32_u16(row46.val[0]));
 206   const uint32x4x2_t row26 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[1]),
 207                                        vreinterpretq_u32_u16(row46.val[1]));
 208   const uint32x4x2_t row15 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[0]),
 209                                        vreinterpretq_u32_u16(row57.val[0]));
 210   const uint32x4x2_t row37 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[1]),
 211                                        vreinterpretq_u32_u16(row57.val[1]));
 212   *p3 = vreinterpretq_u8_u32(row04.val[0]);
 213   *p2 = vreinterpretq_u8_u32(row15.val[0]);
 214   *p1 = vreinterpretq_u8_u32(row26.val[0]);
 215   *p0 = vreinterpretq_u8_u32(row37.val[0]);
 216   *q0 = vreinterpretq_u8_u32(row04.val[1]);
 217   *q1 = vreinterpretq_u8_u32(row15.val[1]);
 218   *q2 = vreinterpretq_u8_u32(row26.val[1]);
 219   *q3 = vreinterpretq_u8_u32(row37.val[1]);
 220 }
 221 #undef LOAD_UV_8
 222
 223 #endif  // !WORK_AROUND_GCC
 224
 225 static WEBP_INLINE void Store2x8_NEON(const uint8x8x2_t v,
 226                                       uint8_t* const dst, int stride) {
 227   vst2_lane_u8(dst + 0 * stride, v, 0);
 228   vst2_lane_u8(dst + 1 * stride, v, 1);
 229   vst2_lane_u8(dst + 2 * stride, v, 2);
 230   vst2_lane_u8(dst + 3 * stride, v, 3);
 231   vst2_lane_u8(dst + 4 * stride, v, 4);
 232   vst2_lane_u8(dst + 5 * stride, v, 5);
 233   vst2_lane_u8(dst + 6 * stride, v, 6);
 234   vst2_lane_u8(dst + 7 * stride, v, 7);
 235 }
 236
 237 static WEBP_INLINE void Store2x16_NEON(const uint8x16_t p0, const uint8x16_t q0,
 238                                        uint8_t* const dst, int stride) {
 239   uint8x8x2_t lo, hi;
 240   lo.val[0] = vget_low_u8(p0);
 241   lo.val[1] = vget_low_u8(q0);
 242   hi.val[0] = vget_high_u8(p0);
 243   hi.val[1] = vget_high_u8(q0);
 244   Store2x8_NEON(lo, dst - 1 + 0 * stride, stride);
 245   Store2x8_NEON(hi, dst - 1 + 8 * stride, stride);
 246 }
 247
 248 #if !defined(WORK_AROUND_GCC)
 249 static WEBP_INLINE void Store4x8_NEON(const uint8x8x4_t v,
 250                                       uint8_t* const dst, int stride) {
 251   vst4_lane_u8(dst + 0 * stride, v, 0);
 252   vst4_lane_u8(dst + 1 * stride, v, 1);
 253   vst4_lane_u8(dst + 2 * stride, v, 2);
 254   vst4_lane_u8(dst + 3 * stride, v, 3);
 255   vst4_lane_u8(dst + 4 * stride, v, 4);
 256   vst4_lane_u8(dst + 5 * stride, v, 5);
 257   vst4_lane_u8(dst + 6 * stride, v, 6);
 258   vst4_lane_u8(dst + 7 * stride, v, 7);
 259 }
 260
 261 static WEBP_INLINE void Store4x16_NEON(const uint8x16_t p1, const uint8x16_t p0,
 262                                        const uint8x16_t q0, const uint8x16_t q1,
 263                                        uint8_t* const dst, int stride) {
 264   uint8x8x4_t lo, hi;
 265   INIT_VECTOR4(lo,
 266                vget_low_u8(p1), vget_low_u8(p0),
 267                vget_low_u8(q0), vget_low_u8(q1));
 268   INIT_VECTOR4(hi,
 269                vget_high_u8(p1), vget_high_u8(p0),
 270                vget_high_u8(q0), vget_high_u8(q1));
 271   Store4x8_NEON(lo, dst - 2 + 0 * stride, stride);
 272   Store4x8_NEON(hi, dst - 2 + 8 * stride, stride);
 273 }
 274 #endif  // !WORK_AROUND_GCC
 275
 276 static WEBP_INLINE void Store16x2_NEON(const uint8x16_t p0, const uint8x16_t q0,
 277                                        uint8_t* const dst, int stride) {
 278   vst1q_u8(dst - stride, p0);
 279   vst1q_u8(dst, q0);
 280 }
 281
 282 static WEBP_INLINE void Store16x4_NEON(const uint8x16_t p1, const uint8x16_t p0,
 283                                        const uint8x16_t q0, const uint8x16_t q1,
 284                                        uint8_t* const dst, int stride) {
 285   Store16x2_NEON(p1, p0, dst - stride, stride);
 286   Store16x2_NEON(q0, q1, dst + stride, stride);
 287 }
 288
 289 static WEBP_INLINE void Store8x2x2_NEON(const uint8x16_t p0,
 290                                         const uint8x16_t q0,
 291                                         uint8_t* const u, uint8_t* const v,
 292                                         int stride) {
 293   // p0 and q0 contain the u+v samples packed in low/high halves.
 294   vst1_u8(u - stride, vget_low_u8(p0));
 295   vst1_u8(u,          vget_low_u8(q0));
 296   vst1_u8(v - stride, vget_high_u8(p0));
 297   vst1_u8(v,          vget_high_u8(q0));
 298 }
 299
 300 static WEBP_INLINE void Store8x4x2_NEON(const uint8x16_t p1,
 301                                         const uint8x16_t p0,
 302                                         const uint8x16_t q0,
 303                                         const uint8x16_t q1,
 304                                         uint8_t* const u, uint8_t* const v,
 305                                         int stride) {
 306   // The p1...q1 registers contain the u+v samples packed in low/high halves.
 307   Store8x2x2_NEON(p1, p0, u - stride, v - stride, stride);
 308   Store8x2x2_NEON(q0, q1, u + stride, v + stride, stride);
 309 }
 310
 311 #if !defined(WORK_AROUND_GCC)
 312
 313 #define STORE6_LANE(DST, VAL0, VAL1, LANE) do {   \
 314   vst3_lane_u8((DST) - 3, (VAL0), (LANE));        \
 315   vst3_lane_u8((DST) + 0, (VAL1), (LANE));        \
 316   (DST) += stride;                                \
 317 } while (0)
 318
 319 static WEBP_INLINE void Store6x8x2_NEON(
 320     const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
 321     const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
 322     uint8_t* u, uint8_t* v, int stride) {
 323   uint8x8x3_t u0, u1, v0, v1;
 324   INIT_VECTOR3(u0, vget_low_u8(p2), vget_low_u8(p1), vget_low_u8(p0));
 325   INIT_VECTOR3(u1, vget_low_u8(q0), vget_low_u8(q1), vget_low_u8(q2));
 326   INIT_VECTOR3(v0, vget_high_u8(p2), vget_high_u8(p1), vget_high_u8(p0));
 327   INIT_VECTOR3(v1, vget_high_u8(q0), vget_high_u8(q1), vget_high_u8(q2));
 328   STORE6_LANE(u, u0, u1, 0);
 329   STORE6_LANE(u, u0, u1, 1);
 330   STORE6_LANE(u, u0, u1, 2);
 331   STORE6_LANE(u, u0, u1, 3);
 332   STORE6_LANE(u, u0, u1, 4);
 333   STORE6_LANE(u, u0, u1, 5);
 334   STORE6_LANE(u, u0, u1, 6);
 335   STORE6_LANE(u, u0, u1, 7);
 336   STORE6_LANE(v, v0, v1, 0);
 337   STORE6_LANE(v, v0, v1, 1);
 338   STORE6_LANE(v, v0, v1, 2);
 339   STORE6_LANE(v, v0, v1, 3);
 340   STORE6_LANE(v, v0, v1, 4);
 341   STORE6_LANE(v, v0, v1, 5);
 342   STORE6_LANE(v, v0, v1, 6);
 343   STORE6_LANE(v, v0, v1, 7);
 344 }
 345 #undef STORE6_LANE
 346
 347 static WEBP_INLINE void Store4x8x2_NEON(const uint8x16_t p1,
 348                                         const uint8x16_t p0,
 349                                         const uint8x16_t q0,
 350                                         const uint8x16_t q1,
 351                                         uint8_t* const u, uint8_t* const v,
 352                                         int stride) {
 353   uint8x8x4_t u0, v0;
 354   INIT_VECTOR4(u0,
 355                vget_low_u8(p1), vget_low_u8(p0),
 356                vget_low_u8(q0), vget_low_u8(q1));
 357   INIT_VECTOR4(v0,
 358                vget_high_u8(p1), vget_high_u8(p0),
 359                vget_high_u8(q0), vget_high_u8(q1));
 360   vst4_lane_u8(u - 2 + 0 * stride, u0, 0);
 361   vst4_lane_u8(u - 2 + 1 * stride, u0, 1);
 362   vst4_lane_u8(u - 2 + 2 * stride, u0, 2);
 363   vst4_lane_u8(u - 2 + 3 * stride, u0, 3);
 364   vst4_lane_u8(u - 2 + 4 * stride, u0, 4);
 365   vst4_lane_u8(u - 2 + 5 * stride, u0, 5);
 366   vst4_lane_u8(u - 2 + 6 * stride, u0, 6);
 367   vst4_lane_u8(u - 2 + 7 * stride, u0, 7);
 368   vst4_lane_u8(v - 2 + 0 * stride, v0, 0);
 369   vst4_lane_u8(v - 2 + 1 * stride, v0, 1);
 370   vst4_lane_u8(v - 2 + 2 * stride, v0, 2);
 371   vst4_lane_u8(v - 2 + 3 * stride, v0, 3);
 372   vst4_lane_u8(v - 2 + 4 * stride, v0, 4);
 373   vst4_lane_u8(v - 2 + 5 * stride, v0, 5);
 374   vst4_lane_u8(v - 2 + 6 * stride, v0, 6);
 375   vst4_lane_u8(v - 2 + 7 * stride, v0, 7);
 376 }
 377
 378 #endif  // !WORK_AROUND_GCC
 379
 380 // Zero extend 'v' to an int16x8_t.
 381 static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint8x8_t v) {
 382   return vreinterpretq_s16_u16(vmovl_u8(v));
 383 }
 384
 385 // Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
 386 // to the corresponding rows of 'dst'.
 387 static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
 388                                                  const int16x8_t dst01,
 389                                                  const int16x8_t dst23) {
 390   // Unsigned saturate to 8b.
 391   const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
 392   const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
 393
 394   // Store the results.
 395   vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
 396   vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
 397   vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
 398   vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
 399 }
 400
 401 static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
 402                                     const int16x8_t row23,
 403                                     uint8_t* const dst) {
 404   uint32x2_t dst01 = vdup_n_u32(0);
 405   uint32x2_t dst23 = vdup_n_u32(0);
 406
 407   // Load the source pixels.
 408   dst01 = vld1_lane_u32((uint32_t*)(dst + 0 * BPS), dst01, 0);
 409   dst23 = vld1_lane_u32((uint32_t*)(dst + 2 * BPS), dst23, 0);
 410   dst01 = vld1_lane_u32((uint32_t*)(dst + 1 * BPS), dst01, 1);
 411   dst23 = vld1_lane_u32((uint32_t*)(dst + 3 * BPS), dst23, 1);
 412
 413   {
 414     // Convert to 16b.
 415     const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst01));
 416     const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst23));
 417
 418     // Descale with rounding.
 419     const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
 420     const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
 421     // Add the inverse transform.
 422     SaturateAndStore4x4_NEON(dst, out01, out23);
 423   }
 424 }
 425
 426 //-----------------------------------------------------------------------------
 427 // Simple In-loop filtering (Paragraph 15.2)
 428
 429 static uint8x16_t NeedsFilter_NEON(const uint8x16_t p1, const uint8x16_t p0,
 430                                    const uint8x16_t q0, const uint8x16_t q1,
 431                                    int thresh) {
 432   const uint8x16_t thresh_v = vdupq_n_u8((uint8_t)thresh);
 433   const uint8x16_t a_p0_q0 = vabdq_u8(p0, q0);               // abs(p0-q0)
 434   const uint8x16_t a_p1_q1 = vabdq_u8(p1, q1);               // abs(p1-q1)
 435   const uint8x16_t a_p0_q0_2 = vqaddq_u8(a_p0_q0, a_p0_q0);  // 2 * abs(p0-q0)
 436   const uint8x16_t a_p1_q1_2 = vshrq_n_u8(a_p1_q1, 1);       // abs(p1-q1) / 2
 437   const uint8x16_t sum = vqaddq_u8(a_p0_q0_2, a_p1_q1_2);
 438   const uint8x16_t mask = vcgeq_u8(thresh_v, sum);
 439   return mask;
 440 }
 441
 442 static int8x16_t FlipSign_NEON(const uint8x16_t v) {
 443   const uint8x16_t sign_bit = vdupq_n_u8(0x80);
 444   return vreinterpretq_s8_u8(veorq_u8(v, sign_bit));
 445 }
 446
 447 static uint8x16_t FlipSignBack_NEON(const int8x16_t v) {
 448   const int8x16_t sign_bit = vdupq_n_s8(0x80);
 449   return vreinterpretq_u8_s8(veorq_s8(v, sign_bit));
 450 }
 451
 452 static int8x16_t GetBaseDelta_NEON(const int8x16_t p1, const int8x16_t p0,
 453                                    const int8x16_t q0, const int8x16_t q1) {
 454   const int8x16_t q0_p0 = vqsubq_s8(q0, p0);      // (q0-p0)
 455   const int8x16_t p1_q1 = vqsubq_s8(p1, q1);      // (p1-q1)
 456   const int8x16_t s1 = vqaddq_s8(p1_q1, q0_p0);   // (p1-q1) + 1 * (q0 - p0)
 457   const int8x16_t s2 = vqaddq_s8(q0_p0, s1);      // (p1-q1) + 2 * (q0 - p0)
 458   const int8x16_t s3 = vqaddq_s8(q0_p0, s2);      // (p1-q1) + 3 * (q0 - p0)
 459   return s3;
 460 }
 461
 462 static int8x16_t GetBaseDelta0_NEON(const int8x16_t p0, const int8x16_t q0) {
 463   const int8x16_t q0_p0 = vqsubq_s8(q0, p0);      // (q0-p0)
 464   const int8x16_t s1 = vqaddq_s8(q0_p0, q0_p0);   // 2 * (q0 - p0)
 465   const int8x16_t s2 = vqaddq_s8(q0_p0, s1);      // 3 * (q0 - p0)
 466   return s2;
 467 }
 468
 469 //------------------------------------------------------------------------------
 470
 471 static void ApplyFilter2NoFlip_NEON(const int8x16_t p0s, const int8x16_t q0s,
 472                                     const int8x16_t delta,
 473                                     int8x16_t* const op0,
 474                                     int8x16_t* const oq0) {
 475   const int8x16_t kCst3 = vdupq_n_s8(0x03);
 476   const int8x16_t kCst4 = vdupq_n_s8(0x04);
 477   const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
 478   const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
 479   const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
 480   const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
 481   *op0 = vqaddq_s8(p0s, delta3);
 482   *oq0 = vqsubq_s8(q0s, delta4);
 483 }
 484
 485 #if defined(WEBP_USE_INTRINSICS)
 486
 487 static void ApplyFilter2_NEON(const int8x16_t p0s, const int8x16_t q0s,
 488                               const int8x16_t delta,
 489                               uint8x16_t* const op0, uint8x16_t* const oq0) {
 490   const int8x16_t kCst3 = vdupq_n_s8(0x03);
 491   const int8x16_t kCst4 = vdupq_n_s8(0x04);
 492   const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
 493   const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
 494   const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
 495   const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
 496   const int8x16_t sp0 = vqaddq_s8(p0s, delta3);
 497   const int8x16_t sq0 = vqsubq_s8(q0s, delta4);
 498   *op0 = FlipSignBack_NEON(sp0);
 499   *oq0 = FlipSignBack_NEON(sq0);
 500 }
 501
 502 static void DoFilter2_NEON(const uint8x16_t p1, const uint8x16_t p0,
 503                            const uint8x16_t q0, const uint8x16_t q1,
 504                            const uint8x16_t mask,
 505                            uint8x16_t* const op0, uint8x16_t* const oq0) {
 506   const int8x16_t p1s = FlipSign_NEON(p1);
 507   const int8x16_t p0s = FlipSign_NEON(p0);
 508   const int8x16_t q0s = FlipSign_NEON(q0);
 509   const int8x16_t q1s = FlipSign_NEON(q1);
 510   const int8x16_t delta0 = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
 511   const int8x16_t delta1 = vandq_s8(delta0, vreinterpretq_s8_u8(mask));
 512   ApplyFilter2_NEON(p0s, q0s, delta1, op0, oq0);
 513 }
 514
 515 static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) {
 516   uint8x16_t p1, p0, q0, q1, op0, oq0;
 517   Load16x4_NEON(p, stride, &p1, &p0, &q0, &q1);
 518   {
 519     const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
 520     DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0);
 521   }
 522   Store16x2_NEON(op0, oq0, p, stride);
 523 }
 524
 525 static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) {
 526   uint8x16_t p1, p0, q0, q1, oq0, op0;
 527   Load4x16_NEON(p, stride, &p1, &p0, &q0, &q1);
 528   {
 529     const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
 530     DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0);
 531   }
 532   Store2x16_NEON(op0, oq0, p, stride);
 533 }
 534
 535 #else
 536
 537 // Load/Store vertical edge
 538 #define LOAD8x4(c1, c2, c3, c4, b1, b2, stride)                                \
 539   "vld4.8 {" #c1 "[0]," #c2 "[0]," #c3 "[0]," #c4 "[0]}," #b1 "," #stride "\n" \
 540   "vld4.8 {" #c1 "[1]," #c2 "[1]," #c3 "[1]," #c4 "[1]}," #b2 "," #stride "\n" \
 541   "vld4.8 {" #c1 "[2]," #c2 "[2]," #c3 "[2]," #c4 "[2]}," #b1 "," #stride "\n" \
 542   "vld4.8 {" #c1 "[3]," #c2 "[3]," #c3 "[3]," #c4 "[3]}," #b2 "," #stride "\n" \
 543   "vld4.8 {" #c1 "[4]," #c2 "[4]," #c3 "[4]," #c4 "[4]}," #b1 "," #stride "\n" \
 544   "vld4.8 {" #c1 "[5]," #c2 "[5]," #c3 "[5]," #c4 "[5]}," #b2 "," #stride "\n" \
 545   "vld4.8 {" #c1 "[6]," #c2 "[6]," #c3 "[6]," #c4 "[6]}," #b1 "," #stride "\n" \
 546   "vld4.8 {" #c1 "[7]," #c2 "[7]," #c3 "[7]," #c4 "[7]}," #b2 "," #stride "\n"
 547
 548 #define STORE8x2(c1, c2, p, stride)                                            \
 549   "vst2.8   {" #c1 "[0], " #c2 "[0]}," #p "," #stride " \n"                    \
 550   "vst2.8   {" #c1 "[1], " #c2 "[1]}," #p "," #stride " \n"                    \
 551   "vst2.8   {" #c1 "[2], " #c2 "[2]}," #p "," #stride " \n"                    \
 552   "vst2.8   {" #c1 "[3], " #c2 "[3]}," #p "," #stride " \n"                    \
 553   "vst2.8   {" #c1 "[4], " #c2 "[4]}," #p "," #stride " \n"                    \
 554   "vst2.8   {" #c1 "[5], " #c2 "[5]}," #p "," #stride " \n"                    \
 555   "vst2.8   {" #c1 "[6], " #c2 "[6]}," #p "," #stride " \n"                    \
 556   "vst2.8   {" #c1 "[7], " #c2 "[7]}," #p "," #stride " \n"
 557
 558 #define QRegs "q0", "q1", "q2", "q3",                                          \
 559               "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
 560
 561 #define FLIP_SIGN_BIT2(a, b, s)                                                \
 562   "veor     " #a "," #a "," #s "               \n"                             \
 563   "veor     " #b "," #b "," #s "               \n"                             \
 564
 565 #define FLIP_SIGN_BIT4(a, b, c, d, s)                                          \
 566   FLIP_SIGN_BIT2(a, b, s)                                                      \
 567   FLIP_SIGN_BIT2(c, d, s)                                                      \
 568
 569 #define NEEDS_FILTER(p1, p0, q0, q1, thresh, mask)                             \
 570   "vabd.u8    q15," #p0 "," #q0 "         \n"  /* abs(p0 - q0) */              \
 571   "vabd.u8    q14," #p1 "," #q1 "         \n"  /* abs(p1 - q1) */              \
 572   "vqadd.u8   q15, q15, q15               \n"  /* abs(p0 - q0) * 2 */          \
 573   "vshr.u8    q14, q14, #1                \n"  /* abs(p1 - q1) / 2 */          \
 574   "vqadd.u8   q15, q15, q14     \n"  /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
 575   "vdup.8     q14, " #thresh "            \n"                                  \
 576   "vcge.u8   " #mask ", q14, q15          \n"  /* mask <= thresh */
 577
 578 #define GET_BASE_DELTA(p1, p0, q0, q1, o)                                      \
 579   "vqsub.s8   q15," #q0 "," #p0 "         \n"  /* (q0 - p0) */                 \
 580   "vqsub.s8  " #o "," #p1 "," #q1 "       \n"  /* (p1 - q1) */                 \
 581   "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 1 * (p0 - q0) */ \
 582   "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 2 * (p0 - q0) */ \
 583   "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 3 * (p0 - q0) */
 584
 585 #define DO_SIMPLE_FILTER(p0, q0, fl)                                           \
 586   "vmov.i8    q15, #0x03                  \n"                                  \
 587   "vqadd.s8   q15, q15, " #fl "           \n"  /* filter1 = filter + 3 */      \
 588   "vshr.s8    q15, q15, #3                \n"  /* filter1 >> 3 */              \
 589   "vqadd.s8  " #p0 "," #p0 ", q15         \n"  /* p0 += filter1 */             \
 590                                                                                \
 591   "vmov.i8    q15, #0x04                  \n"                                  \
 592   "vqadd.s8   q15, q15, " #fl "           \n"  /* filter1 = filter + 4 */      \
 593   "vshr.s8    q15, q15, #3                \n"  /* filter2 >> 3 */              \
 594   "vqsub.s8  " #q0 "," #q0 ", q15         \n"  /* q0 -= filter2 */
 595
 596 // Applies filter on 2 pixels (p0 and q0)
 597 #define DO_FILTER2(p1, p0, q0, q1, thresh)                                     \
 598   NEEDS_FILTER(p1, p0, q0, q1, thresh, q9)     /* filter mask in q9 */         \
 599   "vmov.i8    q10, #0x80                  \n"  /* sign bit */                  \
 600   FLIP_SIGN_BIT4(p1, p0, q0, q1, q10)          /* convert to signed value */   \
 601   GET_BASE_DELTA(p1, p0, q0, q1, q11)          /* get filter level  */         \
 602   "vand       q9, q9, q11                 \n"  /* apply filter mask */         \
 603   DO_SIMPLE_FILTER(p0, q0, q9)                 /* apply filter */              \
 604   FLIP_SIGN_BIT2(p0, q0, q10)
 605
 606 static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) {
 607   __asm__ volatile (
 608     "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride
 609
 610     "vld1.u8    {q1}, [%[p]], %[stride]        \n"  // p1
 611     "vld1.u8    {q2}, [%[p]], %[stride]        \n"  // p0
 612     "vld1.u8    {q3}, [%[p]], %[stride]        \n"  // q0
 613     "vld1.u8    {q12}, [%[p]]                  \n"  // q1
 614
 615     DO_FILTER2(q1, q2, q3, q12, %[thresh])
 616
 617     "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride
 618
 619     "vst1.u8    {q2}, [%[p]], %[stride]        \n"  // store op0
 620     "vst1.u8    {q3}, [%[p]]                   \n"  // store oq0
 621     : [p] "+r"(p)
 622     : [stride] "r"(stride), [thresh] "r"(thresh)
 623     : "memory", QRegs
 624   );
 625 }
 626
 627 static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) {
 628   __asm__ volatile (
 629     "sub        r4, %[p], #2                   \n"  // base1 = p - 2
 630     "lsl        r6, %[stride], #1              \n"  // r6 = 2 * stride
 631     "add        r5, r4, %[stride]              \n"  // base2 = base1 + stride
 632
 633     LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)
 634     LOAD8x4(d24, d25, d26, d27, [r4], [r5], r6)
 635     "vswp       d3, d24                        \n"  // p1:q1 p0:q3
 636     "vswp       d5, d26                        \n"  // q0:q2 q1:q4
 637     "vswp       q2, q12                        \n"  // p1:q1 p0:q2 q0:q3 q1:q4
 638
 639     DO_FILTER2(q1, q2, q12, q13, %[thresh])
 640
 641     "sub        %[p], %[p], #1                 \n"  // p - 1
 642
 643     "vswp        d5, d24                       \n"
 644     STORE8x2(d4, d5, [%[p]], %[stride])
 645     STORE8x2(d24, d25, [%[p]], %[stride])
 646
 647     : [p] "+r"(p)
 648     : [stride] "r"(stride), [thresh] "r"(thresh)
 649     : "memory", "r4", "r5", "r6", QRegs
 650   );
 651 }
 652
 653 #undef LOAD8x4
 654 #undef STORE8x2
 655
 656 #endif    // WEBP_USE_INTRINSICS
 657
 658 static void SimpleVFilter16i_NEON(uint8_t* p, int stride, int thresh) {
 659   uint32_t k;
 660   for (k = 3; k != 0; --k) {
 661     p += 4 * stride;
 662     SimpleVFilter16_NEON(p, stride, thresh);
 663   }
 664 }
 665
 666 static void SimpleHFilter16i_NEON(uint8_t* p, int stride, int thresh) {
 667   uint32_t k;
 668   for (k = 3; k != 0; --k) {
 669     p += 4;
 670     SimpleHFilter16_NEON(p, stride, thresh);
 671   }
 672 }
 673
 674 //------------------------------------------------------------------------------
 675 // Complex In-loop filtering (Paragraph 15.3)
 676
 677 static uint8x16_t NeedsHev_NEON(const uint8x16_t p1, const uint8x16_t p0,
 678                                 const uint8x16_t q0, const uint8x16_t q1,
 679                                 int hev_thresh) {
 680   const uint8x16_t hev_thresh_v = vdupq_n_u8((uint8_t)hev_thresh);
 681   const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0);  // abs(p1 - p0)
 682   const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0);  // abs(q1 - q0)
 683   const uint8x16_t a_max = vmaxq_u8(a_p1_p0, a_q1_q0);
 684   const uint8x16_t mask = vcgtq_u8(a_max, hev_thresh_v);
 685   return mask;
 686 }
 687
 688 static uint8x16_t NeedsFilter2_NEON(const uint8x16_t p3, const uint8x16_t p2,
 689                                     const uint8x16_t p1, const uint8x16_t p0,
 690                                     const uint8x16_t q0, const uint8x16_t q1,
 691                                     const uint8x16_t q2, const uint8x16_t q3,
 692                                     int ithresh, int thresh) {
 693   const uint8x16_t ithresh_v = vdupq_n_u8((uint8_t)ithresh);
 694   const uint8x16_t a_p3_p2 = vabdq_u8(p3, p2);  // abs(p3 - p2)
 695   const uint8x16_t a_p2_p1 = vabdq_u8(p2, p1);  // abs(p2 - p1)
 696   const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0);  // abs(p1 - p0)
 697   const uint8x16_t a_q3_q2 = vabdq_u8(q3, q2);  // abs(q3 - q2)
 698   const uint8x16_t a_q2_q1 = vabdq_u8(q2, q1);  // abs(q2 - q1)
 699   const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0);  // abs(q1 - q0)
 700   const uint8x16_t max1 = vmaxq_u8(a_p3_p2, a_p2_p1);
 701   const uint8x16_t max2 = vmaxq_u8(a_p1_p0, a_q3_q2);
 702   const uint8x16_t max3 = vmaxq_u8(a_q2_q1, a_q1_q0);
 703   const uint8x16_t max12 = vmaxq_u8(max1, max2);
 704   const uint8x16_t max123 = vmaxq_u8(max12, max3);
 705   const uint8x16_t mask2 = vcgeq_u8(ithresh_v, max123);
 706   const uint8x16_t mask1 = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
 707   const uint8x16_t mask = vandq_u8(mask1, mask2);
 708   return mask;
 709 }
 710
 711 //  4-points filter
 712
 713 static void ApplyFilter4_NEON(
 714     const int8x16_t p1, const int8x16_t p0,
 715     const int8x16_t q0, const int8x16_t q1,
 716     const int8x16_t delta0,
 717     uint8x16_t* const op1, uint8x16_t* const op0,
 718     uint8x16_t* const oq0, uint8x16_t* const oq1) {
 719   const int8x16_t kCst3 = vdupq_n_s8(0x03);
 720   const int8x16_t kCst4 = vdupq_n_s8(0x04);
 721   const int8x16_t delta1 = vqaddq_s8(delta0, kCst4);
 722   const int8x16_t delta2 = vqaddq_s8(delta0, kCst3);
 723   const int8x16_t a1 = vshrq_n_s8(delta1, 3);
 724   const int8x16_t a2 = vshrq_n_s8(delta2, 3);
 725   const int8x16_t a3 = vrshrq_n_s8(a1, 1);   // a3 = (a1 + 1) >> 1
 726   *op0 = FlipSignBack_NEON(vqaddq_s8(p0, a2));  // clip(p0 + a2)
 727   *oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1));  // clip(q0 - a1)
 728   *op1 = FlipSignBack_NEON(vqaddq_s8(p1, a3));  // clip(p1 + a3)
 729   *oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a3));  // clip(q1 - a3)
 730 }
 731
 732 static void DoFilter4_NEON(
 733     const uint8x16_t p1, const uint8x16_t p0,
 734     const uint8x16_t q0, const uint8x16_t q1,
 735     const uint8x16_t mask, const uint8x16_t hev_mask,
 736     uint8x16_t* const op1, uint8x16_t* const op0,
 737     uint8x16_t* const oq0, uint8x16_t* const oq1) {
 738   // This is a fused version of DoFilter2() calling ApplyFilter2 directly
 739   const int8x16_t p1s = FlipSign_NEON(p1);
 740   int8x16_t p0s = FlipSign_NEON(p0);
 741   int8x16_t q0s = FlipSign_NEON(q0);
 742   const int8x16_t q1s = FlipSign_NEON(q1);
 743   const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
 744
 745   // do_filter2 part (simple loopfilter on pixels with hev)
 746   {
 747     const int8x16_t delta = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
 748     const int8x16_t simple_lf_delta =
 749         vandq_s8(delta, vreinterpretq_s8_u8(simple_lf_mask));
 750     ApplyFilter2NoFlip_NEON(p0s, q0s, simple_lf_delta, &p0s, &q0s);
 751   }
 752
 753   // do_filter4 part (complex loopfilter on pixels without hev)
 754   {
 755     const int8x16_t delta0 = GetBaseDelta0_NEON(p0s, q0s);
 756     // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
 757     const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
 758     const int8x16_t complex_lf_delta =
 759         vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
 760     ApplyFilter4_NEON(p1s, p0s, q0s, q1s, complex_lf_delta, op1, op0, oq0, oq1);
 761   }
 762 }
 763
 764 //  6-points filter
 765
 766 static void ApplyFilter6_NEON(
 767     const int8x16_t p2, const int8x16_t p1, const int8x16_t p0,
 768     const int8x16_t q0, const int8x16_t q1, const int8x16_t q2,
 769     const int8x16_t delta,
 770     uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
 771     uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
 772   // We have to compute: X = (9*a+63) >> 7, Y = (18*a+63)>>7, Z = (27*a+63) >> 7
 773   // Turns out, there's a common sub-expression S=9 * a - 1 that can be used
 774   // with the special vqrshrn_n_s16 rounding-shift-and-narrow instruction:
 775   //   X = (S + 64) >> 7, Y = (S + 32) >> 6, Z = (18 * a + S + 64) >> 7
 776   const int8x8_t delta_lo = vget_low_s8(delta);
 777   const int8x8_t delta_hi = vget_high_s8(delta);
 778   const int8x8_t kCst9 = vdup_n_s8(9);
 779   const int16x8_t kCstm1 = vdupq_n_s16(-1);
 780   const int8x8_t kCst18 = vdup_n_s8(18);
 781   const int16x8_t S_lo = vmlal_s8(kCstm1, kCst9, delta_lo);  // S = 9 * a - 1
 782   const int16x8_t S_hi = vmlal_s8(kCstm1, kCst9, delta_hi);
 783   const int16x8_t Z_lo = vmlal_s8(S_lo, kCst18, delta_lo);   // S + 18 * a
 784   const int16x8_t Z_hi = vmlal_s8(S_hi, kCst18, delta_hi);
 785   const int8x8_t a3_lo = vqrshrn_n_s16(S_lo, 7);   // (9 * a + 63) >> 7
 786   const int8x8_t a3_hi = vqrshrn_n_s16(S_hi, 7);
 787   const int8x8_t a2_lo = vqrshrn_n_s16(S_lo, 6);   // (9 * a + 31) >> 6
 788   const int8x8_t a2_hi = vqrshrn_n_s16(S_hi, 6);
 789   const int8x8_t a1_lo = vqrshrn_n_s16(Z_lo, 7);   // (27 * a + 63) >> 7
 790   const int8x8_t a1_hi = vqrshrn_n_s16(Z_hi, 7);
 791   const int8x16_t a1 = vcombine_s8(a1_lo, a1_hi);
 792   const int8x16_t a2 = vcombine_s8(a2_lo, a2_hi);
 793   const int8x16_t a3 = vcombine_s8(a3_lo, a3_hi);
 794
 795   *op0 = FlipSignBack_NEON(vqaddq_s8(p0, a1));  // clip(p0 + a1)
 796   *oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1));  // clip(q0 - q1)
 797   *oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a2));  // clip(q1 - a2)
 798   *op1 = FlipSignBack_NEON(vqaddq_s8(p1, a2));  // clip(p1 + a2)
 799   *oq2 = FlipSignBack_NEON(vqsubq_s8(q2, a3));  // clip(q2 - a3)
 800   *op2 = FlipSignBack_NEON(vqaddq_s8(p2, a3));  // clip(p2 + a3)
 801 }
 802
 803 static void DoFilter6_NEON(
 804     const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
 805     const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
 806     const uint8x16_t mask, const uint8x16_t hev_mask,
 807     uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
 808     uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
 809   // This is a fused version of DoFilter2() calling ApplyFilter2 directly
 810   const int8x16_t p2s = FlipSign_NEON(p2);
 811   const int8x16_t p1s = FlipSign_NEON(p1);
 812   int8x16_t p0s = FlipSign_NEON(p0);
 813   int8x16_t q0s = FlipSign_NEON(q0);
 814   const int8x16_t q1s = FlipSign_NEON(q1);
 815   const int8x16_t q2s = FlipSign_NEON(q2);
 816   const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
 817   const int8x16_t delta0 = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
 818
 819   // do_filter2 part (simple loopfilter on pixels with hev)
 820   {
 821     const int8x16_t simple_lf_delta =
 822         vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask));
 823     ApplyFilter2NoFlip_NEON(p0s, q0s, simple_lf_delta, &p0s, &q0s);
 824   }
 825
 826   // do_filter6 part (complex loopfilter on pixels without hev)
 827   {
 828     // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
 829     const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
 830     const int8x16_t complex_lf_delta =
 831         vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
 832     ApplyFilter6_NEON(p2s, p1s, p0s, q0s, q1s, q2s, complex_lf_delta,
 833                       op2, op1, op0, oq0, oq1, oq2);
 834   }
 835 }
 836
 837 // on macroblock edges
 838
 839 static void VFilter16_NEON(uint8_t* p, int stride,
 840                            int thresh, int ithresh, int hev_thresh) {
 841   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
 842   Load16x8_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
 843   {
 844     const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
 845                                               ithresh, thresh);
 846     const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
 847     uint8x16_t op2, op1, op0, oq0, oq1, oq2;
 848     DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
 849                    &op2, &op1, &op0, &oq0, &oq1, &oq2);
 850     Store16x2_NEON(op2, op1, p - 2 * stride, stride);
 851     Store16x2_NEON(op0, oq0, p + 0 * stride, stride);
 852     Store16x2_NEON(oq1, oq2, p + 2 * stride, stride);
 853   }
 854 }
 855
 856 static void HFilter16_NEON(uint8_t* p, int stride,
 857                            int thresh, int ithresh, int hev_thresh) {
 858   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
 859   Load8x16_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
 860   {
 861     const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
 862                                               ithresh, thresh);
 863     const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
 864     uint8x16_t op2, op1, op0, oq0, oq1, oq2;
 865     DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
 866                    &op2, &op1, &op0, &oq0, &oq1, &oq2);
 867     Store2x16_NEON(op2, op1, p - 2, stride);
 868     Store2x16_NEON(op0, oq0, p + 0, stride);
 869     Store2x16_NEON(oq1, oq2, p + 2, stride);
 870   }
 871 }
 872
 873 // on three inner edges
 874 static void VFilter16i_NEON(uint8_t* p, int stride,
 875                             int thresh, int ithresh, int hev_thresh) {
 876   uint32_t k;
 877   uint8x16_t p3, p2, p1, p0;
 878   Load16x4_NEON(p + 2  * stride, stride, &p3, &p2, &p1, &p0);
 879   for (k = 3; k != 0; --k) {
 880     uint8x16_t q0, q1, q2, q3;
 881     p += 4 * stride;
 882 #ifdef __TIZEN__
 883         usleep(1);
 884 #endif
 885     Load16x4_NEON(p + 2  * stride, stride, &q0, &q1, &q2, &q3);
 886     {
 887       const uint8x16_t mask =
 888           NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
 889       const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
 890       // p3 and p2 are not just temporary variables here: they will be
 891       // re-used for next span. And q2/q3 will become p1/p0 accordingly.
 892       DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
 893       Store16x4_NEON(p1, p0, p3, p2, p, stride);
 894       p1 = q2;
 895       p0 = q3;
 896     }
 897   }
 898 }
 899
 900 #if !defined(WORK_AROUND_GCC)
 901 static void HFilter16i_NEON(uint8_t* p, int stride,
 902                             int thresh, int ithresh, int hev_thresh) {
 903   uint32_t k;
 904   uint8x16_t p3, p2, p1, p0;
 905   Load4x16_NEON(p + 2, stride, &p3, &p2, &p1, &p0);
 906   for (k = 3; k != 0; --k) {
 907     uint8x16_t q0, q1, q2, q3;
 908     p += 4;
 909     Load4x16_NEON(p + 2, stride, &q0, &q1, &q2, &q3);
 910     {
 911       const uint8x16_t mask =
 912           NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
 913       const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
 914       DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
 915       Store4x16_NEON(p1, p0, p3, p2, p, stride);
 916       p1 = q2;
 917       p0 = q3;
 918     }
 919   }
 920 }
 921 #endif  // !WORK_AROUND_GCC
 922
 923 // 8-pixels wide variant, for chroma filtering
 924 static void VFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
 925                           int thresh, int ithresh, int hev_thresh) {
 926   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
 927   Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
 928   {
 929     const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
 930                                               ithresh, thresh);
 931     const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
 932     uint8x16_t op2, op1, op0, oq0, oq1, oq2;
 933     DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
 934                    &op2, &op1, &op0, &oq0, &oq1, &oq2);
 935     Store8x2x2_NEON(op2, op1, u - 2 * stride, v - 2 * stride, stride);
 936     Store8x2x2_NEON(op0, oq0, u + 0 * stride, v + 0 * stride, stride);
 937     Store8x2x2_NEON(oq1, oq2, u + 2 * stride, v + 2 * stride, stride);
 938   }
 939 }
 940 static void VFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
 941                            int thresh, int ithresh, int hev_thresh) {
 942   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
 943   u += 4 * stride;
 944   v += 4 * stride;
 945   Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
 946   {
 947     const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
 948                                               ithresh, thresh);
 949     const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
 950     uint8x16_t op1, op0, oq0, oq1;
 951     DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
 952     Store8x4x2_NEON(op1, op0, oq0, oq1, u, v, stride);
 953   }
 954 }
 955
 956 #if !defined(WORK_AROUND_GCC)
 957 static void HFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
 958                           int thresh, int ithresh, int hev_thresh) {
 959   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
 960   Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
 961   {
 962     const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
 963                                               ithresh, thresh);
 964     const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
 965     uint8x16_t op2, op1, op0, oq0, oq1, oq2;
 966     DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
 967                    &op2, &op1, &op0, &oq0, &oq1, &oq2);
 968     Store6x8x2_NEON(op2, op1, op0, oq0, oq1, oq2, u, v, stride);
 969   }
 970 }
 971
 972 static void HFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
 973                            int thresh, int ithresh, int hev_thresh) {
 974   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
 975   u += 4;
 976   v += 4;
 977   Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
 978   {
 979     const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
 980                                               ithresh, thresh);
 981     const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
 982     uint8x16_t op1, op0, oq0, oq1;
 983     DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
 984     Store4x8x2_NEON(op1, op0, oq0, oq1, u, v, stride);
 985   }
 986 }
 987 #endif  // !WORK_AROUND_GCC
 988
 989 //-----------------------------------------------------------------------------
 990 // Inverse transforms (Paragraph 14.4)
 991
 992 // Technically these are unsigned but vqdmulh is only available in signed.
 993 // vqdmulh returns high half (effectively >> 16) but also doubles the value,
 994 // changing the >> 16 to >> 15 and requiring an additional >> 1.
 995 // We use this to our advantage with kC2. The canonical value is 35468.
 996 // However, the high bit is set so treating it as signed will give incorrect
 997 // results. We avoid this by down shifting by 1 here to clear the highest bit.
 998 // Combined with the doubling effect of vqdmulh we get >> 16.
 999 // This can not be applied to kC1 because the lowest bit is set. Down shifting
1000 // the constant would reduce precision.
1001
1002 // libwebp uses a trick to avoid some extra addition that libvpx does.
1003 // Instead of:
1004 // temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
1005 // libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the
1006 // same issue with kC1 and vqdmulh that we work around by down shifting kC2
1007
1008 static const int16_t kC1 = 20091;
1009 static const int16_t kC2 = 17734;  // half of kC2, actually. See comment above.
1010
1011 #if defined(WEBP_USE_INTRINSICS)
1012 static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
1013                                           const int16x8_t in1,
1014                                           int16x8x2_t* const out) {
1015   // a0 a1 a2 a3 | b0 b1 b2 b3   => a0 b0 c0 d0 | a1 b1 c1 d1
1016   // c0 c1 c2 c3 | d0 d1 d2 d3      a2 b2 c2 d2 | a3 b3 c3 d3
1017   const int16x8x2_t tmp0 = vzipq_s16(in0, in1);   // a0 c0 a1 c1 a2 c2 ...
1018                                                   // b0 d0 b1 d1 b2 d2 ...
1019   *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
1020 }
1021
1022 static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
1023   // {rows} = in0 | in4
1024   //          in8 | in12
1025   // B1 = in4 | in12
1026   const int16x8_t B1 =
1027       vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
1028   // C0 = kC1 * in4 | kC1 * in12
1029   // C1 = kC2 * in4 | kC2 * in12
1030   const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
1031   const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
1032   const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
1033                                 vget_low_s16(rows->val[1]));   // in0 + in8
1034   const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
1035                                 vget_low_s16(rows->val[1]));   // in0 - in8
1036   // c = kC2 * in4 - kC1 * in12
1037   // d = kC1 * in4 + kC2 * in12
1038   const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
1039   const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
1040   const int16x8_t D0 = vcombine_s16(a, b);      // D0 = a | b
1041   const int16x8_t D1 = vcombine_s16(d, c);      // D1 = d | c
1042   const int16x8_t E0 = vqaddq_s16(D0, D1);      // a+d | b+c
1043   const int16x8_t E_tmp = vqsubq_s16(D0, D1);   // a-d | b-c
1044   const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
1045   Transpose8x2_NEON(E0, E1, rows);
1046 }
1047
1048 static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
1049   int16x8x2_t rows;
1050   INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
1051   TransformPass_NEON(&rows);
1052   TransformPass_NEON(&rows);
1053   Add4x4_NEON(rows.val[0], rows.val[1], dst);
1054 }
1055
1056 #else
1057
1058 static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
1059   const int kBPS = BPS;
1060   // kC1, kC2. Padded because vld1.16 loads 8 bytes
1061   const int16_t constants[4] = { kC1, kC2, 0, 0 };
1062   /* Adapted from libvpx: vp8/common/arm/neon/shortidct4x4llm_neon.asm */
1063   __asm__ volatile (
1064     "vld1.16         {q1, q2}, [%[in]]           \n"
1065     "vld1.16         {d0}, [%[constants]]        \n"
1066
1067     /* d2: in[0]
1068      * d3: in[8]
1069      * d4: in[4]
1070      * d5: in[12]
1071      */
1072     "vswp            d3, d4                      \n"
1073
1074     /* q8 = {in[4], in[12]} * kC1 * 2 >> 16
1075      * q9 = {in[4], in[12]} * kC2 >> 16
1076      */
1077     "vqdmulh.s16     q8, q2, d0[0]               \n"
1078     "vqdmulh.s16     q9, q2, d0[1]               \n"
1079
1080     /* d22 = a = in[0] + in[8]
1081      * d23 = b = in[0] - in[8]
1082      */
1083     "vqadd.s16       d22, d2, d3                 \n"
1084     "vqsub.s16       d23, d2, d3                 \n"
1085
1086     /* The multiplication should be x * kC1 >> 16
1087      * However, with vqdmulh we get x * kC1 * 2 >> 16
1088      * (multiply, double, return high half)
1089      * We avoided this in kC2 by pre-shifting the constant.
1090      * q8 = in[4]/[12] * kC1 >> 16
1091      */
1092     "vshr.s16        q8, q8, #1                  \n"
1093
1094     /* Add {in[4], in[12]} back after the multiplication. This is handled by
1095      * adding 1 << 16 to kC1 in the libwebp C code.
1096      */
1097     "vqadd.s16       q8, q2, q8                  \n"
1098
1099     /* d20 = c = in[4]*kC2 - in[12]*kC1
1100      * d21 = d = in[4]*kC1 + in[12]*kC2
1101      */
1102     "vqsub.s16       d20, d18, d17               \n"
1103     "vqadd.s16       d21, d19, d16               \n"
1104
1105     /* d2 = tmp[0] = a + d
1106      * d3 = tmp[1] = b + c
1107      * d4 = tmp[2] = b - c
1108      * d5 = tmp[3] = a - d
1109      */
1110     "vqadd.s16       d2, d22, d21                \n"
1111     "vqadd.s16       d3, d23, d20                \n"
1112     "vqsub.s16       d4, d23, d20                \n"
1113     "vqsub.s16       d5, d22, d21                \n"
1114
1115     "vzip.16         q1, q2                      \n"
1116     "vzip.16         q1, q2                      \n"
1117
1118     "vswp            d3, d4                      \n"
1119
1120     /* q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
1121      * q9 = {tmp[4], tmp[12]} * kC2 >> 16
1122      */
1123     "vqdmulh.s16     q8, q2, d0[0]               \n"
1124     "vqdmulh.s16     q9, q2, d0[1]               \n"
1125
1126     /* d22 = a = tmp[0] + tmp[8]
1127      * d23 = b = tmp[0] - tmp[8]
1128      */
1129     "vqadd.s16       d22, d2, d3                 \n"
1130     "vqsub.s16       d23, d2, d3                 \n"
1131
1132     /* See long winded explanations prior */
1133     "vshr.s16        q8, q8, #1                  \n"
1134     "vqadd.s16       q8, q2, q8                  \n"
1135
1136     /* d20 = c = in[4]*kC2 - in[12]*kC1
1137      * d21 = d = in[4]*kC1 + in[12]*kC2
1138      */
1139     "vqsub.s16       d20, d18, d17               \n"
1140     "vqadd.s16       d21, d19, d16               \n"
1141
1142     /* d2 = tmp[0] = a + d
1143      * d3 = tmp[1] = b + c
1144      * d4 = tmp[2] = b - c
1145      * d5 = tmp[3] = a - d
1146      */
1147     "vqadd.s16       d2, d22, d21                \n"
1148     "vqadd.s16       d3, d23, d20                \n"
1149     "vqsub.s16       d4, d23, d20                \n"
1150     "vqsub.s16       d5, d22, d21                \n"
1151
1152     "vld1.32         d6[0], [%[dst]], %[kBPS]    \n"
1153     "vld1.32         d6[1], [%[dst]], %[kBPS]    \n"
1154     "vld1.32         d7[0], [%[dst]], %[kBPS]    \n"
1155     "vld1.32         d7[1], [%[dst]], %[kBPS]    \n"
1156
1157     "sub         %[dst], %[dst], %[kBPS], lsl #2 \n"
1158
1159     /* (val) + 4 >> 3 */
1160     "vrshr.s16       d2, d2, #3                  \n"
1161     "vrshr.s16       d3, d3, #3                  \n"
1162     "vrshr.s16       d4, d4, #3                  \n"
1163     "vrshr.s16       d5, d5, #3                  \n"
1164
1165     "vzip.16         q1, q2                      \n"
1166     "vzip.16         q1, q2                      \n"
1167
1168     /* Must accumulate before saturating */
1169     "vmovl.u8        q8, d6                      \n"
1170     "vmovl.u8        q9, d7                      \n"
1171
1172     "vqadd.s16       q1, q1, q8                  \n"
1173     "vqadd.s16       q2, q2, q9                  \n"
1174
1175     "vqmovun.s16     d0, q1                      \n"
1176     "vqmovun.s16     d1, q2                      \n"
1177
1178     "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
1179     "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
1180     "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
1181     "vst1.32         d1[1], [%[dst]]             \n"
1182
1183     : [in] "+r"(in), [dst] "+r"(dst)  /* modified registers */
1184     : [kBPS] "r"(kBPS), [constants] "r"(constants)  /* constants */
1185     : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11"  /* clobbered */
1186   );
1187 }
1188
1189 #endif    // WEBP_USE_INTRINSICS
1190
1191 static void TransformTwo_NEON(const int16_t* in, uint8_t* dst, int do_two) {
1192   TransformOne_NEON(in, dst);
1193   if (do_two) {
1194     TransformOne_NEON(in + 16, dst + 4);
1195   }
1196 }
1197
1198 static void TransformDC_NEON(const int16_t* in, uint8_t* dst) {
1199   const int16x8_t DC = vdupq_n_s16(in[0]);
1200   Add4x4_NEON(DC, DC, dst);
1201 }
1202
1203 //------------------------------------------------------------------------------
1204
1205 #define STORE_WHT(dst, col, rows) do {                  \
1206   *dst = vgetq_lane_s32(rows.val[0], col); (dst) += 16; \
1207   *dst = vgetq_lane_s32(rows.val[1], col); (dst) += 16; \
1208   *dst = vgetq_lane_s32(rows.val[2], col); (dst) += 16; \
1209   *dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16; \
1210 } while (0)
1211
1212 static void TransformWHT_NEON(const int16_t* in, int16_t* out) {
1213   int32x4x4_t tmp;
1214
1215   {
1216     // Load the source.
1217     const int16x4_t in00_03 = vld1_s16(in + 0);
1218     const int16x4_t in04_07 = vld1_s16(in + 4);
1219     const int16x4_t in08_11 = vld1_s16(in + 8);
1220     const int16x4_t in12_15 = vld1_s16(in + 12);
1221     const int32x4_t a0 = vaddl_s16(in00_03, in12_15);  // in[0..3] + in[12..15]
1222     const int32x4_t a1 = vaddl_s16(in04_07, in08_11);  // in[4..7] + in[8..11]
1223     const int32x4_t a2 = vsubl_s16(in04_07, in08_11);  // in[4..7] - in[8..11]
1224     const int32x4_t a3 = vsubl_s16(in00_03, in12_15);  // in[0..3] - in[12..15]
1225     tmp.val[0] = vaddq_s32(a0, a1);
1226     tmp.val[1] = vaddq_s32(a3, a2);
1227     tmp.val[2] = vsubq_s32(a0, a1);
1228     tmp.val[3] = vsubq_s32(a3, a2);
1229     // Arrange the temporary results column-wise.
1230     tmp = Transpose4x4_NEON(tmp);
1231   }
1232
1233   {
1234     const int32x4_t kCst3 = vdupq_n_s32(3);
1235     const int32x4_t dc = vaddq_s32(tmp.val[0], kCst3);  // add rounder
1236     const int32x4_t a0 = vaddq_s32(dc, tmp.val[3]);
1237     const int32x4_t a1 = vaddq_s32(tmp.val[1], tmp.val[2]);
1238     const int32x4_t a2 = vsubq_s32(tmp.val[1], tmp.val[2]);
1239     const int32x4_t a3 = vsubq_s32(dc, tmp.val[3]);
1240
1241     tmp.val[0] = vaddq_s32(a0, a1);
1242     tmp.val[1] = vaddq_s32(a3, a2);
1243     tmp.val[2] = vsubq_s32(a0, a1);
1244     tmp.val[3] = vsubq_s32(a3, a2);
1245
1246     // right shift the results by 3.
1247     tmp.val[0] = vshrq_n_s32(tmp.val[0], 3);
1248     tmp.val[1] = vshrq_n_s32(tmp.val[1], 3);
1249     tmp.val[2] = vshrq_n_s32(tmp.val[2], 3);
1250     tmp.val[3] = vshrq_n_s32(tmp.val[3], 3);
1251
1252     STORE_WHT(out, 0, tmp);
1253     STORE_WHT(out, 1, tmp);
1254     STORE_WHT(out, 2, tmp);
1255     STORE_WHT(out, 3, tmp);
1256   }
1257 }
1258
1259 #undef STORE_WHT
1260
1261 //------------------------------------------------------------------------------
1262
1263 #define MUL(a, b) (((a) * (b)) >> 16)
1264 static void TransformAC3_NEON(const int16_t* in, uint8_t* dst) {
1265   static const int kC1_full = 20091 + (1 << 16);
1266   static const int kC2_full = 35468;
1267   const int16x4_t A = vld1_dup_s16(in);
1268   const int16x4_t c4 = vdup_n_s16(MUL(in[4], kC2_full));
1269   const int16x4_t d4 = vdup_n_s16(MUL(in[4], kC1_full));
1270   const int c1 = MUL(in[1], kC2_full);
1271   const int d1 = MUL(in[1], kC1_full);
1272   const uint64_t cd = (uint64_t)( d1 & 0xffff) <<  0 |
1273                       (uint64_t)( c1 & 0xffff) << 16 |
1274                       (uint64_t)(-c1 & 0xffff) << 32 |
1275                       (uint64_t)(-d1 & 0xffff) << 48;
1276   const int16x4_t CD = vcreate_s16(cd);
1277   const int16x4_t B = vqadd_s16(A, CD);
1278   const int16x8_t m0_m1 = vcombine_s16(vqadd_s16(B, d4), vqadd_s16(B, c4));
1279   const int16x8_t m2_m3 = vcombine_s16(vqsub_s16(B, c4), vqsub_s16(B, d4));
1280   Add4x4_NEON(m0_m1, m2_m3, dst);
1281 }
1282 #undef MUL
1283
1284 //------------------------------------------------------------------------------
1285 // 4x4
1286
1287 static void DC4_NEON(uint8_t* dst) {    // DC
1288   const uint8x8_t A = vld1_u8(dst - BPS);  // top row
1289   const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
1290   const uint16x4_t p1 = vpadd_u16(p0, p0);
1291   const uint8x8_t L0 = vld1_u8(dst + 0 * BPS - 1);
1292   const uint8x8_t L1 = vld1_u8(dst + 1 * BPS - 1);
1293   const uint8x8_t L2 = vld1_u8(dst + 2 * BPS - 1);
1294   const uint8x8_t L3 = vld1_u8(dst + 3 * BPS - 1);
1295   const uint16x8_t s0 = vaddl_u8(L0, L1);
1296   const uint16x8_t s1 = vaddl_u8(L2, L3);
1297   const uint16x8_t s01 = vaddq_u16(s0, s1);
1298   const uint16x8_t sum = vaddq_u16(s01, vcombine_u16(p1, p1));
1299   const uint8x8_t dc0 = vrshrn_n_u16(sum, 3);  // (sum + 4) >> 3
1300   const uint8x8_t dc = vdup_lane_u8(dc0, 0);
1301   int i;
1302   for (i = 0; i < 4; ++i) {
1303     vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc), 0);
1304   }
1305 }
1306
1307 // TrueMotion (4x4 + 8x8)
1308 static WEBP_INLINE void TrueMotion_NEON(uint8_t* dst, int size) {
1309   const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
1310   const uint8x8_t T = vld1_u8(dst - BPS);  // top row 'A[0..3]'
1311   const int16x8_t d = vreinterpretq_s16_u16(vsubl_u8(T, TL));  // A[c] - A[-1]
1312   int y;
1313   for (y = 0; y < size; y += 4) {
1314     // left edge
1315     const int16x8_t L0 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 0 * BPS - 1));
1316     const int16x8_t L1 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 1 * BPS - 1));
1317     const int16x8_t L2 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 2 * BPS - 1));
1318     const int16x8_t L3 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 3 * BPS - 1));
1319     const int16x8_t r0 = vaddq_s16(L0, d);  // L[r] + A[c] - A[-1]
1320     const int16x8_t r1 = vaddq_s16(L1, d);
1321     const int16x8_t r2 = vaddq_s16(L2, d);
1322     const int16x8_t r3 = vaddq_s16(L3, d);
1323     // Saturate and store the result.
1324     const uint32x2_t r0_u32 = vreinterpret_u32_u8(vqmovun_s16(r0));
1325     const uint32x2_t r1_u32 = vreinterpret_u32_u8(vqmovun_s16(r1));
1326     const uint32x2_t r2_u32 = vreinterpret_u32_u8(vqmovun_s16(r2));
1327     const uint32x2_t r3_u32 = vreinterpret_u32_u8(vqmovun_s16(r3));
1328     if (size == 4) {
1329       vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0_u32, 0);
1330       vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1_u32, 0);
1331       vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2_u32, 0);
1332       vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3_u32, 0);
1333     } else {
1334       vst1_u32((uint32_t*)(dst + 0 * BPS), r0_u32);
1335       vst1_u32((uint32_t*)(dst + 1 * BPS), r1_u32);
1336       vst1_u32((uint32_t*)(dst + 2 * BPS), r2_u32);
1337       vst1_u32((uint32_t*)(dst + 3 * BPS), r3_u32);
1338     }
1339     dst += 4 * BPS;
1340   }
1341 }
1342
1343 static void TM4_NEON(uint8_t* dst) { TrueMotion_NEON(dst, 4); }
1344
1345 static void VE4_NEON(uint8_t* dst) {    // vertical
1346   // NB: avoid vld1_u64 here as an alignment hint may be added -> SIGBUS.
1347   const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(dst - BPS - 1));  // top row
1348   const uint64x1_t A1 = vshr_n_u64(A0, 8);
1349   const uint64x1_t A2 = vshr_n_u64(A0, 16);
1350   const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);
1351   const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
1352   const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
1353   const uint8x8_t b = vhadd_u8(ABCDEFGH, CDEFGH00);
1354   const uint8x8_t avg = vrhadd_u8(b, BCDEFGH0);
1355   int i;
1356   for (i = 0; i < 4; ++i) {
1357     vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(avg), 0);
1358   }
1359 }
1360
1361 static void RD4_NEON(uint8_t* dst) {   // Down-right
1362   const uint8x8_t XABCD_u8 = vld1_u8(dst - BPS - 1);
1363   const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
1364   const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
1365   const uint32_t I = dst[-1 + 0 * BPS];
1366   const uint32_t J = dst[-1 + 1 * BPS];
1367   const uint32_t K = dst[-1 + 2 * BPS];
1368   const uint32_t L = dst[-1 + 3 * BPS];
1369   const uint64x1_t LKJI____ =
1370       vcreate_u64((uint64_t)L | (K << 8) | (J << 16) | (I << 24));
1371   const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
1372   const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
1373   const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
1374   const uint8_t D = vget_lane_u8(XABCD_u8, 4);
1375   const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
1376   const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
1377   const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
1378   const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
1379   const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
1380   const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
1381   const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
1382   const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
1383   const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
1384   vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
1385   vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
1386   vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
1387   vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
1388 }
1389
1390 static void LD4_NEON(uint8_t* dst) {    // Down-left
1391   // Note using the same shift trick as VE4() is slower here.
1392   const uint8x8_t ABCDEFGH = vld1_u8(dst - BPS + 0);
1393   const uint8x8_t BCDEFGH0 = vld1_u8(dst - BPS + 1);
1394   const uint8x8_t CDEFGH00 = vld1_u8(dst - BPS + 2);
1395   const uint8x8_t CDEFGHH0 = vset_lane_u8(dst[-BPS + 7], CDEFGH00, 6);
1396   const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGHH0);
1397   const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);
1398   const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
1399   const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
1400   const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
1401   const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
1402   const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
1403   vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
1404   vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
1405   vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
1406   vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
1407 }
1408
1409 //------------------------------------------------------------------------------
1410 // Chroma
1411
1412 static void VE8uv_NEON(uint8_t* dst) {    // vertical
1413   const uint8x8_t top = vld1_u8(dst - BPS);
1414   int j;
1415   for (j = 0; j < 8; ++j) {
1416     vst1_u8(dst + j * BPS, top);
1417   }
1418 }
1419
1420 static void HE8uv_NEON(uint8_t* dst) {    // horizontal
1421   int j;
1422   for (j = 0; j < 8; ++j) {
1423     const uint8x8_t left = vld1_dup_u8(dst - 1);
1424     vst1_u8(dst, left);
1425     dst += BPS;
1426   }
1427 }
1428
1429 static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) {
1430   uint16x8_t sum_top;
1431   uint16x8_t sum_left;
1432   uint8x8_t dc0;
1433
1434   if (do_top) {
1435     const uint8x8_t A = vld1_u8(dst - BPS);  // top row
1436 #if WEBP_AARCH64
1437     const uint16_t p2 = vaddlv_u8(A);
1438     sum_top = vdupq_n_u16(p2);
1439 #else
1440     const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
1441     const uint16x4_t p1 = vpadd_u16(p0, p0);
1442     const uint16x4_t p2 = vpadd_u16(p1, p1);
1443     sum_top = vcombine_u16(p2, p2);
1444 #endif
1445   }
1446
1447   if (do_left) {
1448     const uint8x8_t L0 = vld1_u8(dst + 0 * BPS - 1);
1449     const uint8x8_t L1 = vld1_u8(dst + 1 * BPS - 1);
1450     const uint8x8_t L2 = vld1_u8(dst + 2 * BPS - 1);
1451     const uint8x8_t L3 = vld1_u8(dst + 3 * BPS - 1);
1452     const uint8x8_t L4 = vld1_u8(dst + 4 * BPS - 1);
1453     const uint8x8_t L5 = vld1_u8(dst + 5 * BPS - 1);
1454     const uint8x8_t L6 = vld1_u8(dst + 6 * BPS - 1);
1455     const uint8x8_t L7 = vld1_u8(dst + 7 * BPS - 1);
1456     const uint16x8_t s0 = vaddl_u8(L0, L1);
1457     const uint16x8_t s1 = vaddl_u8(L2, L3);
1458     const uint16x8_t s2 = vaddl_u8(L4, L5);
1459     const uint16x8_t s3 = vaddl_u8(L6, L7);
1460     const uint16x8_t s01 = vaddq_u16(s0, s1);
1461     const uint16x8_t s23 = vaddq_u16(s2, s3);
1462     sum_left = vaddq_u16(s01, s23);
1463   }
1464
1465   if (do_top && do_left) {
1466     const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
1467     dc0 = vrshrn_n_u16(sum, 4);
1468   } else if (do_top) {
1469     dc0 = vrshrn_n_u16(sum_top, 3);
1470   } else if (do_left) {
1471     dc0 = vrshrn_n_u16(sum_left, 3);
1472   } else {
1473     dc0 = vdup_n_u8(0x80);
1474   }
1475
1476   {
1477     const uint8x8_t dc = vdup_lane_u8(dc0, 0);
1478     int i;
1479     for (i = 0; i < 8; ++i) {
1480       vst1_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc));
1481     }
1482   }
1483 }
1484
1485 static void DC8uv_NEON(uint8_t* dst) { DC8_NEON(dst, 1, 1); }
1486 static void DC8uvNoTop_NEON(uint8_t* dst) { DC8_NEON(dst, 0, 1); }
1487 static void DC8uvNoLeft_NEON(uint8_t* dst) { DC8_NEON(dst, 1, 0); }
1488 static void DC8uvNoTopLeft_NEON(uint8_t* dst) { DC8_NEON(dst, 0, 0); }
1489
1490 static void TM8uv_NEON(uint8_t* dst) { TrueMotion_NEON(dst, 8); }
1491
1492 //------------------------------------------------------------------------------
1493 // 16x16
1494
1495 static void VE16_NEON(uint8_t* dst) {     // vertical
1496   const uint8x16_t top = vld1q_u8(dst - BPS);
1497   int j;
1498   for (j = 0; j < 16; ++j) {
1499     vst1q_u8(dst + j * BPS, top);
1500   }
1501 }
1502
1503 static void HE16_NEON(uint8_t* dst) {     // horizontal
1504   int j;
1505   for (j = 0; j < 16; ++j) {
1506     const uint8x16_t left = vld1q_dup_u8(dst - 1);
1507     vst1q_u8(dst, left);
1508     dst += BPS;
1509   }
1510 }
1511
1512 static WEBP_INLINE void DC16_NEON(uint8_t* dst, int do_top, int do_left) {
1513   uint16x8_t sum_top;
1514   uint16x8_t sum_left;
1515   uint8x8_t dc0;
1516
1517   if (do_top) {
1518     const uint8x16_t A = vld1q_u8(dst - BPS);  // top row
1519 #if WEBP_AARCH64
1520     const uint16_t p3 = vaddlvq_u8(A);
1521     sum_top = vdupq_n_u16(p3);
1522 #else
1523     const uint16x8_t p0 = vpaddlq_u8(A);  // cascading summation of the top
1524     const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
1525     const uint16x4_t p2 = vpadd_u16(p1, p1);
1526     const uint16x4_t p3 = vpadd_u16(p2, p2);
1527     sum_top = vcombine_u16(p3, p3);
1528 #endif
1529   }
1530
1531   if (do_left) {
1532     int i;
1533     sum_left = vdupq_n_u16(0);
1534     for (i = 0; i < 16; i += 8) {
1535       const uint8x8_t L0 = vld1_u8(dst + (i + 0) * BPS - 1);
1536       const uint8x8_t L1 = vld1_u8(dst + (i + 1) * BPS - 1);
1537       const uint8x8_t L2 = vld1_u8(dst + (i + 2) * BPS - 1);
1538       const uint8x8_t L3 = vld1_u8(dst + (i + 3) * BPS - 1);
1539       const uint8x8_t L4 = vld1_u8(dst + (i + 4) * BPS - 1);
1540       const uint8x8_t L5 = vld1_u8(dst + (i + 5) * BPS - 1);
1541       const uint8x8_t L6 = vld1_u8(dst + (i + 6) * BPS - 1);
1542       const uint8x8_t L7 = vld1_u8(dst + (i + 7) * BPS - 1);
1543       const uint16x8_t s0 = vaddl_u8(L0, L1);
1544       const uint16x8_t s1 = vaddl_u8(L2, L3);
1545       const uint16x8_t s2 = vaddl_u8(L4, L5);
1546       const uint16x8_t s3 = vaddl_u8(L6, L7);
1547       const uint16x8_t s01 = vaddq_u16(s0, s1);
1548       const uint16x8_t s23 = vaddq_u16(s2, s3);
1549       const uint16x8_t sum = vaddq_u16(s01, s23);
1550       sum_left = vaddq_u16(sum_left, sum);
1551     }
1552   }
1553
1554   if (do_top && do_left) {
1555     const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
1556     dc0 = vrshrn_n_u16(sum, 5);
1557   } else if (do_top) {
1558     dc0 = vrshrn_n_u16(sum_top, 4);
1559   } else if (do_left) {
1560     dc0 = vrshrn_n_u16(sum_left, 4);
1561   } else {
1562     dc0 = vdup_n_u8(0x80);
1563   }
1564
1565   {
1566     const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
1567     int i;
1568     for (i = 0; i < 16; ++i) {
1569       vst1q_u8(dst + i * BPS, dc);
1570     }
1571   }
1572 }
1573
1574 static void DC16TopLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 1, 1); }
1575 static void DC16NoTop_NEON(uint8_t* dst) { DC16_NEON(dst, 0, 1); }
1576 static void DC16NoLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 1, 0); }
1577 static void DC16NoTopLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 0, 0); }
1578
1579 static void TM16_NEON(uint8_t* dst) {
1580   const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
1581   const uint8x16_t T = vld1q_u8(dst - BPS);  // top row 'A[0..15]'
1582   // A[c] - A[-1]
1583   const int16x8_t d_lo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(T), TL));
1584   const int16x8_t d_hi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(T), TL));
1585   int y;
1586   for (y = 0; y < 16; y += 4) {
1587     // left edge
1588     const int16x8_t L0 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 0 * BPS - 1));
1589     const int16x8_t L1 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 1 * BPS - 1));
1590     const int16x8_t L2 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 2 * BPS - 1));
1591     const int16x8_t L3 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 3 * BPS - 1));
1592     const int16x8_t r0_lo = vaddq_s16(L0, d_lo);  // L[r] + A[c] - A[-1]
1593     const int16x8_t r1_lo = vaddq_s16(L1, d_lo);
1594     const int16x8_t r2_lo = vaddq_s16(L2, d_lo);
1595     const int16x8_t r3_lo = vaddq_s16(L3, d_lo);
1596     const int16x8_t r0_hi = vaddq_s16(L0, d_hi);
1597     const int16x8_t r1_hi = vaddq_s16(L1, d_hi);
1598     const int16x8_t r2_hi = vaddq_s16(L2, d_hi);
1599     const int16x8_t r3_hi = vaddq_s16(L3, d_hi);
1600     // Saturate and store the result.
1601     const uint8x16_t row0 = vcombine_u8(vqmovun_s16(r0_lo), vqmovun_s16(r0_hi));
1602     const uint8x16_t row1 = vcombine_u8(vqmovun_s16(r1_lo), vqmovun_s16(r1_hi));
1603     const uint8x16_t row2 = vcombine_u8(vqmovun_s16(r2_lo), vqmovun_s16(r2_hi));
1604     const uint8x16_t row3 = vcombine_u8(vqmovun_s16(r3_lo), vqmovun_s16(r3_hi));
1605     vst1q_u8(dst + 0 * BPS, row0);
1606     vst1q_u8(dst + 1 * BPS, row1);
1607     vst1q_u8(dst + 2 * BPS, row2);
1608     vst1q_u8(dst + 3 * BPS, row3);
1609     dst += 4 * BPS;
1610   }
1611 }
1612
1613 //------------------------------------------------------------------------------
1614 // Entry point
1615
1616 extern void VP8DspInitNEON(void);
1617
1618 WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitNEON(void) {
1619   VP8Transform = TransformTwo_NEON;
1620   VP8TransformAC3 = TransformAC3_NEON;
1621   VP8TransformDC = TransformDC_NEON;
1622   VP8TransformWHT = TransformWHT_NEON;
1623
1624   VP8VFilter16 = VFilter16_NEON;
1625   VP8VFilter16i = VFilter16i_NEON;
1626   VP8HFilter16 = HFilter16_NEON;
1627 #if !defined(WORK_AROUND_GCC)
1628   VP8HFilter16i = HFilter16i_NEON;
1629 #endif
1630   VP8VFilter8 = VFilter8_NEON;
1631   VP8VFilter8i = VFilter8i_NEON;
1632 #if !defined(WORK_AROUND_GCC)
1633   VP8HFilter8 = HFilter8_NEON;
1634   VP8HFilter8i = HFilter8i_NEON;
1635 #endif
1636   VP8SimpleVFilter16 = SimpleVFilter16_NEON;
1637   VP8SimpleHFilter16 = SimpleHFilter16_NEON;
1638   VP8SimpleVFilter16i = SimpleVFilter16i_NEON;
1639   VP8SimpleHFilter16i = SimpleHFilter16i_NEON;
1640
1641   VP8PredLuma4[0] = DC4_NEON;
1642   VP8PredLuma4[1] = TM4_NEON;
1643   VP8PredLuma4[2] = VE4_NEON;
1644   VP8PredLuma4[4] = RD4_NEON;
1645   VP8PredLuma4[6] = LD4_NEON;
1646
1647   VP8PredLuma16[0] = DC16TopLeft_NEON;
1648   VP8PredLuma16[1] = TM16_NEON;
1649   VP8PredLuma16[2] = VE16_NEON;
1650   VP8PredLuma16[3] = HE16_NEON;
1651   VP8PredLuma16[4] = DC16NoTop_NEON;
1652   VP8PredLuma16[5] = DC16NoLeft_NEON;
1653   VP8PredLuma16[6] = DC16NoTopLeft_NEON;
1654
1655   VP8PredChroma8[0] = DC8uv_NEON;
1656   VP8PredChroma8[1] = TM8uv_NEON;
1657   VP8PredChroma8[2] = VE8uv_NEON;
1658   VP8PredChroma8[3] = HE8uv_NEON;
1659   VP8PredChroma8[4] = DC8uvNoTop_NEON;
1660   VP8PredChroma8[5] = DC8uvNoLeft_NEON;
1661   VP8PredChroma8[6] = DC8uvNoTopLeft_NEON;
1662 }
1663
1664 #else  // !WEBP_USE_NEON
1665
1666 WEBP_DSP_INIT_STUB(VP8DspInitNEON)
1667
1668 #endif  // WEBP_USE_NEON