1 // Copyright 2012 Google Inc. All Rights Reserved.
3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree.
8 // -----------------------------------------------------------------------------
10 // ARM NEON version of dsp functions and loop filtering.
12 // Authors: Somnath Banerjee (somnath@google.com)
13 // Johann Koenig (johannkoenig@google.com)
17 #include "src/dsp/dsp.h"
19 #if defined(WEBP_USE_NEON)
21 #include "src/dsp/neon.h"
22 #include "src/dec/vp8i_dec.h"
24 //------------------------------------------------------------------------------
25 // NxM Loading functions
27 #if !defined(WORK_AROUND_GCC)
29 // This intrinsics version makes gcc-4.6.3 crash during Load4x??() compilation
30 // (register alloc, probably). The variants somewhat mitigate the problem, but
31 // not quite. HFilter16i() remains problematic.
32 static WEBP_INLINE uint8x8x4_t Load4x8_NEON(const uint8_t* const src,
34 const uint8x8_t zero = vdup_n_u8(0);
36 INIT_VECTOR4(out, zero, zero, zero, zero);
37 out = vld4_lane_u8(src + 0 * stride, out, 0);
38 out = vld4_lane_u8(src + 1 * stride, out, 1);
39 out = vld4_lane_u8(src + 2 * stride, out, 2);
40 out = vld4_lane_u8(src + 3 * stride, out, 3);
41 out = vld4_lane_u8(src + 4 * stride, out, 4);
42 out = vld4_lane_u8(src + 5 * stride, out, 5);
43 out = vld4_lane_u8(src + 6 * stride, out, 6);
44 out = vld4_lane_u8(src + 7 * stride, out, 7);
48 static WEBP_INLINE void Load4x16_NEON(const uint8_t* const src, int stride,
52 uint8x16_t* const q1) {
53 // row0 = p1[0..7]|p0[0..7]|q0[0..7]|q1[0..7]
54 // row8 = p1[8..15]|p0[8..15]|q0[8..15]|q1[8..15]
55 const uint8x8x4_t row0 = Load4x8_NEON(src - 2 + 0 * stride, stride);
56 const uint8x8x4_t row8 = Load4x8_NEON(src - 2 + 8 * stride, stride);
57 *p1 = vcombine_u8(row0.val[0], row8.val[0]);
58 *p0 = vcombine_u8(row0.val[1], row8.val[1]);
59 *q0 = vcombine_u8(row0.val[2], row8.val[2]);
60 *q1 = vcombine_u8(row0.val[3], row8.val[3]);
63 #else // WORK_AROUND_GCC
// Loads 4 bytes from 'src' into lane LANE of VALUE, then advances 'src' by
// 'stride' ('src' and 'stride' are expected in the expanding scope).
#define LOADQ_LANE_32b(VALUE, LANE) do {                                       \
  (VALUE) = vld1q_lane_u32((const uint32_t*)src, (VALUE), (LANE));             \
  src += stride;                                                               \
} while (0)
70 static WEBP_INLINE void Load4x16_NEON(const uint8_t* src, int stride,
74 uint8x16_t* const q1) {
75 const uint32x4_t zero = vdupq_n_u32(0);
77 INIT_VECTOR4(in, zero, zero, zero, zero);
79 LOADQ_LANE_32b(in.val[0], 0);
80 LOADQ_LANE_32b(in.val[1], 0);
81 LOADQ_LANE_32b(in.val[2], 0);
82 LOADQ_LANE_32b(in.val[3], 0);
83 LOADQ_LANE_32b(in.val[0], 1);
84 LOADQ_LANE_32b(in.val[1], 1);
85 LOADQ_LANE_32b(in.val[2], 1);
86 LOADQ_LANE_32b(in.val[3], 1);
87 LOADQ_LANE_32b(in.val[0], 2);
88 LOADQ_LANE_32b(in.val[1], 2);
89 LOADQ_LANE_32b(in.val[2], 2);
90 LOADQ_LANE_32b(in.val[3], 2);
91 LOADQ_LANE_32b(in.val[0], 3);
92 LOADQ_LANE_32b(in.val[1], 3);
93 LOADQ_LANE_32b(in.val[2], 3);
94 LOADQ_LANE_32b(in.val[3], 3);
95 // Transpose four 4x4 parts:
97 const uint8x16x2_t row01 = vtrnq_u8(vreinterpretq_u8_u32(in.val[0]),
98 vreinterpretq_u8_u32(in.val[1]));
99 const uint8x16x2_t row23 = vtrnq_u8(vreinterpretq_u8_u32(in.val[2]),
100 vreinterpretq_u8_u32(in.val[3]));
101 const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
102 vreinterpretq_u16_u8(row23.val[0]));
103 const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
104 vreinterpretq_u16_u8(row23.val[1]));
105 *p1 = vreinterpretq_u8_u16(row02.val[0]);
106 *p0 = vreinterpretq_u8_u16(row13.val[0]);
107 *q0 = vreinterpretq_u8_u16(row02.val[1]);
108 *q1 = vreinterpretq_u8_u16(row13.val[1]);
111 #undef LOADQ_LANE_32b
113 #endif // !WORK_AROUND_GCC
115 static WEBP_INLINE void Load8x16_NEON(
116 const uint8_t* const src, int stride,
117 uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
118 uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
119 uint8x16_t* const q2, uint8x16_t* const q3) {
120 Load4x16_NEON(src - 2, stride, p3, p2, p1, p0);
121 Load4x16_NEON(src + 2, stride, q0, q1, q2, q3);
124 static WEBP_INLINE void Load16x4_NEON(const uint8_t* const src, int stride,
125 uint8x16_t* const p1,
126 uint8x16_t* const p0,
127 uint8x16_t* const q0,
128 uint8x16_t* const q1) {
129 *p1 = vld1q_u8(src - 2 * stride);
130 *p0 = vld1q_u8(src - 1 * stride);
131 *q0 = vld1q_u8(src + 0 * stride);
132 *q1 = vld1q_u8(src + 1 * stride);
135 static WEBP_INLINE void Load16x8_NEON(
136 const uint8_t* const src, int stride,
137 uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
138 uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
139 uint8x16_t* const q2, uint8x16_t* const q3) {
140 Load16x4_NEON(src - 2 * stride, stride, p3, p2, p1, p0);
141 Load16x4_NEON(src + 2 * stride, stride, q0, q1, q2, q3);
144 static WEBP_INLINE void Load8x8x2_NEON(
145 const uint8_t* const u, const uint8_t* const v, int stride,
146 uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
147 uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
148 uint8x16_t* const q2, uint8x16_t* const q3) {
149 // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
150 // and the v-samples on the higher half.
151 *p3 = vcombine_u8(vld1_u8(u - 4 * stride), vld1_u8(v - 4 * stride));
152 *p2 = vcombine_u8(vld1_u8(u - 3 * stride), vld1_u8(v - 3 * stride));
153 *p1 = vcombine_u8(vld1_u8(u - 2 * stride), vld1_u8(v - 2 * stride));
154 *p0 = vcombine_u8(vld1_u8(u - 1 * stride), vld1_u8(v - 1 * stride));
155 *q0 = vcombine_u8(vld1_u8(u + 0 * stride), vld1_u8(v + 0 * stride));
156 *q1 = vcombine_u8(vld1_u8(u + 1 * stride), vld1_u8(v + 1 * stride));
157 *q2 = vcombine_u8(vld1_u8(u + 2 * stride), vld1_u8(v + 2 * stride));
158 *q3 = vcombine_u8(vld1_u8(u + 3 * stride), vld1_u8(v + 3 * stride));
161 #if !defined(WORK_AROUND_GCC)
// Loads one row of 8 u-samples and 8 v-samples (starting 4 bytes before the
// edge) packed as the low/high halves of a single uint8x16_t.
// 'u', 'v' and 'stride' are expected in the expanding scope.
#define LOAD_UV_8(ROW) \
  vcombine_u8(vld1_u8(u - 4 + (ROW) * stride), vld1_u8(v - 4 + (ROW) * stride))
166 static WEBP_INLINE void Load8x8x2T_NEON(
167 const uint8_t* const u, const uint8_t* const v, int stride,
168 uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
169 uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
170 uint8x16_t* const q2, uint8x16_t* const q3) {
171 // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
172 // and the v-samples on the higher half.
173 const uint8x16_t row0 = LOAD_UV_8(0);
174 const uint8x16_t row1 = LOAD_UV_8(1);
175 const uint8x16_t row2 = LOAD_UV_8(2);
176 const uint8x16_t row3 = LOAD_UV_8(3);
177 const uint8x16_t row4 = LOAD_UV_8(4);
178 const uint8x16_t row5 = LOAD_UV_8(5);
179 const uint8x16_t row6 = LOAD_UV_8(6);
180 const uint8x16_t row7 = LOAD_UV_8(7);
181 // Perform two side-by-side 8x8 transposes
182 // u00 u01 u02 u03 u04 u05 u06 u07 | v00 v01 v02 v03 v04 v05 v06 v07
183 // u10 u11 u12 u13 u14 u15 u16 u17 | v10 v11 v12 ...
184 // u20 u21 u22 u23 u24 u25 u26 u27 | v20 v21 ...
185 // u30 u31 u32 u33 u34 u35 u36 u37 | ...
186 // u40 u41 u42 u43 u44 u45 u46 u47 | ...
187 // u50 u51 u52 u53 u54 u55 u56 u57 | ...
188 // u60 u61 u62 u63 u64 u65 u66 u67 | v60 ...
189 // u70 u71 u72 u73 u74 u75 u76 u77 | v70 v71 v72 ...
190 const uint8x16x2_t row01 = vtrnq_u8(row0, row1); // u00 u10 u02 u12 ...
191 // u01 u11 u03 u13 ...
192 const uint8x16x2_t row23 = vtrnq_u8(row2, row3); // u20 u30 u22 u32 ...
193 // u21 u31 u23 u33 ...
194 const uint8x16x2_t row45 = vtrnq_u8(row4, row5); // ...
195 const uint8x16x2_t row67 = vtrnq_u8(row6, row7); // ...
196 const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
197 vreinterpretq_u16_u8(row23.val[0]));
198 const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
199 vreinterpretq_u16_u8(row23.val[1]));
200 const uint16x8x2_t row46 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[0]),
201 vreinterpretq_u16_u8(row67.val[0]));
202 const uint16x8x2_t row57 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[1]),
203 vreinterpretq_u16_u8(row67.val[1]));
204 const uint32x4x2_t row04 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[0]),
205 vreinterpretq_u32_u16(row46.val[0]));
206 const uint32x4x2_t row26 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[1]),
207 vreinterpretq_u32_u16(row46.val[1]));
208 const uint32x4x2_t row15 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[0]),
209 vreinterpretq_u32_u16(row57.val[0]));
210 const uint32x4x2_t row37 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[1]),
211 vreinterpretq_u32_u16(row57.val[1]));
212 *p3 = vreinterpretq_u8_u32(row04.val[0]);
213 *p2 = vreinterpretq_u8_u32(row15.val[0]);
214 *p1 = vreinterpretq_u8_u32(row26.val[0]);
215 *p0 = vreinterpretq_u8_u32(row37.val[0]);
216 *q0 = vreinterpretq_u8_u32(row04.val[1]);
217 *q1 = vreinterpretq_u8_u32(row15.val[1]);
218 *q2 = vreinterpretq_u8_u32(row26.val[1]);
219 *q3 = vreinterpretq_u8_u32(row37.val[1]);
223 #endif // !WORK_AROUND_GCC
225 static WEBP_INLINE void Store2x8_NEON(const uint8x8x2_t v,
226 uint8_t* const dst, int stride) {
227 vst2_lane_u8(dst + 0 * stride, v, 0);
228 vst2_lane_u8(dst + 1 * stride, v, 1);
229 vst2_lane_u8(dst + 2 * stride, v, 2);
230 vst2_lane_u8(dst + 3 * stride, v, 3);
231 vst2_lane_u8(dst + 4 * stride, v, 4);
232 vst2_lane_u8(dst + 5 * stride, v, 5);
233 vst2_lane_u8(dst + 6 * stride, v, 6);
234 vst2_lane_u8(dst + 7 * stride, v, 7);
237 static WEBP_INLINE void Store2x16_NEON(const uint8x16_t p0, const uint8x16_t q0,
238 uint8_t* const dst, int stride) {
240 lo.val[0] = vget_low_u8(p0);
241 lo.val[1] = vget_low_u8(q0);
242 hi.val[0] = vget_high_u8(p0);
243 hi.val[1] = vget_high_u8(q0);
244 Store2x8_NEON(lo, dst - 1 + 0 * stride, stride);
245 Store2x8_NEON(hi, dst - 1 + 8 * stride, stride);
248 #if !defined(WORK_AROUND_GCC)
249 static WEBP_INLINE void Store4x8_NEON(const uint8x8x4_t v,
250 uint8_t* const dst, int stride) {
251 vst4_lane_u8(dst + 0 * stride, v, 0);
252 vst4_lane_u8(dst + 1 * stride, v, 1);
253 vst4_lane_u8(dst + 2 * stride, v, 2);
254 vst4_lane_u8(dst + 3 * stride, v, 3);
255 vst4_lane_u8(dst + 4 * stride, v, 4);
256 vst4_lane_u8(dst + 5 * stride, v, 5);
257 vst4_lane_u8(dst + 6 * stride, v, 6);
258 vst4_lane_u8(dst + 7 * stride, v, 7);
261 static WEBP_INLINE void Store4x16_NEON(const uint8x16_t p1, const uint8x16_t p0,
262 const uint8x16_t q0, const uint8x16_t q1,
263 uint8_t* const dst, int stride) {
266 vget_low_u8(p1), vget_low_u8(p0),
267 vget_low_u8(q0), vget_low_u8(q1));
269 vget_high_u8(p1), vget_high_u8(p0),
270 vget_high_u8(q0), vget_high_u8(q1));
271 Store4x8_NEON(lo, dst - 2 + 0 * stride, stride);
272 Store4x8_NEON(hi, dst - 2 + 8 * stride, stride);
274 #endif // !WORK_AROUND_GCC
276 static WEBP_INLINE void Store16x2_NEON(const uint8x16_t p0, const uint8x16_t q0,
277 uint8_t* const dst, int stride) {
278 vst1q_u8(dst - stride, p0);
282 static WEBP_INLINE void Store16x4_NEON(const uint8x16_t p1, const uint8x16_t p0,
283 const uint8x16_t q0, const uint8x16_t q1,
284 uint8_t* const dst, int stride) {
285 Store16x2_NEON(p1, p0, dst - stride, stride);
286 Store16x2_NEON(q0, q1, dst + stride, stride);
289 static WEBP_INLINE void Store8x2x2_NEON(const uint8x16_t p0,
291 uint8_t* const u, uint8_t* const v,
293 // p0 and q0 contain the u+v samples packed in low/high halves.
294 vst1_u8(u - stride, vget_low_u8(p0));
295 vst1_u8(u, vget_low_u8(q0));
296 vst1_u8(v - stride, vget_high_u8(p0));
297 vst1_u8(v, vget_high_u8(q0));
300 static WEBP_INLINE void Store8x4x2_NEON(const uint8x16_t p1,
304 uint8_t* const u, uint8_t* const v,
306 // The p1...q1 registers contain the u+v samples packed in low/high halves.
307 Store8x2x2_NEON(p1, p0, u - stride, v - stride, stride);
308 Store8x2x2_NEON(q0, q1, u + stride, v + stride, stride);
311 #if !defined(WORK_AROUND_GCC)
// Stores lane LANE of the two 3-vector groups as 6 bytes at DST-3..DST+2,
// then advances DST to the next row ('stride' is expected in scope).
#define STORE6_LANE(DST, VAL0, VAL1, LANE) do {                                \
  vst3_lane_u8((DST) - 3, (VAL0), (LANE));                                     \
  vst3_lane_u8((DST) + 0, (VAL1), (LANE));                                     \
  (DST) += stride;                                                             \
} while (0)
319 static WEBP_INLINE void Store6x8x2_NEON(
320 const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
321 const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
322 uint8_t* u, uint8_t* v, int stride) {
323 uint8x8x3_t u0, u1, v0, v1;
324 INIT_VECTOR3(u0, vget_low_u8(p2), vget_low_u8(p1), vget_low_u8(p0));
325 INIT_VECTOR3(u1, vget_low_u8(q0), vget_low_u8(q1), vget_low_u8(q2));
326 INIT_VECTOR3(v0, vget_high_u8(p2), vget_high_u8(p1), vget_high_u8(p0));
327 INIT_VECTOR3(v1, vget_high_u8(q0), vget_high_u8(q1), vget_high_u8(q2));
328 STORE6_LANE(u, u0, u1, 0);
329 STORE6_LANE(u, u0, u1, 1);
330 STORE6_LANE(u, u0, u1, 2);
331 STORE6_LANE(u, u0, u1, 3);
332 STORE6_LANE(u, u0, u1, 4);
333 STORE6_LANE(u, u0, u1, 5);
334 STORE6_LANE(u, u0, u1, 6);
335 STORE6_LANE(u, u0, u1, 7);
336 STORE6_LANE(v, v0, v1, 0);
337 STORE6_LANE(v, v0, v1, 1);
338 STORE6_LANE(v, v0, v1, 2);
339 STORE6_LANE(v, v0, v1, 3);
340 STORE6_LANE(v, v0, v1, 4);
341 STORE6_LANE(v, v0, v1, 5);
342 STORE6_LANE(v, v0, v1, 6);
343 STORE6_LANE(v, v0, v1, 7);
347 static WEBP_INLINE void Store4x8x2_NEON(const uint8x16_t p1,
351 uint8_t* const u, uint8_t* const v,
355 vget_low_u8(p1), vget_low_u8(p0),
356 vget_low_u8(q0), vget_low_u8(q1));
358 vget_high_u8(p1), vget_high_u8(p0),
359 vget_high_u8(q0), vget_high_u8(q1));
360 vst4_lane_u8(u - 2 + 0 * stride, u0, 0);
361 vst4_lane_u8(u - 2 + 1 * stride, u0, 1);
362 vst4_lane_u8(u - 2 + 2 * stride, u0, 2);
363 vst4_lane_u8(u - 2 + 3 * stride, u0, 3);
364 vst4_lane_u8(u - 2 + 4 * stride, u0, 4);
365 vst4_lane_u8(u - 2 + 5 * stride, u0, 5);
366 vst4_lane_u8(u - 2 + 6 * stride, u0, 6);
367 vst4_lane_u8(u - 2 + 7 * stride, u0, 7);
368 vst4_lane_u8(v - 2 + 0 * stride, v0, 0);
369 vst4_lane_u8(v - 2 + 1 * stride, v0, 1);
370 vst4_lane_u8(v - 2 + 2 * stride, v0, 2);
371 vst4_lane_u8(v - 2 + 3 * stride, v0, 3);
372 vst4_lane_u8(v - 2 + 4 * stride, v0, 4);
373 vst4_lane_u8(v - 2 + 5 * stride, v0, 5);
374 vst4_lane_u8(v - 2 + 6 * stride, v0, 6);
375 vst4_lane_u8(v - 2 + 7 * stride, v0, 7);
378 #endif // !WORK_AROUND_GCC
380 // Zero extend 'v' to an int16x8_t.
381 static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint8x8_t v) {
382 return vreinterpretq_s16_u16(vmovl_u8(v));
385 // Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
386 // to the corresponding rows of 'dst'.
387 static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
388 const int16x8_t dst01,
389 const int16x8_t dst23) {
390 // Unsigned saturate to 8b.
391 const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
392 const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
394 // Store the results.
395 vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
396 vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
397 vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
398 vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
401 static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
402 const int16x8_t row23,
403 uint8_t* const dst) {
404 uint32x2_t dst01 = vdup_n_u32(0);
405 uint32x2_t dst23 = vdup_n_u32(0);
407 // Load the source pixels.
408 dst01 = vld1_lane_u32((uint32_t*)(dst + 0 * BPS), dst01, 0);
409 dst23 = vld1_lane_u32((uint32_t*)(dst + 2 * BPS), dst23, 0);
410 dst01 = vld1_lane_u32((uint32_t*)(dst + 1 * BPS), dst01, 1);
411 dst23 = vld1_lane_u32((uint32_t*)(dst + 3 * BPS), dst23, 1);
415 const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst01));
416 const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst23));
418 // Descale with rounding.
419 const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
420 const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
421 // Add the inverse transform.
422 SaturateAndStore4x4_NEON(dst, out01, out23);
426 //-----------------------------------------------------------------------------
427 // Simple In-loop filtering (Paragraph 15.2)
429 static uint8x16_t NeedsFilter_NEON(const uint8x16_t p1, const uint8x16_t p0,
430 const uint8x16_t q0, const uint8x16_t q1,
432 const uint8x16_t thresh_v = vdupq_n_u8((uint8_t)thresh);
433 const uint8x16_t a_p0_q0 = vabdq_u8(p0, q0); // abs(p0-q0)
434 const uint8x16_t a_p1_q1 = vabdq_u8(p1, q1); // abs(p1-q1)
435 const uint8x16_t a_p0_q0_2 = vqaddq_u8(a_p0_q0, a_p0_q0); // 2 * abs(p0-q0)
436 const uint8x16_t a_p1_q1_2 = vshrq_n_u8(a_p1_q1, 1); // abs(p1-q1) / 2
437 const uint8x16_t sum = vqaddq_u8(a_p0_q0_2, a_p1_q1_2);
438 const uint8x16_t mask = vcgeq_u8(thresh_v, sum);
442 static int8x16_t FlipSign_NEON(const uint8x16_t v) {
443 const uint8x16_t sign_bit = vdupq_n_u8(0x80);
444 return vreinterpretq_s8_u8(veorq_u8(v, sign_bit));
447 static uint8x16_t FlipSignBack_NEON(const int8x16_t v) {
448 const int8x16_t sign_bit = vdupq_n_s8(0x80);
449 return vreinterpretq_u8_s8(veorq_s8(v, sign_bit));
452 static int8x16_t GetBaseDelta_NEON(const int8x16_t p1, const int8x16_t p0,
453 const int8x16_t q0, const int8x16_t q1) {
454 const int8x16_t q0_p0 = vqsubq_s8(q0, p0); // (q0-p0)
455 const int8x16_t p1_q1 = vqsubq_s8(p1, q1); // (p1-q1)
456 const int8x16_t s1 = vqaddq_s8(p1_q1, q0_p0); // (p1-q1) + 1 * (q0 - p0)
457 const int8x16_t s2 = vqaddq_s8(q0_p0, s1); // (p1-q1) + 2 * (q0 - p0)
458 const int8x16_t s3 = vqaddq_s8(q0_p0, s2); // (p1-q1) + 3 * (q0 - p0)
462 static int8x16_t GetBaseDelta0_NEON(const int8x16_t p0, const int8x16_t q0) {
463 const int8x16_t q0_p0 = vqsubq_s8(q0, p0); // (q0-p0)
464 const int8x16_t s1 = vqaddq_s8(q0_p0, q0_p0); // 2 * (q0 - p0)
465 const int8x16_t s2 = vqaddq_s8(q0_p0, s1); // 3 * (q0 - p0)
469 //------------------------------------------------------------------------------
471 static void ApplyFilter2NoFlip_NEON(const int8x16_t p0s, const int8x16_t q0s,
472 const int8x16_t delta,
473 int8x16_t* const op0,
474 int8x16_t* const oq0) {
475 const int8x16_t kCst3 = vdupq_n_s8(0x03);
476 const int8x16_t kCst4 = vdupq_n_s8(0x04);
477 const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
478 const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
479 const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
480 const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
481 *op0 = vqaddq_s8(p0s, delta3);
482 *oq0 = vqsubq_s8(q0s, delta4);
485 #if defined(WEBP_USE_INTRINSICS)
487 static void ApplyFilter2_NEON(const int8x16_t p0s, const int8x16_t q0s,
488 const int8x16_t delta,
489 uint8x16_t* const op0, uint8x16_t* const oq0) {
490 const int8x16_t kCst3 = vdupq_n_s8(0x03);
491 const int8x16_t kCst4 = vdupq_n_s8(0x04);
492 const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
493 const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
494 const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
495 const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
496 const int8x16_t sp0 = vqaddq_s8(p0s, delta3);
497 const int8x16_t sq0 = vqsubq_s8(q0s, delta4);
498 *op0 = FlipSignBack_NEON(sp0);
499 *oq0 = FlipSignBack_NEON(sq0);
502 static void DoFilter2_NEON(const uint8x16_t p1, const uint8x16_t p0,
503 const uint8x16_t q0, const uint8x16_t q1,
504 const uint8x16_t mask,
505 uint8x16_t* const op0, uint8x16_t* const oq0) {
506 const int8x16_t p1s = FlipSign_NEON(p1);
507 const int8x16_t p0s = FlipSign_NEON(p0);
508 const int8x16_t q0s = FlipSign_NEON(q0);
509 const int8x16_t q1s = FlipSign_NEON(q1);
510 const int8x16_t delta0 = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
511 const int8x16_t delta1 = vandq_s8(delta0, vreinterpretq_s8_u8(mask));
512 ApplyFilter2_NEON(p0s, q0s, delta1, op0, oq0);
515 static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) {
516 uint8x16_t p1, p0, q0, q1, op0, oq0;
517 Load16x4_NEON(p, stride, &p1, &p0, &q0, &q1);
519 const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
520 DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0);
522 Store16x2_NEON(op0, oq0, p, stride);
525 static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) {
526 uint8x16_t p1, p0, q0, q1, oq0, op0;
527 Load4x16_NEON(p, stride, &p1, &p0, &q0, &q1);
529 const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
530 DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0);
532 Store2x16_NEON(op0, oq0, p, stride);
537 // Load/Store vertical edge
538 #define LOAD8x4(c1, c2, c3, c4, b1, b2, stride) \
539 "vld4.8 {" #c1 "[0]," #c2 "[0]," #c3 "[0]," #c4 "[0]}," #b1 "," #stride "\n" \
540 "vld4.8 {" #c1 "[1]," #c2 "[1]," #c3 "[1]," #c4 "[1]}," #b2 "," #stride "\n" \
541 "vld4.8 {" #c1 "[2]," #c2 "[2]," #c3 "[2]," #c4 "[2]}," #b1 "," #stride "\n" \
542 "vld4.8 {" #c1 "[3]," #c2 "[3]," #c3 "[3]," #c4 "[3]}," #b2 "," #stride "\n" \
543 "vld4.8 {" #c1 "[4]," #c2 "[4]," #c3 "[4]," #c4 "[4]}," #b1 "," #stride "\n" \
544 "vld4.8 {" #c1 "[5]," #c2 "[5]," #c3 "[5]," #c4 "[5]}," #b2 "," #stride "\n" \
545 "vld4.8 {" #c1 "[6]," #c2 "[6]," #c3 "[6]," #c4 "[6]}," #b1 "," #stride "\n" \
546 "vld4.8 {" #c1 "[7]," #c2 "[7]," #c3 "[7]," #c4 "[7]}," #b2 "," #stride "\n"
548 #define STORE8x2(c1, c2, p, stride) \
549 "vst2.8 {" #c1 "[0], " #c2 "[0]}," #p "," #stride " \n" \
550 "vst2.8 {" #c1 "[1], " #c2 "[1]}," #p "," #stride " \n" \
551 "vst2.8 {" #c1 "[2], " #c2 "[2]}," #p "," #stride " \n" \
552 "vst2.8 {" #c1 "[3], " #c2 "[3]}," #p "," #stride " \n" \
553 "vst2.8 {" #c1 "[4], " #c2 "[4]}," #p "," #stride " \n" \
554 "vst2.8 {" #c1 "[5], " #c2 "[5]}," #p "," #stride " \n" \
555 "vst2.8 {" #c1 "[6], " #c2 "[6]}," #p "," #stride " \n" \
556 "vst2.8 {" #c1 "[7], " #c2 "[7]}," #p "," #stride " \n"
558 #define QRegs "q0", "q1", "q2", "q3", \
559 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
561 #define FLIP_SIGN_BIT2(a, b, s) \
562 "veor " #a "," #a "," #s " \n" \
563 "veor " #b "," #b "," #s " \n" \
565 #define FLIP_SIGN_BIT4(a, b, c, d, s) \
566 FLIP_SIGN_BIT2(a, b, s) \
567 FLIP_SIGN_BIT2(c, d, s) \
569 #define NEEDS_FILTER(p1, p0, q0, q1, thresh, mask) \
570 "vabd.u8 q15," #p0 "," #q0 " \n" /* abs(p0 - q0) */ \
571 "vabd.u8 q14," #p1 "," #q1 " \n" /* abs(p1 - q1) */ \
572 "vqadd.u8 q15, q15, q15 \n" /* abs(p0 - q0) * 2 */ \
573 "vshr.u8 q14, q14, #1 \n" /* abs(p1 - q1) / 2 */ \
574 "vqadd.u8 q15, q15, q14 \n" /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
575 "vdup.8 q14, " #thresh " \n" \
576 "vcge.u8 " #mask ", q14, q15 \n" /* mask <= thresh */
578 #define GET_BASE_DELTA(p1, p0, q0, q1, o) \
579 "vqsub.s8 q15," #q0 "," #p0 " \n" /* (q0 - p0) */ \
580 "vqsub.s8 " #o "," #p1 "," #q1 " \n" /* (p1 - q1) */ \
581 "vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 1 * (p0 - q0) */ \
582 "vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 2 * (p0 - q0) */ \
583 "vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 3 * (p0 - q0) */
585 #define DO_SIMPLE_FILTER(p0, q0, fl) \
586 "vmov.i8 q15, #0x03 \n" \
587 "vqadd.s8 q15, q15, " #fl " \n" /* filter1 = filter + 3 */ \
588 "vshr.s8 q15, q15, #3 \n" /* filter1 >> 3 */ \
589 "vqadd.s8 " #p0 "," #p0 ", q15 \n" /* p0 += filter1 */ \
591 "vmov.i8 q15, #0x04 \n" \
592 "vqadd.s8 q15, q15, " #fl " \n" /* filter1 = filter + 4 */ \
593 "vshr.s8 q15, q15, #3 \n" /* filter2 >> 3 */ \
594 "vqsub.s8 " #q0 "," #q0 ", q15 \n" /* q0 -= filter2 */
596 // Applies filter on 2 pixels (p0 and q0)
597 #define DO_FILTER2(p1, p0, q0, q1, thresh) \
598 NEEDS_FILTER(p1, p0, q0, q1, thresh, q9) /* filter mask in q9 */ \
599 "vmov.i8 q10, #0x80 \n" /* sign bit */ \
600 FLIP_SIGN_BIT4(p1, p0, q0, q1, q10) /* convert to signed value */ \
601 GET_BASE_DELTA(p1, p0, q0, q1, q11) /* get filter level */ \
602 "vand q9, q9, q11 \n" /* apply filter mask */ \
603 DO_SIMPLE_FILTER(p0, q0, q9) /* apply filter */ \
604 FLIP_SIGN_BIT2(p0, q0, q10)
606 static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) {
608 "sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride
610 "vld1.u8 {q1}, [%[p]], %[stride] \n" // p1
611 "vld1.u8 {q2}, [%[p]], %[stride] \n" // p0
612 "vld1.u8 {q3}, [%[p]], %[stride] \n" // q0
613 "vld1.u8 {q12}, [%[p]] \n" // q1
615 DO_FILTER2(q1, q2, q3, q12, %[thresh])
617 "sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride
619 "vst1.u8 {q2}, [%[p]], %[stride] \n" // store op0
620 "vst1.u8 {q3}, [%[p]] \n" // store oq0
622 : [stride] "r"(stride), [thresh] "r"(thresh)
627 static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) {
629 "sub r4, %[p], #2 \n" // base1 = p - 2
630 "lsl r6, %[stride], #1 \n" // r6 = 2 * stride
631 "add r5, r4, %[stride] \n" // base2 = base1 + stride
633 LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)
634 LOAD8x4(d24, d25, d26, d27, [r4], [r5], r6)
635 "vswp d3, d24 \n" // p1:q1 p0:q3
636 "vswp d5, d26 \n" // q0:q2 q1:q4
637 "vswp q2, q12 \n" // p1:q1 p0:q2 q0:q3 q1:q4
639 DO_FILTER2(q1, q2, q12, q13, %[thresh])
641 "sub %[p], %[p], #1 \n" // p - 1
644 STORE8x2(d4, d5, [%[p]], %[stride])
645 STORE8x2(d24, d25, [%[p]], %[stride])
648 : [stride] "r"(stride), [thresh] "r"(thresh)
649 : "memory", "r4", "r5", "r6", QRegs
656 #endif // WEBP_USE_INTRINSICS
658 static void SimpleVFilter16i_NEON(uint8_t* p, int stride, int thresh) {
660 for (k = 3; k != 0; --k) {
662 SimpleVFilter16_NEON(p, stride, thresh);
666 static void SimpleHFilter16i_NEON(uint8_t* p, int stride, int thresh) {
668 for (k = 3; k != 0; --k) {
670 SimpleHFilter16_NEON(p, stride, thresh);
674 //------------------------------------------------------------------------------
675 // Complex In-loop filtering (Paragraph 15.3)
677 static uint8x16_t NeedsHev_NEON(const uint8x16_t p1, const uint8x16_t p0,
678 const uint8x16_t q0, const uint8x16_t q1,
680 const uint8x16_t hev_thresh_v = vdupq_n_u8((uint8_t)hev_thresh);
681 const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0); // abs(p1 - p0)
682 const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0); // abs(q1 - q0)
683 const uint8x16_t a_max = vmaxq_u8(a_p1_p0, a_q1_q0);
684 const uint8x16_t mask = vcgtq_u8(a_max, hev_thresh_v);
688 static uint8x16_t NeedsFilter2_NEON(const uint8x16_t p3, const uint8x16_t p2,
689 const uint8x16_t p1, const uint8x16_t p0,
690 const uint8x16_t q0, const uint8x16_t q1,
691 const uint8x16_t q2, const uint8x16_t q3,
692 int ithresh, int thresh) {
693 const uint8x16_t ithresh_v = vdupq_n_u8((uint8_t)ithresh);
694 const uint8x16_t a_p3_p2 = vabdq_u8(p3, p2); // abs(p3 - p2)
695 const uint8x16_t a_p2_p1 = vabdq_u8(p2, p1); // abs(p2 - p1)
696 const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0); // abs(p1 - p0)
697 const uint8x16_t a_q3_q2 = vabdq_u8(q3, q2); // abs(q3 - q2)
698 const uint8x16_t a_q2_q1 = vabdq_u8(q2, q1); // abs(q2 - q1)
699 const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0); // abs(q1 - q0)
700 const uint8x16_t max1 = vmaxq_u8(a_p3_p2, a_p2_p1);
701 const uint8x16_t max2 = vmaxq_u8(a_p1_p0, a_q3_q2);
702 const uint8x16_t max3 = vmaxq_u8(a_q2_q1, a_q1_q0);
703 const uint8x16_t max12 = vmaxq_u8(max1, max2);
704 const uint8x16_t max123 = vmaxq_u8(max12, max3);
705 const uint8x16_t mask2 = vcgeq_u8(ithresh_v, max123);
706 const uint8x16_t mask1 = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
707 const uint8x16_t mask = vandq_u8(mask1, mask2);
713 static void ApplyFilter4_NEON(
714 const int8x16_t p1, const int8x16_t p0,
715 const int8x16_t q0, const int8x16_t q1,
716 const int8x16_t delta0,
717 uint8x16_t* const op1, uint8x16_t* const op0,
718 uint8x16_t* const oq0, uint8x16_t* const oq1) {
719 const int8x16_t kCst3 = vdupq_n_s8(0x03);
720 const int8x16_t kCst4 = vdupq_n_s8(0x04);
721 const int8x16_t delta1 = vqaddq_s8(delta0, kCst4);
722 const int8x16_t delta2 = vqaddq_s8(delta0, kCst3);
723 const int8x16_t a1 = vshrq_n_s8(delta1, 3);
724 const int8x16_t a2 = vshrq_n_s8(delta2, 3);
725 const int8x16_t a3 = vrshrq_n_s8(a1, 1); // a3 = (a1 + 1) >> 1
726 *op0 = FlipSignBack_NEON(vqaddq_s8(p0, a2)); // clip(p0 + a2)
727 *oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1)); // clip(q0 - a1)
728 *op1 = FlipSignBack_NEON(vqaddq_s8(p1, a3)); // clip(p1 + a3)
729 *oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a3)); // clip(q1 - a3)
732 static void DoFilter4_NEON(
733 const uint8x16_t p1, const uint8x16_t p0,
734 const uint8x16_t q0, const uint8x16_t q1,
735 const uint8x16_t mask, const uint8x16_t hev_mask,
736 uint8x16_t* const op1, uint8x16_t* const op0,
737 uint8x16_t* const oq0, uint8x16_t* const oq1) {
738 // This is a fused version of DoFilter2() calling ApplyFilter2 directly
739 const int8x16_t p1s = FlipSign_NEON(p1);
740 int8x16_t p0s = FlipSign_NEON(p0);
741 int8x16_t q0s = FlipSign_NEON(q0);
742 const int8x16_t q1s = FlipSign_NEON(q1);
743 const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
745 // do_filter2 part (simple loopfilter on pixels with hev)
747 const int8x16_t delta = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
748 const int8x16_t simple_lf_delta =
749 vandq_s8(delta, vreinterpretq_s8_u8(simple_lf_mask));
750 ApplyFilter2NoFlip_NEON(p0s, q0s, simple_lf_delta, &p0s, &q0s);
753 // do_filter4 part (complex loopfilter on pixels without hev)
755 const int8x16_t delta0 = GetBaseDelta0_NEON(p0s, q0s);
756 // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
757 const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
758 const int8x16_t complex_lf_delta =
759 vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
760 ApplyFilter4_NEON(p1s, p0s, q0s, q1s, complex_lf_delta, op1, op0, oq0, oq1);
766 static void ApplyFilter6_NEON(
767 const int8x16_t p2, const int8x16_t p1, const int8x16_t p0,
768 const int8x16_t q0, const int8x16_t q1, const int8x16_t q2,
769 const int8x16_t delta,
770 uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
771 uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
772 // We have to compute: X = (9*a+63) >> 7, Y = (18*a+63)>>7, Z = (27*a+63) >> 7
773 // Turns out, there's a common sub-expression S=9 * a - 1 that can be used
774 // with the special vqrshrn_n_s16 rounding-shift-and-narrow instruction:
775 // X = (S + 64) >> 7, Y = (S + 32) >> 6, Z = (18 * a + S + 64) >> 7
776 const int8x8_t delta_lo = vget_low_s8(delta);
777 const int8x8_t delta_hi = vget_high_s8(delta);
778 const int8x8_t kCst9 = vdup_n_s8(9);
779 const int16x8_t kCstm1 = vdupq_n_s16(-1);
780 const int8x8_t kCst18 = vdup_n_s8(18);
781 const int16x8_t S_lo = vmlal_s8(kCstm1, kCst9, delta_lo); // S = 9 * a - 1
782 const int16x8_t S_hi = vmlal_s8(kCstm1, kCst9, delta_hi);
783 const int16x8_t Z_lo = vmlal_s8(S_lo, kCst18, delta_lo); // S + 18 * a
784 const int16x8_t Z_hi = vmlal_s8(S_hi, kCst18, delta_hi);
785 const int8x8_t a3_lo = vqrshrn_n_s16(S_lo, 7); // (9 * a + 63) >> 7
786 const int8x8_t a3_hi = vqrshrn_n_s16(S_hi, 7);
787 const int8x8_t a2_lo = vqrshrn_n_s16(S_lo, 6); // (9 * a + 31) >> 6
788 const int8x8_t a2_hi = vqrshrn_n_s16(S_hi, 6);
789 const int8x8_t a1_lo = vqrshrn_n_s16(Z_lo, 7); // (27 * a + 63) >> 7
790 const int8x8_t a1_hi = vqrshrn_n_s16(Z_hi, 7);
791 const int8x16_t a1 = vcombine_s8(a1_lo, a1_hi);
792 const int8x16_t a2 = vcombine_s8(a2_lo, a2_hi);
793 const int8x16_t a3 = vcombine_s8(a3_lo, a3_hi);
795 *op0 = FlipSignBack_NEON(vqaddq_s8(p0, a1)); // clip(p0 + a1)
796 *oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1)); // clip(q0 - q1)
797 *oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a2)); // clip(q1 - a2)
798 *op1 = FlipSignBack_NEON(vqaddq_s8(p1, a2)); // clip(p1 + a2)
799 *oq2 = FlipSignBack_NEON(vqsubq_s8(q2, a3)); // clip(q2 - a3)
800 *op2 = FlipSignBack_NEON(vqaddq_s8(p2, a3)); // clip(p2 + a3)
// Edge filtering with the "complex" filter, used on macroblock borders.
// Pixels flagged by both 'mask' and 'hev_mask' (high edge variance) receive
// the simple 2-tap filter; pixels flagged by 'mask' alone receive the full
// 6-tap filter (ApplyFilter6_NEON). p2..p0 / q0..q2 are the three pixels on
// each side of the edge; filtered values are returned through op2..oq2.
static void DoFilter6_NEON(
    const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
    const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
    const uint8x16_t mask, const uint8x16_t hev_mask,
    uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
    uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
  // This is a fused version of DoFilter2() calling ApplyFilter2 directly
  // Signed-domain copies of the six input rows (see FlipSign_NEON).
  const int8x16_t p2s = FlipSign_NEON(p2);
  const int8x16_t p1s = FlipSign_NEON(p1);
  int8x16_t p0s = FlipSign_NEON(p0);       // non-const: updated by filter2 pass
  int8x16_t q0s = FlipSign_NEON(q0);       // non-const: updated by filter2 pass
  const int8x16_t q1s = FlipSign_NEON(q1);
  const int8x16_t q2s = FlipSign_NEON(q2);
  const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
  const int8x16_t delta0 = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);

  // do_filter2 part (simple loopfilter on pixels with hev)
  {
    const int8x16_t simple_lf_delta =
        vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask));
    ApplyFilter2NoFlip_NEON(p0s, q0s, simple_lf_delta, &p0s, &q0s);
  }

  // do_filter6 part (complex loopfilter on pixels without hev)
  {
    // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
    const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
    const int8x16_t complex_lf_delta =
        vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
    ApplyFilter6_NEON(p2s, p1s, p0s, q0s, q1s, q2s, complex_lf_delta,
                      op2, op1, op0, oq0, oq1, oq2);
  }
}
837 // on macroblock edges
// Vertical (horizontal-edge) complex filtering of a 16-pixel-wide luma
// macroblock border. 'p' points at the first row below the edge.
static void VFilter16_NEON(uint8_t* p, int stride,
                           int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  Load16x8_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
                                              ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
                   &op2, &op1, &op0, &oq0, &oq1, &oq2);
    // Write back the six modified rows (three on each side of the edge).
    Store16x2_NEON(op2, op1, p - 2 * stride, stride);
    Store16x2_NEON(op0, oq0, p + 0 * stride, stride);
    Store16x2_NEON(oq1, oq2, p + 2 * stride, stride);
  }
}
// Horizontal (vertical-edge) complex filtering of a 16-pixel-tall luma
// macroblock border. Same as VFilter16_NEON but with transposed load/store.
static void HFilter16_NEON(uint8_t* p, int stride,
                           int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  Load8x16_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
                                              ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
                   &op2, &op1, &op0, &oq0, &oq1, &oq2);
    // Write back the six modified columns (three on each side of the edge).
    Store2x16_NEON(op2, op1, p - 2, stride);
    Store2x16_NEON(op0, oq0, p + 0, stride);
    Store2x16_NEON(oq1, oq2, p + 2, stride);
  }
}
873 // on three inner edges
// Vertical filtering of the three inner horizontal edges of a luma
// macroblock (rows 4, 8 and 12), using the simpler 4-tap filter.
// Fix: in this copy the loop body never advanced 'p' nor rotated the
// p1/p0 samples for the next span, so all three iterations filtered the
// same edge with stale rows. The advance ('p += 4 * stride') and the
// sample rotation ('p1 = q2; p0 = q3;') are restored.
static void VFilter16i_NEON(uint8_t* p, int stride,
                            int thresh, int ithresh, int hev_thresh) {
  uint32_t k;
  uint8x16_t p3, p2, p1, p0;
  Load16x4_NEON(p + 2 * stride, stride, &p3, &p2, &p1, &p0);
  for (k = 3; k != 0; --k) {
    uint8x16_t q0, q1, q2, q3;
    p += 4 * stride;   // next inner edge
    Load16x4_NEON(p + 2 * stride, stride, &q0, &q1, &q2, &q3);
    {
      const uint8x16_t mask =
          NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
      const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
      // p3 and p2 are not just temporary variables here: they will be
      // re-used for next span. And q2/q3 will become p1/p0 accordingly.
      DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
      Store16x4_NEON(p1, p0, p3, p2, p, stride);
    }
    // Rotate samples: the two rows below the filtered span become the two
    // rows above the next edge.
    p1 = q2;
    p0 = q3;
  }
}
900 #if !defined(WORK_AROUND_GCC)
// Horizontal filtering of the three inner vertical edges of a luma
// macroblock (columns 4, 8 and 12), using the simpler 4-tap filter.
// Fix: in this copy the loop body never advanced 'p' nor rotated the
// p1/p0 samples, so all three iterations filtered the same edge. The
// advance ('p += 4') and the rotation ('p1 = q2; p0 = q3;') are restored.
static void HFilter16i_NEON(uint8_t* p, int stride,
                            int thresh, int ithresh, int hev_thresh) {
  uint32_t k;
  uint8x16_t p3, p2, p1, p0;
  Load4x16_NEON(p + 2, stride, &p3, &p2, &p1, &p0);
  for (k = 3; k != 0; --k) {
    uint8x16_t q0, q1, q2, q3;
    p += 4;   // next inner edge
    Load4x16_NEON(p + 2, stride, &q0, &q1, &q2, &q3);
    {
      const uint8x16_t mask =
          NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
      const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
      DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
      Store4x16_NEON(p1, p0, p3, p2, p, stride);
    }
    // Rotate samples for the next span.
    p1 = q2;
    p0 = q3;
  }
}
921 #endif // !WORK_AROUND_GCC
923 // 8-pixels wide variant, for chroma filtering
// Vertical complex filtering of the chroma macroblock border: the U and V
// 8x8 planes are filtered together, packed side-by-side in 16-lane vectors.
static void VFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
                          int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
                                              ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
                   &op2, &op1, &op0, &oq0, &oq1, &oq2);
    // Write back three rows on each side of the edge, to both planes.
    Store8x2x2_NEON(op2, op1, u - 2 * stride, v - 2 * stride, stride);
    Store8x2x2_NEON(op0, oq0, u + 0 * stride, v + 0 * stride, stride);
    Store8x2x2_NEON(oq1, oq2, u + 2 * stride, v + 2 * stride, stride);
  }
}
// Vertical filtering of the single inner horizontal edge (row 4) of the
// chroma U/V 8x8 planes, using the 4-tap filter.
// Fix: in this copy the pointers were not advanced to the inner edge
// ('u += 4 * stride; v += 4 * stride;' was missing), so the border edge
// was filtered instead of the inner one. The advance is restored.
static void VFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
                           int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  u += 4 * stride;
  v += 4 * stride;
  Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
                                              ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op1, op0, oq0, oq1;
    DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
    Store8x4x2_NEON(op1, op0, oq0, oq1, u, v, stride);
  }
}
956 #if !defined(WORK_AROUND_GCC)
// Horizontal complex filtering of the chroma macroblock border (vertical
// edge); U and V are processed together via a transposing load.
static void HFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
                          int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
                                              ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
                   &op2, &op1, &op0, &oq0, &oq1, &oq2);
    // Transposing store of the six modified columns, to both planes.
    Store6x8x2_NEON(op2, op1, op0, oq0, oq1, oq2, u, v, stride);
  }
}
// Horizontal filtering of the single inner vertical edge (column 4) of the
// chroma U/V 8x8 planes, using the 4-tap filter.
// Fix: in this copy the pointers were not advanced to the inner edge
// ('u += 4; v += 4;' was missing), so the border edge was filtered instead
// of the inner one. The advance is restored.
static void HFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
                           int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  u += 4;
  v += 4;
  Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
                                              ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op1, op0, oq0, oq1;
    DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
    Store4x8x2_NEON(op1, op0, oq0, oq1, u, v, stride);
  }
}
987 #endif // !WORK_AROUND_GCC
989 //-----------------------------------------------------------------------------
990 // Inverse transforms (Paragraph 14.4)
992 // Technically these are unsigned but vqdmulh is only available in signed.
993 // vqdmulh returns high half (effectively >> 16) but also doubles the value,
994 // changing the >> 16 to >> 15 and requiring an additional >> 1.
995 // We use this to our advantage with kC2. The canonical value is 35468.
996 // However, the high bit is set so treating it as signed will give incorrect
997 // results. We avoid this by down shifting by 1 here to clear the highest bit.
998 // Combined with the doubling effect of vqdmulh we get >> 16.
999 // This can not be applied to kC1 because the lowest bit is set. Down shifting
1000 // the constant would reduce precision.
1002 // libwebp uses a trick to avoid some extra addition that libvpx does.
1004 // temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
1005 // libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the
1006 // same issue with kC1 and vqdmulh that we work around by down shifting kC2
static const int16_t kC1 = 20091;  // fractional part of sqrt(2)*cos(pi/8)<<16
static const int16_t kC2 = 17734;  // half of kC2, actually. See comment above.
1011 #if defined(WEBP_USE_INTRINSICS)
// Transposes the two input registers, viewed as a pair of 4x2 16-bit tiles
// (two 4x4 rows each), using two zips. See the diagram below.
static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
                                          const int16x8_t in1,
                                          int16x8x2_t* const out) {
  // a0 a1 a2 a3 | b0 b1 b2 b3   =>   a0 b0 c0 d0 | a1 b1 c1 d1
  // c0 c1 c2 c3 | d0 d1 d2 d3        a2 b2 c2 d2 | a3 b3 c3 d3
  const int16x8_t tmp0 = vzipq_s16(in0, in1).val[0] /* placeholder */, tmp1 = tmp0;  // (unused)
  (void)tmp1;
  {
    const int16x8x2_t tmp = vzipq_s16(in0, in1);   // a0 c0 a1 c1 a2 c2 ...
                                                   // b0 d0 b1 d1 b2 d2 ...
    *out = vzipq_s16(tmp.val[0], tmp.val[1]);
  }
}
// One pass (rows or columns) of the 4x4 inverse DCT. The two registers of
// *rows hold the four 4-wide vectors in0|in4 and in8|in12; the transformed
// and transposed result is written back into *rows, ready for the next pass.
static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
  // {rows} = in0 | in4
  //          in8 | in12
  // B1 = in4 | in12
  const int16x8_t B1 =
      vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
  // C0 = kC1 * in4 | kC1 * in12
  // C1 = kC2 * in4 | kC2 * in12
  // vsraq adds B1 back, emulating the (1 << 16) folded into kC1 by the C code.
  const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
  const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
  const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
                                vget_low_s16(rows->val[1]));   // in0 + in8
  const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
                                vget_low_s16(rows->val[1]));   // in0 - in8
  // c = kC2 * in4 - kC1 * in12
  // d = kC1 * in4 + kC2 * in12
  const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
  const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
  const int16x8_t D0 = vcombine_s16(a, b);      // D0 = a | b
  const int16x8_t D1 = vcombine_s16(d, c);      // D1 = d | c
  const int16x8_t E0 = vqaddq_s16(D0, D1);      // a+d | b+c
  const int16x8_t E_tmp = vqsubq_s16(D0, D1);   // a-d | b-c
  // Swap halves so the rows come out in natural order: b-c | a-d.
  const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
  Transpose8x2_NEON(E0, E1, rows);
}
// Full 4x4 inverse DCT of one coefficient block: two passes (rows then
// columns, each pass transposes) followed by add-and-clamp into 'dst'.
static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
  int16x8x2_t rows;
  INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
  TransformPass_NEON(&rows);   // pass one (rows)
  TransformPass_NEON(&rows);   // pass two (columns)
  Add4x4_NEON(rows.val[0], rows.val[1], dst);
}
1058 static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
1059 const int kBPS = BPS;
1060 // kC1, kC2. Padded because vld1.16 loads 8 bytes
1061 const int16_t constants[4] = { kC1, kC2, 0, 0 };
1062 /* Adapted from libvpx: vp8/common/arm/neon/shortidct4x4llm_neon.asm */
1064 "vld1.16 {q1, q2}, [%[in]] \n"
1065 "vld1.16 {d0}, [%[constants]] \n"
1074 /* q8 = {in[4], in[12]} * kC1 * 2 >> 16
1075 * q9 = {in[4], in[12]} * kC2 >> 16
1077 "vqdmulh.s16 q8, q2, d0[0] \n"
1078 "vqdmulh.s16 q9, q2, d0[1] \n"
1080 /* d22 = a = in[0] + in[8]
1081 * d23 = b = in[0] - in[8]
1083 "vqadd.s16 d22, d2, d3 \n"
1084 "vqsub.s16 d23, d2, d3 \n"
1086 /* The multiplication should be x * kC1 >> 16
1087 * However, with vqdmulh we get x * kC1 * 2 >> 16
1088 * (multiply, double, return high half)
1089 * We avoided this in kC2 by pre-shifting the constant.
1090 * q8 = in[4]/[12] * kC1 >> 16
1092 "vshr.s16 q8, q8, #1 \n"
1094 /* Add {in[4], in[12]} back after the multiplication. This is handled by
1095 * adding 1 << 16 to kC1 in the libwebp C code.
1097 "vqadd.s16 q8, q2, q8 \n"
1099 /* d20 = c = in[4]*kC2 - in[12]*kC1
1100 * d21 = d = in[4]*kC1 + in[12]*kC2
1102 "vqsub.s16 d20, d18, d17 \n"
1103 "vqadd.s16 d21, d19, d16 \n"
1105 /* d2 = tmp[0] = a + d
1106 * d3 = tmp[1] = b + c
1107 * d4 = tmp[2] = b - c
1108 * d5 = tmp[3] = a - d
1110 "vqadd.s16 d2, d22, d21 \n"
1111 "vqadd.s16 d3, d23, d20 \n"
1112 "vqsub.s16 d4, d23, d20 \n"
1113 "vqsub.s16 d5, d22, d21 \n"
1120 /* q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
1121 * q9 = {tmp[4], tmp[12]} * kC2 >> 16
1123 "vqdmulh.s16 q8, q2, d0[0] \n"
1124 "vqdmulh.s16 q9, q2, d0[1] \n"
1126 /* d22 = a = tmp[0] + tmp[8]
1127 * d23 = b = tmp[0] - tmp[8]
1129 "vqadd.s16 d22, d2, d3 \n"
1130 "vqsub.s16 d23, d2, d3 \n"
1132 /* See long winded explanations prior */
1133 "vshr.s16 q8, q8, #1 \n"
1134 "vqadd.s16 q8, q2, q8 \n"
1136 /* d20 = c = in[4]*kC2 - in[12]*kC1
1137 * d21 = d = in[4]*kC1 + in[12]*kC2
1139 "vqsub.s16 d20, d18, d17 \n"
1140 "vqadd.s16 d21, d19, d16 \n"
1142 /* d2 = tmp[0] = a + d
1143 * d3 = tmp[1] = b + c
1144 * d4 = tmp[2] = b - c
1145 * d5 = tmp[3] = a - d
1147 "vqadd.s16 d2, d22, d21 \n"
1148 "vqadd.s16 d3, d23, d20 \n"
1149 "vqsub.s16 d4, d23, d20 \n"
1150 "vqsub.s16 d5, d22, d21 \n"
1152 "vld1.32 d6[0], [%[dst]], %[kBPS] \n"
1153 "vld1.32 d6[1], [%[dst]], %[kBPS] \n"
1154 "vld1.32 d7[0], [%[dst]], %[kBPS] \n"
1155 "vld1.32 d7[1], [%[dst]], %[kBPS] \n"
1157 "sub %[dst], %[dst], %[kBPS], lsl #2 \n"
1159 /* (val) + 4 >> 3 */
1160 "vrshr.s16 d2, d2, #3 \n"
1161 "vrshr.s16 d3, d3, #3 \n"
1162 "vrshr.s16 d4, d4, #3 \n"
1163 "vrshr.s16 d5, d5, #3 \n"
1168 /* Must accumulate before saturating */
1169 "vmovl.u8 q8, d6 \n"
1170 "vmovl.u8 q9, d7 \n"
1172 "vqadd.s16 q1, q1, q8 \n"
1173 "vqadd.s16 q2, q2, q9 \n"
1175 "vqmovun.s16 d0, q1 \n"
1176 "vqmovun.s16 d1, q2 \n"
1178 "vst1.32 d0[0], [%[dst]], %[kBPS] \n"
1179 "vst1.32 d0[1], [%[dst]], %[kBPS] \n"
1180 "vst1.32 d1[0], [%[dst]], %[kBPS] \n"
1181 "vst1.32 d1[1], [%[dst]] \n"
1183 : [in] "+r"(in), [dst] "+r"(dst) /* modified registers */
1184 : [kBPS] "r"(kBPS), [constants] "r"(constants) /* constants */
1185 : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" /* clobbered */
1189 #endif // WEBP_USE_INTRINSICS
// Inverse transform of one or two horizontally adjacent 4x4 blocks.
// 'do_two' selects whether the second block (coefficients in[16..31],
// written four pixels to the right) is also processed.
// Fix: in this copy 'do_two' was ignored and the second transform ran
// unconditionally; the conditional is restored.
static void TransformTwo_NEON(const int16_t* in, uint8_t* dst, int do_two) {
  TransformOne_NEON(in, dst);
  if (do_two) {
    TransformOne_NEON(in + 16, dst + 4);
  }
}
// DC-only inverse transform: every output of the 4x4 block is the single
// DC coefficient in[0], added (with clamping) to the destination pixels.
static void TransformDC_NEON(const int16_t* in, uint8_t* dst) {
  const int16x8_t dc = vdupq_n_s16(in[0]);  // splat the DC value
  Add4x4_NEON(dc, dc, dst);
}
1203 //------------------------------------------------------------------------------
// Scatters column 'col' of the four row vectors into 'dst' with a stride of
// 16 coefficients (one value per 4x4 sub-block), advancing 'dst' as it goes.
#define STORE_WHT(dst, col, rows) do {                  \
  *dst = vgetq_lane_s32(rows.val[0], col); (dst) += 16; \
  *dst = vgetq_lane_s32(rows.val[1], col); (dst) += 16; \
  *dst = vgetq_lane_s32(rows.val[2], col); (dst) += 16; \
  *dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16; \
} while (0)
// Inverse Walsh-Hadamard transform of the 16 DC coefficients: a horizontal
// butterfly pass, a transpose, a vertical pass with rounding, then the
// results are scattered to out[0], out[16], ... via STORE_WHT.
static void TransformWHT_NEON(const int16_t* in, int16_t* out) {
  int32x4x4_t tmp;

  {
    // horizontal pass
    const int16x4_t in00_03 = vld1_s16(in + 0);
    const int16x4_t in04_07 = vld1_s16(in + 4);
    const int16x4_t in08_11 = vld1_s16(in + 8);
    const int16x4_t in12_15 = vld1_s16(in + 12);
    const int32x4_t a0 = vaddl_s16(in00_03, in12_15);  // in[0..3] + in[12..15]
    const int32x4_t a1 = vaddl_s16(in04_07, in08_11);  // in[4..7] + in[8..11]
    const int32x4_t a2 = vsubl_s16(in04_07, in08_11);  // in[4..7] - in[8..11]
    const int32x4_t a3 = vsubl_s16(in00_03, in12_15);  // in[0..3] - in[12..15]
    tmp.val[0] = vaddq_s32(a0, a1);
    tmp.val[1] = vaddq_s32(a3, a2);
    tmp.val[2] = vsubq_s32(a0, a1);
    tmp.val[3] = vsubq_s32(a3, a2);
    // Arrange the temporary results column-wise.
    tmp = Transpose4x4_NEON(tmp);
  }

  {
    // vertical pass
    const int32x4_t kCst3 = vdupq_n_s32(3);
    const int32x4_t dc = vaddq_s32(tmp.val[0], kCst3);  // add rounder
    const int32x4_t a0 = vaddq_s32(dc, tmp.val[3]);
    const int32x4_t a1 = vaddq_s32(tmp.val[1], tmp.val[2]);
    const int32x4_t a2 = vsubq_s32(tmp.val[1], tmp.val[2]);
    const int32x4_t a3 = vsubq_s32(dc, tmp.val[3]);

    tmp.val[0] = vaddq_s32(a0, a1);
    tmp.val[1] = vaddq_s32(a3, a2);
    tmp.val[2] = vsubq_s32(a0, a1);
    tmp.val[3] = vsubq_s32(a3, a2);

    // right shift the results by 3.
    tmp.val[0] = vshrq_n_s32(tmp.val[0], 3);
    tmp.val[1] = vshrq_n_s32(tmp.val[1], 3);
    tmp.val[2] = vshrq_n_s32(tmp.val[2], 3);
    tmp.val[3] = vshrq_n_s32(tmp.val[3], 3);

    STORE_WHT(out, 0, tmp);
    STORE_WHT(out, 1, tmp);
    STORE_WHT(out, 2, tmp);
    STORE_WHT(out, 3, tmp);
  }
}
1261 //------------------------------------------------------------------------------
// Fixed-point multiply: high 16 bits of a * b.
#define MUL(a, b) (((a) * (b)) >> 16)
// Specialized inverse transform for blocks where only DC, in[1] and in[4]
// are non-zero: the 4x4 result is built directly from those three values.
static void TransformAC3_NEON(const int16_t* in, uint8_t* dst) {
  static const int kC1_full = 20091 + (1 << 16);  // kC1 with the +1<<16 trick
  static const int kC2_full = 35468;
  const int16x4_t A = vld1_dup_s16(in);                  // DC, splatted
  const int16x4_t c4 = vdup_n_s16(MUL(in[4], kC2_full));
  const int16x4_t d4 = vdup_n_s16(MUL(in[4], kC1_full));
  const int c1 = MUL(in[1], kC2_full);
  const int d1 = MUL(in[1], kC1_full);
  // Pack the per-column contribution {d1, c1, -c1, -d1} into one vector.
  const uint64_t cd = (uint64_t)( d1 & 0xffff) << 0  |
                      (uint64_t)( c1 & 0xffff) << 16 |
                      (uint64_t)(-c1 & 0xffff) << 32 |
                      (uint64_t)(-d1 & 0xffff) << 48;
  const int16x4_t CD = vcreate_s16(cd);
  const int16x4_t B = vqadd_s16(A, CD);    // DC + column contribution
  // Rows 0..3: B + d4, B + c4, B - c4, B - d4.
  const int16x8_t m0_m1 = vcombine_s16(vqadd_s16(B, d4), vqadd_s16(B, c4));
  const int16x8_t m2_m3 = vcombine_s16(vqsub_s16(B, c4), vqsub_s16(B, d4));
  Add4x4_NEON(m0_m1, m2_m3, dst);
}
1284 //------------------------------------------------------------------------------
// 4x4 DC prediction: fill the block with the rounded average of the four
// top and four left border pixels.
static void DC4_NEON(uint8_t* dst) {   // DC
  const uint8x8_t A = vld1_u8(dst - BPS);  // top row
  const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
  const uint16x4_t p1 = vpadd_u16(p0, p0);
  const uint8x8_t L0 = vld1_u8(dst + 0 * BPS - 1);  // left pixels (lane 0)
  const uint8x8_t L1 = vld1_u8(dst + 1 * BPS - 1);
  const uint8x8_t L2 = vld1_u8(dst + 2 * BPS - 1);
  const uint8x8_t L3 = vld1_u8(dst + 3 * BPS - 1);
  const uint16x8_t s0 = vaddl_u8(L0, L1);
  const uint16x8_t s1 = vaddl_u8(L2, L3);
  const uint16x8_t s01 = vaddq_u16(s0, s1);
  const uint16x8_t sum = vaddq_u16(s01, vcombine_u16(p1, p1));
  const uint8x8_t dc0 = vrshrn_n_u16(sum, 3);  // (sum + 4) >> 3
  const uint8x8_t dc = vdup_lane_u8(dc0, 0);
  int i;
  for (i = 0; i < 4; ++i) {
    vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc), 0);
  }
}
// TrueMotion (4x4 + 8x8)
// TrueMotion prediction: out[r][c] = clip(L[r] + A[c] - A[-1]), computed
// four rows at a time. 'size' is 4 or 8 (block width/height).
// Fix: in this copy both the 4-byte and 8-byte store paths ran
// unconditionally and 'dst' was never advanced; the 'if (size == 4)'
// selection and the 'dst += 4 * BPS' advance are restored.
static WEBP_INLINE void TrueMotion_NEON(uint8_t* dst, int size) {
  const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
  const uint8x8_t T = vld1_u8(dst - BPS);  // top row 'A[0..3]'
  const int16x8_t d = vreinterpretq_s16_u16(vsubl_u8(T, TL));  // A[c] - A[-1]
  int y;
  for (y = 0; y < size; y += 4) {
    // left edge, splatted per row
    const int16x8_t L0 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 0 * BPS - 1));
    const int16x8_t L1 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 1 * BPS - 1));
    const int16x8_t L2 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 2 * BPS - 1));
    const int16x8_t L3 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 3 * BPS - 1));
    const int16x8_t r0 = vaddq_s16(L0, d);  // L[r] + A[c] - A[-1]
    const int16x8_t r1 = vaddq_s16(L1, d);
    const int16x8_t r2 = vaddq_s16(L2, d);
    const int16x8_t r3 = vaddq_s16(L3, d);
    // Saturate and store the result.
    const uint32x2_t r0_u32 = vreinterpret_u32_u8(vqmovun_s16(r0));
    const uint32x2_t r1_u32 = vreinterpret_u32_u8(vqmovun_s16(r1));
    const uint32x2_t r2_u32 = vreinterpret_u32_u8(vqmovun_s16(r2));
    const uint32x2_t r3_u32 = vreinterpret_u32_u8(vqmovun_s16(r3));
    if (size == 4) {
      // 4-wide block: store the low 4 bytes of each row.
      vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0_u32, 0);
      vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1_u32, 0);
      vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2_u32, 0);
      vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3_u32, 0);
    } else {
      // 8-wide block: store full 8-byte rows.
      vst1_u32((uint32_t*)(dst + 0 * BPS), r0_u32);
      vst1_u32((uint32_t*)(dst + 1 * BPS), r1_u32);
      vst1_u32((uint32_t*)(dst + 2 * BPS), r2_u32);
      vst1_u32((uint32_t*)(dst + 3 * BPS), r3_u32);
    }
    dst += 4 * BPS;
  }
}
static void TM4_NEON(uint8_t* dst) { TrueMotion_NEON(dst, 4); }  // TM 4x4
// 4x4 vertical prediction: each row is the 3-tap smoothed top row
// (A + 2*B + C + 2) >> 2.
static void VE4_NEON(uint8_t* dst) {    // vertical
  // NB: avoid vld1_u64 here as an alignment hint may be added -> SIGBUS.
  const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(dst - BPS - 1));  // top row
  const uint64x1_t A1 = vshr_n_u64(A0, 8);    // shifted by one pixel
  const uint64x1_t A2 = vshr_n_u64(A0, 16);   // shifted by two pixels
  const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);
  const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
  const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
  const uint8x8_t b = vhadd_u8(ABCDEFGH, CDEFGH00);
  const uint8x8_t avg = vrhadd_u8(b, BCDEFGH0);   // rounded 3-tap average
  int i;
  for (i = 0; i < 4; ++i) {
    vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(avg), 0);
  }
}
// 4x4 down-right diagonal prediction. The border pixels L,K,J,I (left,
// bottom to top), X (top-left) and A..D (top) are packed into one vector,
// 3-tap smoothed, and the rows are successive byte-shifts of the result.
static void RD4_NEON(uint8_t* dst) {   // Down-right
  const uint8x8_t XABCD_u8 = vld1_u8(dst - BPS - 1);
  const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
  const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
  const uint32_t I = dst[-1 + 0 * BPS];
  const uint32_t J = dst[-1 + 1 * BPS];
  const uint32_t K = dst[-1 + 2 * BPS];
  const uint32_t L = dst[-1 + 3 * BPS];
  const uint64x1_t LKJI____ =
      vcreate_u64((uint64_t)L | (K << 8) | (J << 16) | (I << 24));
  const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
  const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
  const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
  const uint8_t D = vget_lane_u8(XABCD_u8, 4);
  const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);  // re-insert 'D'
  const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
  const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
  const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);  // rounded 3-tap average
  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
  // Each row is the smoothed vector shifted right by one more byte.
  const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
  const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
}
// 4x4 down-left diagonal prediction: 3-tap smoothing of the top row
// (with the last pixel repeated), rows are successive byte-shifts.
static void LD4_NEON(uint8_t* dst) {   // Down-left
  // Note using the same shift trick as VE4() is slower here.
  const uint8x8_t ABCDEFGH = vld1_u8(dst - BPS + 0);
  const uint8x8_t BCDEFGH0 = vld1_u8(dst - BPS + 1);
  const uint8x8_t CDEFGH00 = vld1_u8(dst - BPS + 2);
  // Repeat the last top pixel 'H' at position 6 (edge extension).
  const uint8x8_t CDEFGHH0 = vset_lane_u8(dst[-BPS + 7], CDEFGH00, 6);
  const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGHH0);
  const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);  // rounded 3-tap average
  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
  const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
  const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
}
1409 //------------------------------------------------------------------------------
// Chroma 8x8 vertical prediction: replicate the row above into all 8 rows.
static void VE8uv_NEON(uint8_t* dst) {    // vertical
  const uint8x8_t top = vld1_u8(dst - BPS);
  uint8_t* row = dst;
  int n = 8;
  while (n-- > 0) {
    vst1_u8(row, top);
    row += BPS;
  }
}
// Chroma 8x8 horizontal prediction: fill each row with its left pixel.
static void HE8uv_NEON(uint8_t* dst) {    // horizontal
  int row;
  for (row = 0; row < 8; ++row, dst += BPS) {
    vst1_u8(dst, vld1_dup_u8(dst - 1));
  }
}
// Chroma 8x8 DC prediction. 'do_top'/'do_left' say which borders are
// available; the fill value is the rounded average of the available border
// pixels, or 0x80 when neither border exists.
static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_top) {
    const uint8x8_t A = vld1_u8(dst - BPS);  // top row
#if WEBP_AARCH64  // NOTE(review): reconstructed guard -- confirm macro name
    const uint16_t p2 = vaddlv_u8(A);  // single-instruction horizontal sum
    sum_top = vdupq_n_u16(p2);
#else
    const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
    const uint16x4_t p1 = vpadd_u16(p0, p0);
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    sum_top = vcombine_u16(p2, p2);
#endif
  }

  if (do_left) {
    // Sum the 8 left-border pixels (lane 0 of each row load).
    const uint8x8_t L0 = vld1_u8(dst + 0 * BPS - 1);
    const uint8x8_t L1 = vld1_u8(dst + 1 * BPS - 1);
    const uint8x8_t L2 = vld1_u8(dst + 2 * BPS - 1);
    const uint8x8_t L3 = vld1_u8(dst + 3 * BPS - 1);
    const uint8x8_t L4 = vld1_u8(dst + 4 * BPS - 1);
    const uint8x8_t L5 = vld1_u8(dst + 5 * BPS - 1);
    const uint8x8_t L6 = vld1_u8(dst + 6 * BPS - 1);
    const uint8x8_t L7 = vld1_u8(dst + 7 * BPS - 1);
    const uint16x8_t s0 = vaddl_u8(L0, L1);
    const uint16x8_t s1 = vaddl_u8(L2, L3);
    const uint16x8_t s2 = vaddl_u8(L4, L5);
    const uint16x8_t s3 = vaddl_u8(L6, L7);
    const uint16x8_t s01 = vaddq_u16(s0, s1);
    const uint16x8_t s23 = vaddq_u16(s2, s3);
    sum_left = vaddq_u16(s01, s23);
  }

  if (do_top && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 4);   // (sum + 8) >> 4
  } else if (do_top) {
    dc0 = vrshrn_n_u16(sum_top, 3);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 3);
  } else {
    dc0 = vdup_n_u8(0x80);        // no border available
  }

  {
    const uint8x8_t dc = vdup_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 8; ++i) {
      vst1_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc));
    }
  }
}
// Chroma DC variants, by border availability (top, left).
static void DC8uv_NEON(uint8_t* dst) { DC8_NEON(dst, 1, 1); }
static void DC8uvNoTop_NEON(uint8_t* dst) { DC8_NEON(dst, 0, 1); }
static void DC8uvNoLeft_NEON(uint8_t* dst) { DC8_NEON(dst, 1, 0); }
static void DC8uvNoTopLeft_NEON(uint8_t* dst) { DC8_NEON(dst, 0, 0); }
static void TM8uv_NEON(uint8_t* dst) { TrueMotion_NEON(dst, 8); }  // TM 8x8
1492 //------------------------------------------------------------------------------
// Luma 16x16 vertical prediction: replicate the row above into all 16 rows.
static void VE16_NEON(uint8_t* dst) {     // vertical
  const uint8x16_t top = vld1q_u8(dst - BPS);
  uint8_t* row = dst;
  int n;
  for (n = 0; n < 16; ++n, row += BPS) {
    vst1q_u8(row, top);
  }
}
// Luma 16x16 horizontal prediction: fill each row with its left pixel.
static void HE16_NEON(uint8_t* dst) {     // horizontal
  int row;
  for (row = 0; row < 16; ++row, dst += BPS) {
    vst1q_u8(dst, vld1q_dup_u8(dst - 1));
  }
}
// Luma 16x16 DC prediction. 'do_top'/'do_left' say which borders are
// available; the fill value is the rounded average of the available border
// pixels, or 0x80 when neither border exists.
static WEBP_INLINE void DC16_NEON(uint8_t* dst, int do_top, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_top) {
    const uint8x16_t A = vld1q_u8(dst - BPS);  // top row
#if WEBP_AARCH64  // NOTE(review): reconstructed guard -- confirm macro name
    const uint16_t p3 = vaddlvq_u8(A);  // single-instruction horizontal sum
    sum_top = vdupq_n_u16(p3);
#else
    const uint16x8_t p0 = vpaddlq_u8(A);  // cascading summation of the top
    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    const uint16x4_t p3 = vpadd_u16(p2, p2);
    sum_top = vcombine_u16(p3, p3);
#endif
  }

  if (do_left) {
    int i;
    sum_left = vdupq_n_u16(0);
    // Sum the 16 left-border pixels, 8 rows at a time.
    for (i = 0; i < 16; i += 8) {
      const uint8x8_t L0 = vld1_u8(dst + (i + 0) * BPS - 1);
      const uint8x8_t L1 = vld1_u8(dst + (i + 1) * BPS - 1);
      const uint8x8_t L2 = vld1_u8(dst + (i + 2) * BPS - 1);
      const uint8x8_t L3 = vld1_u8(dst + (i + 3) * BPS - 1);
      const uint8x8_t L4 = vld1_u8(dst + (i + 4) * BPS - 1);
      const uint8x8_t L5 = vld1_u8(dst + (i + 5) * BPS - 1);
      const uint8x8_t L6 = vld1_u8(dst + (i + 6) * BPS - 1);
      const uint8x8_t L7 = vld1_u8(dst + (i + 7) * BPS - 1);
      const uint16x8_t s0 = vaddl_u8(L0, L1);
      const uint16x8_t s1 = vaddl_u8(L2, L3);
      const uint16x8_t s2 = vaddl_u8(L4, L5);
      const uint16x8_t s3 = vaddl_u8(L6, L7);
      const uint16x8_t s01 = vaddq_u16(s0, s1);
      const uint16x8_t s23 = vaddq_u16(s2, s3);
      const uint16x8_t sum = vaddq_u16(s01, s23);
      sum_left = vaddq_u16(sum_left, sum);
    }
  }

  if (do_top && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 5);   // (sum + 16) >> 5
  } else if (do_top) {
    dc0 = vrshrn_n_u16(sum_top, 4);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 4);
  } else {
    dc0 = vdup_n_u8(0x80);        // no border available
  }

  {
    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 16; ++i) {
      vst1q_u8(dst + i * BPS, dc);
    }
  }
}
// Luma DC variants, by border availability (top, left).
static void DC16TopLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 1, 1); }
static void DC16NoTop_NEON(uint8_t* dst) { DC16_NEON(dst, 0, 1); }
static void DC16NoLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 1, 0); }
static void DC16NoTopLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 0, 0); }
// Luma 16x16 TrueMotion prediction: out[r][c] = clip(L[r] + A[c] - A[-1]),
// processed four rows at a time with the row split into low/high halves.
static void TM16_NEON(uint8_t* dst) {
  const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
  const uint8x16_t T = vld1q_u8(dst - BPS);  // top row 'A[0..15]'
  // A[c] - A[-1], for the low and high 8 columns.
  const int16x8_t d_lo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(T), TL));
  const int16x8_t d_hi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(T), TL));
  int y;
  for (y = 0; y < 16; y += 4) {
    // left edge, splatted per row
    const int16x8_t L0 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 0 * BPS - 1));
    const int16x8_t L1 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 1 * BPS - 1));
    const int16x8_t L2 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 2 * BPS - 1));
    const int16x8_t L3 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 3 * BPS - 1));
    const int16x8_t r0_lo = vaddq_s16(L0, d_lo);  // L[r] + A[c] - A[-1]
    const int16x8_t r1_lo = vaddq_s16(L1, d_lo);
    const int16x8_t r2_lo = vaddq_s16(L2, d_lo);
    const int16x8_t r3_lo = vaddq_s16(L3, d_lo);
    const int16x8_t r0_hi = vaddq_s16(L0, d_hi);
    const int16x8_t r1_hi = vaddq_s16(L1, d_hi);
    const int16x8_t r2_hi = vaddq_s16(L2, d_hi);
    const int16x8_t r3_hi = vaddq_s16(L3, d_hi);
    // Saturate and store the result.
    const uint8x16_t row0 = vcombine_u8(vqmovun_s16(r0_lo), vqmovun_s16(r0_hi));
    const uint8x16_t row1 = vcombine_u8(vqmovun_s16(r1_lo), vqmovun_s16(r1_hi));
    const uint8x16_t row2 = vcombine_u8(vqmovun_s16(r2_lo), vqmovun_s16(r2_hi));
    const uint8x16_t row3 = vcombine_u8(vqmovun_s16(r3_lo), vqmovun_s16(r3_hi));
    vst1q_u8(dst + 0 * BPS, row0);
    vst1q_u8(dst + 1 * BPS, row1);
    vst1q_u8(dst + 2 * BPS, row2);
    vst1q_u8(dst + 3 * BPS, row3);
    dst += 4 * BPS;
  }
}
1613 //------------------------------------------------------------------------------
1616 extern void VP8DspInitNEON(void);
1618 WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitNEON(void) {
1619 VP8Transform = TransformTwo_NEON;
1620 VP8TransformAC3 = TransformAC3_NEON;
1621 VP8TransformDC = TransformDC_NEON;
1622 VP8TransformWHT = TransformWHT_NEON;
1624 VP8VFilter16 = VFilter16_NEON;
1625 VP8VFilter16i = VFilter16i_NEON;
1626 VP8HFilter16 = HFilter16_NEON;
1627 #if !defined(WORK_AROUND_GCC)
1628 VP8HFilter16i = HFilter16i_NEON;
1630 VP8VFilter8 = VFilter8_NEON;
1631 VP8VFilter8i = VFilter8i_NEON;
1632 #if !defined(WORK_AROUND_GCC)
1633 VP8HFilter8 = HFilter8_NEON;
1634 VP8HFilter8i = HFilter8i_NEON;
1636 VP8SimpleVFilter16 = SimpleVFilter16_NEON;
1637 VP8SimpleHFilter16 = SimpleHFilter16_NEON;
1638 VP8SimpleVFilter16i = SimpleVFilter16i_NEON;
1639 VP8SimpleHFilter16i = SimpleHFilter16i_NEON;
1641 VP8PredLuma4[0] = DC4_NEON;
1642 VP8PredLuma4[1] = TM4_NEON;
1643 VP8PredLuma4[2] = VE4_NEON;
1644 VP8PredLuma4[4] = RD4_NEON;
1645 VP8PredLuma4[6] = LD4_NEON;
1647 VP8PredLuma16[0] = DC16TopLeft_NEON;
1648 VP8PredLuma16[1] = TM16_NEON;
1649 VP8PredLuma16[2] = VE16_NEON;
1650 VP8PredLuma16[3] = HE16_NEON;
1651 VP8PredLuma16[4] = DC16NoTop_NEON;
1652 VP8PredLuma16[5] = DC16NoLeft_NEON;
1653 VP8PredLuma16[6] = DC16NoTopLeft_NEON;
1655 VP8PredChroma8[0] = DC8uv_NEON;
1656 VP8PredChroma8[1] = TM8uv_NEON;
1657 VP8PredChroma8[2] = VE8uv_NEON;
1658 VP8PredChroma8[3] = HE8uv_NEON;
1659 VP8PredChroma8[4] = DC8uvNoTop_NEON;
1660 VP8PredChroma8[5] = DC8uvNoLeft_NEON;
1661 VP8PredChroma8[6] = DC8uvNoTopLeft_NEON;
1664 #else // !WEBP_USE_NEON
1666 WEBP_DSP_INIT_STUB(VP8DspInitNEON)
1668 #endif // WEBP_USE_NEON