2 * Copyright 2014 The LibYuv Project Authors. All rights reserved.
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
11 #include "libyuv/row.h"
18 // This module is for GCC Neon.
19 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
20 #ifdef HAS_SCALEROWDOWN2_NEON
21 // Read 32x1 throw away even pixels, and write 16x1.
// Point-samples one row down to half width.
//   src_ptr    - source row; advanced 32 bytes per iteration by the
//                post-incrementing load.
//   src_stride - not referenced by any line visible in this excerpt.
//   dst        - destination row; advanced 16 bytes per iteration.
//   dst_width  - output pixels remaining; reduced by 16 per iteration.
// NOTE(review): the mnemonics below (vld2.8, q registers) are 32-bit ARM
// NEON syntax although the enclosing guard is __aarch64__ - confirm that
// HAS_SCALEROWDOWN2_NEON is only defined where this actually assembles.
22 void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
23 uint8* dst, int dst_width) {
27 // load even pixels into q0, odd into q1
// vld2.8 de-interleaves: bytes 0,2,4,.. land in q0 and 1,3,5,.. in q1.
29 "vld2.8 {q0, q1}, [%0]! \n"
30 "subs %2, %2, #16 \n" // 16 processed per loop
32 "vst1.8 {q1}, [%1]! \n" // store odd pixels
34 : "+r"(src_ptr), // %0
// NOTE(review): the asm stores through %1 and `subs` updates the condition
// flags, but only q0/q1 are listed here; other routines in this file also
// list "memory" and "cc" - consider adding them.
38 : "q0", "q1" // Clobber List
41 #endif //HAS_SCALEROWDOWN2_NEON
43 #ifdef HAS_SCALEROWDOWN2_NEON
44 // Read 32x2 average down and write 16x1.
// Box-filters two adjacent rows down to half width: every output byte is
// the rounded average of a 2x2 block of source bytes.
//   src_ptr    - first source row (%0).
//   src_stride - bound to %1 and used as the second-row address; the
//                instruction that turns the stride into that address is not
//                visible in this excerpt.
//   dst        - destination row (stored through %2).
//   dst_width  - output pixels remaining; reduced by 16 per iteration.
45 void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
46 uint8* dst, int dst_width) {
48 // change the stride to row 2 pointer
53 "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
55 "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc
56 "subs %3, %3, #16 \n" // 16 processed per loop
// vpaddl widens to 16 bits while summing horizontally adjacent byte pairs.
57 "vpaddl.u8 q0, q0 \n" // row 1 add adjacent
// vpadal does the same pairwise sum on row 2 and accumulates it onto row 1.
// (The matching q1/q3 steps are not visible in this excerpt.)
59 "vpadal.u8 q0, q2 \n" // row 2 add adjacent + row1
// Each 16-bit lane now holds a sum of 4 bytes; >> 2 with rounding averages.
61 "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
62 "vrshrn.u16 d1, q1, #2 \n"
64 "vst1.8 {q0}, [%2]! \n"
66 : "+r"(src_ptr), // %0
67 "+r"(src_stride), // %1
// NOTE(review): the asm writes memory and sets flags (`subs`), yet neither
// "memory" nor "cc" is listed here - consider adding them.
71 : "q0", "q1", "q2", "q3" // Clobber List
74 #endif //HAS_SCALEROWDOWN2_NEON
76 #ifdef HAS_SCALEROWDOWN4_NEON
// Point-samples one row down to quarter width: of every 4 source bytes,
// only the one at index 2 is kept.
//   src_ptr    - source row; advanced 32 bytes per iteration.
//   src_stride - not referenced by any line visible in this excerpt.
//   dst_ptr    - destination row; advanced 8 bytes per iteration.
//   dst_width  - output pixels remaining; reduced by 8 per iteration.
77 void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
78 uint8* dst_ptr, int dst_width) {
// vld4.8 de-interleaves by 4: d0 = bytes 0,4,8,.., d1 = 1,5,9,..,
// d2 = 2,6,10,.., d3 = 3,7,11,..
83 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
84 "subs %2, %2, #8 \n" // 8 processed per loop
// Only the d2 plane is written out.
86 "vst1.8 {d2}, [%1]! \n"
88 : "+r"(src_ptr), // %0
92 : "q0", "q1", "memory", "cc"
95 #endif //HAS_SCALEROWDOWN4_NEON
97 #ifdef HAS_SCALEROWDOWN4_NEON
// Box-filters four adjacent rows down to quarter width: every output byte
// is the rounded average of a 4x4 block of source bytes.
//   src_ptr    - first source row (%0).
//   src_stride - byte distance between rows; used to derive the pointers to
//                rows 1..3 below.
//   dst_ptr    - destination row (stored through %1); 4 bytes per iteration.
//   dst_width  - output pixels remaining (%2); the decrement is not visible
//                in this excerpt.
98 void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
99 uint8* dst_ptr, int dst_width) {
// Pointers to the three rows below src_ptr.
100 const uint8* src_ptr1 = src_ptr + src_stride;
101 const uint8* src_ptr2 = src_ptr + src_stride * 2;
102 const uint8* src_ptr3 = src_ptr + src_stride * 3;
107 "vld1.8 {q0}, [%0]! \n" // load up 16x4
109 "vld1.8 {q1}, [%3]! \n"
111 "vld1.8 {q2}, [%4]! \n"
113 "vld1.8 {q3}, [%5]! \n"
// Sum horizontally adjacent byte pairs of row 0 into 16-bit lanes, then
// accumulate the pairwise sums of rows 1..3 on top.
115 "vpaddl.u8 q0, q0 \n"
116 "vpadal.u8 q0, q1 \n"
117 "vpadal.u8 q0, q2 \n"
118 "vpadal.u8 q0, q3 \n"
// Pairwise-add again: each 32-bit lane is now the sum of one 4x4 block.
119 "vpaddl.u16 q0, q0 \n"
120 "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
// Narrow the 16-bit results to bytes; the low 4 bytes of d0 are the output.
121 "vmovn.u16 d0, q0 \n"
// Store those 4 bytes as a single 32-bit lane.
123 "vst1.32 {d0[0]}, [%1]! \n"
125 : "+r"(src_ptr), // %0
127 "+r"(dst_width), // %2
128 "+r"(src_ptr1), // %3
129 "+r"(src_ptr2), // %4
132 : "q0", "q1", "q2", "q3", "memory", "cc"
135 #endif //HAS_SCALEROWDOWN4_NEON
137 #ifdef HAS_SCALEROWDOWN34_NEON
138 // Down scale from 4 to 3 pixels. Use the neon multilane read/write
139 // to load every 4th pixel into 4 different registers.
140 // Point samples 32 pixels to 24 pixels.
//   src_ptr    - source row; advanced 32 bytes per iteration.
//   src_stride - not referenced by any line visible in this excerpt.
//   dst_ptr    - destination row; advanced 24 bytes per iteration.
//   dst_width  - output pixels remaining; reduced by 24 per iteration.
141 void ScaleRowDown34_NEON(const uint8* src_ptr,
142 ptrdiff_t src_stride,
143 uint8* dst_ptr, int dst_width) {
// De-interleave by 4: d0..d3 hold source bytes at index 0..3 of each group.
148 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
149 "subs %2, %2, #24 \n"
// Overwrite the index-2 plane with the index-3 plane, so each group of 4
// source bytes contributes bytes 0, 1 and 3.
150 "vmov d2, d3 \n" // order d0, d1, d2
// vst3.8 re-interleaves the three planes into 24 output bytes.
152 "vst3.8 {d0, d1, d2}, [%1]! \n"
154 : "+r"(src_ptr), // %0
156 "+r"(dst_width) // %2
158 : "d0", "d1", "d2", "d3", "memory", "cc"
161 #endif //HAS_SCALEROWDOWN34_NEON
163 #ifdef HAS_SCALEROWDOWN34_NEON
// 4 -> 3 down scale with filtering across two rows. Per the comments below,
// the rows are first blended as (3 * line_0 + line_1) >> 2, then each group
// of 4 blended bytes is reduced to 3 output bytes.
//   src_ptr    - first source row (%0).
//   src_stride - bound to %3 and used as the second-row address; the
//                instruction that converts it is not visible in this excerpt.
//   dst_ptr    - destination row (stored through %1); 24 bytes per iteration.
//   dst_width  - output pixels remaining (%2); reduced by 24 per iteration.
// d24 is the multiplier for every vmlal below; presumably it holds the
// constant 3 (its setup is not visible here - confirm).
164 void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
165 ptrdiff_t src_stride,
166 uint8* dst_ptr, int dst_width) {
173 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
175 "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
176 "subs %2, %2, #24 \n"
178 // filter src line 0 with src line 1
179 // expand chars to shorts to allow for room
180 // when adding lines together
// (The q8/q9 widening steps for d4/d5 are not visible in this excerpt.)
183 "vmovl.u8 q10, d6 \n"
184 "vmovl.u8 q11, d7 \n"
186 // 3 * line_0 + line_1
187 "vmlal.u8 q8, d0, d24 \n"
188 "vmlal.u8 q9, d1, d24 \n"
189 "vmlal.u8 q10, d2, d24 \n"
190 "vmlal.u8 q11, d3, d24 \n"
192 // (3 * line_0 + line_1) >> 2
// vqrshrn shifts right with rounding and saturates while narrowing to u8.
193 "vqrshrn.u16 d0, q8, #2 \n"
194 "vqrshrn.u16 d1, q9, #2 \n"
195 "vqrshrn.u16 d2, q10, #2 \n"
196 "vqrshrn.u16 d3, q11, #2 \n"
198 // a0 = (src[0] * 3 + s[1] * 1) >> 2
// (The step that seeds q8 before this accumulate is not visible here.)
200 "vmlal.u8 q8, d0, d24 \n"
201 "vqrshrn.u16 d0, q8, #2 \n"
203 // a1 = (src[1] * 1 + s[2] * 1) >> 1
// vrhadd is a rounding halving add: (a + b + 1) >> 1.
204 "vrhadd.u8 d1, d1, d2 \n"
206 // a2 = (src[2] * 1 + s[3] * 3) >> 2
208 "vmlal.u8 q8, d3, d24 \n"
209 "vqrshrn.u16 d2, q8, #2 \n"
// Interleave a0/a1/a2 into 24 output bytes.
212 "vst3.8 {d0, d1, d2}, [%1]! \n"
215 : "+r"(src_ptr), // %0
217 "+r"(dst_width), // %2
218 "+r"(src_stride) // %3
220 : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
223 #endif //HAS_SCALEROWDOWN34_NEON
225 #ifdef HAS_SCALEROWDOWN34_NEON
// 4 -> 3 down scale with filtering across two rows. The two rows are
// averaged with equal weight (vrhadd), then each group of 4 averaged bytes
// is reduced to 3 output bytes as described by the a0/a1/a2 comments below.
//   src_ptr    - first source row (%0).
//   src_stride - bound to %3 and used as the second-row address; the
//                instruction that converts it is not visible in this excerpt.
//   dst_ptr    - destination row (stored through %1); 24 bytes per iteration.
//   dst_width  - output pixels remaining (%2); reduced by 24 per iteration.
// d24 is the vmlal multiplier; presumably the constant 3 (setup not visible
// here - confirm).
226 void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
227 ptrdiff_t src_stride,
228 uint8* dst_ptr, int dst_width) {
235 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
237 "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
238 "subs %2, %2, #24 \n"
239 // average src line 0 with src line 1
240 "vrhadd.u8 q0, q0, q2 \n"
241 "vrhadd.u8 q1, q1, q3 \n"
243 // a0 = (src[0] * 3 + s[1] * 1) >> 2
// (The step that seeds q3 before this accumulate is not visible here.)
245 "vmlal.u8 q3, d0, d24 \n"
246 "vqrshrn.u16 d0, q3, #2 \n"
248 // a1 = (src[1] * 1 + s[2] * 1) >> 1
249 "vrhadd.u8 d1, d1, d2 \n"
251 // a2 = (src[2] * 1 + s[3] * 3) >> 2
253 "vmlal.u8 q3, d3, d24 \n"
254 "vqrshrn.u16 d2, q3, #2 \n"
// Interleave a0/a1/a2 into 24 output bytes.
257 "vst3.8 {d0, d1, d2}, [%1]! \n"
259 : "+r"(src_ptr), // %0
261 "+r"(dst_width), // %2
262 "+r"(src_stride) // %3
// NOTE(review): "r4" is listed as clobbered but no visible instruction
// references it - confirm whether it is still needed.
264 : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
267 #endif //HAS_SCALEROWDOWN34_NEON
269 #ifdef HAS_SCALEROWDOWN38_NEON
// NOTE(review): this #define sits inside an #ifdef of the same macro, so it
// can only take effect when the macro is already defined.
270 #define HAS_SCALEROWDOWN38_NEON
// Byte indices for the vtbl lookups in ScaleRowDown38_NEON: picks 12 of
// every 32 source bytes (3 out of each group of 8). The last 4 entries are
// padding.
271 static uvec8 kShuf38 =
272 { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
// Byte indices for the final vtbl lookups in the ScaleRowDown38_*_Box
// routines, applied to the register group {d0, d1, d2}. The last 4 entries
// are padding.
273 static uvec8 kShuf38_2 =
274 { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
// Fixed-point reciprocal used with vqrdmulh.s16. That instruction doubles
// the product before taking the high 16 bits, so 65536 / 12 yields a divide
// by 6.
275 static vec16 kMult38_Div6 =
276 { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
277 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
// Same scheme: 65536 / 18 with the doubling multiply yields a divide by 9.
278 static vec16 kMult38_Div9 =
279 { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
280 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
// Point-samples one row from 8 to 3 pixels: each iteration reads 32 source
// bytes and writes the 12 bytes selected by the kShuf38 index table.
//   src_ptr    - source row; advanced 32 bytes per iteration.
//   src_stride - not referenced by any line visible in this excerpt.
//   dst_ptr    - destination row; advanced 12 bytes per iteration (8 + 4).
//   dst_width  - output pixels remaining; reduced by 12 per iteration.
283 void ScaleRowDown38_NEON(const uint8* src_ptr,
284 ptrdiff_t src_stride,
285 uint8* dst_ptr, int dst_width) {
// Load the 16-byte shuffle table into q3 (d6 = low half, d7 = high half).
288 "vld1.8 {q3}, [%3] \n"
292 "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
293 "subs %2, %2, #12 \n"
// vtbl gathers bytes from the 32-byte table {d0..d3} using the indices in
// d6 / d7.
294 "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
295 "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
// Write 8 bytes, then the remaining 4 as one 32-bit lane.
297 "vst1.8 {d4}, [%1]! \n"
299 "vst1.32 {d5[0]}, [%1]! \n"
301 : "+r"(src_ptr), // %0
303 "+r"(dst_width) // %2
304 : "r"(&kShuf38) // %3
// NOTE(review): q3 (d6/d7) is written by the table load above but is not
// listed as clobbered - consider adding "d6", "d7".
305 : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
309 #endif //HAS_SCALEROWDOWN38_NEON
311 #ifdef HAS_SCALEROWDOWN38_NEON
// 8 -> 3 down scale with box filtering over three rows: reads 32 bytes from
// each of three rows and writes 12 bytes per iteration.
//   src_ptr    - first source row (%0).
//   src_stride - bound to %3 and used as the second-row address; the
//                instruction that converts it is not visible in this excerpt.
//   dst_ptr    - destination row (stored through %1).
//   dst_width  - output pixels remaining (%2); reduced by 12 per iteration.
// Constant registers: q13 = kMult38_Div6, q14 = kShuf38_2, q15 = kMult38_Div9.
// Many shuffle/add steps between the visible instructions are missing from
// this excerpt; the comments describe only what is shown.
313 void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
314 ptrdiff_t src_stride,
315 uint8* dst_ptr, int dst_width) {
// Third-row pointer; presumably passed as operand %4 (that operand line is
// not visible here).
316 const uint8* src_ptr1 = src_ptr + src_stride * 2;
320 "vld1.16 {q13}, [%5] \n"
322 "vld1.8 {q14}, [%6] \n"
324 "vld1.8 {q15}, [%7] \n"
329 // d0 = 00 40 01 41 02 42 03 43
330 // d1 = 10 50 11 51 12 52 13 53
331 // d2 = 20 60 21 61 22 62 23 63
332 // d3 = 30 70 31 71 32 72 33 73
334 "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
336 "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
338 "vld4.8 {d16, d17, d18, d19}, [%4]! \n"
339 "subs %2, %2, #12 \n"
341 // Shuffle the input data around to align the data
342 // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
343 // d0 = 00 10 01 11 02 12 03 13
344 // d1 = 40 50 41 51 42 52 43 53
347 "vtrn.u8 d16, d17 \n"
349 // d2 = 20 30 21 31 22 32 23 33
350 // d3 = 60 70 61 71 62 72 63 73
353 "vtrn.u8 d18, d19 \n"
355 // d0 = 00+10 01+11 02+12 03+13
356 // d2 = 40+50 41+51 42+52 43+53
357 "vpaddl.u8 q0, q0 \n"
358 "vpaddl.u8 q2, q2 \n"
359 "vpaddl.u8 q8, q8 \n"
361 // d3 = 60+70 61+71 62+72 63+73
362 "vpaddl.u8 d3, d3 \n"
363 "vpaddl.u8 d7, d7 \n"
364 "vpaddl.u8 d19, d19 \n"
366 // combine source lines
// Sum the 6,7 column pair across all three rows into d4.
369 "vadd.u16 d4, d3, d7 \n"
370 "vadd.u16 d4, d19 \n"
372 // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
373 // + s[6 + st * 1] + s[7 + st * 1]
374 // + s[6 + st * 2] + s[7 + st * 2]) / 6
// q13 holds kMult38_Div6: the doubling high multiply divides by 6.
375 "vqrdmulh.s16 q2, q2, q13 \n"
376 "vmovn.u16 d4, q2 \n"
378 // Shuffle 2,3 reg around so that 2 can be added to the
379 // 0,1 reg and 3 can be added to the 4,5 reg. This
380 // requires expanding from u8 to u16 as the 0,1 and 4,5
381 // registers are already expanded. Then do transposes
383 // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
386 "vmovl.u8 q9, d18 \n"
388 // combine source lines
392 // d4 = xx 20 xx 30 xx 22 xx 32
393 // d5 = xx 21 xx 31 xx 23 xx 33
396 // d4 = xx 20 xx 21 xx 22 xx 23
397 // d5 = xx 30 xx 31 xx 32 xx 33
403 // Need to divide, but can't downshift as the value
404 // isn't a power of 2. So multiply by 65536 / n
405 // and take the upper 16 bits.
// q15 holds kMult38_Div9: divides the sums by 9.
406 "vqrdmulh.s16 q0, q0, q15 \n"
408 // Align for table lookup, vtbl requires registers to
// d28/d29 are the two halves of q14 (kShuf38_2), used as byte indices.
412 "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
413 "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
// Write 8 bytes, then the remaining 4 as one 32-bit lane.
416 "vst1.8 {d3}, [%1]! \n"
418 "vst1.32 {d4[0]}, [%1]! \n"
420 : "+r"(src_ptr), // %0
422 "+r"(dst_width), // %2
423 "+r"(src_stride), // %3
425 : "r"(&kMult38_Div6), // %5
426 "r"(&kShuf38_2), // %6
427 "r"(&kMult38_Div9) // %7
428 : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc"
431 #endif //HAS_SCALEROWDOWN38_NEON
433 #ifdef HAS_SCALEROWDOWN38_NEON
// 8 -> 3 down scale with box filtering over two rows: reads 32 bytes from
// each of two rows and writes 12 bytes per iteration.
//   src_ptr    - first source row (%0).
//   src_stride - bound to %3 and used as the second-row address; the
//                instruction that converts it is not visible in this excerpt.
//   dst_ptr    - destination row (stored through %1).
//   dst_width  - output pixels remaining (%2); reduced by 12 per iteration.
// Constant registers: q13 = kMult38_Div6, q14 = kShuf38_2.
// Several shuffle/add steps between the visible instructions are missing
// from this excerpt; the comments describe only what is shown.
435 void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
436 ptrdiff_t src_stride,
437 uint8* dst_ptr, int dst_width) {
440 "vld1.16 {q13}, [%4] \n"
442 "vld1.8 {q14}, [%5] \n"
447 // d0 = 00 40 01 41 02 42 03 43
448 // d1 = 10 50 11 51 12 52 13 53
449 // d2 = 20 60 21 61 22 62 23 63
450 // d3 = 30 70 31 71 32 72 33 73
452 "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
454 "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
455 "subs %2, %2, #12 \n"
457 // Shuffle the input data around to align the data
458 // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
459 // d0 = 00 10 01 11 02 12 03 13
460 // d1 = 40 50 41 51 42 52 43 53
464 // d2 = 20 30 21 31 22 32 23 33
465 // d3 = 60 70 61 71 62 72 63 73
469 // d0 = 00+10 01+11 02+12 03+13
470 // d2 = 40+50 41+51 42+52 43+53
471 "vpaddl.u8 q0, q0 \n"
472 "vpaddl.u8 q2, q2 \n"
474 // d3 = 60+70 61+71 62+72 63+73
475 "vpaddl.u8 d3, d3 \n"
476 "vpaddl.u8 d7, d7 \n"
478 // combine source lines
// Sum the 6,7 column pair across both rows into d4.
480 "vadd.u16 d4, d3, d7 \n"
482 // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
// Four samples per sum, so a rounding shift by 2 is enough here.
483 "vqrshrn.u16 d4, q2, #2 \n"
485 // Shuffle 2,3 reg around so that 2 can be added to the
486 // 0,1 reg and 3 can be added to the 4,5 reg. This
487 // requires expanding from u8 to u16 as the 0,1 and 4,5
488 // registers are already expanded. Then do transposes
490 // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
494 // combine source lines
497 // d4 = xx 20 xx 30 xx 22 xx 32
498 // d5 = xx 21 xx 31 xx 23 xx 33
501 // d4 = xx 20 xx 21 xx 22 xx 23
502 // d5 = xx 30 xx 31 xx 32 xx 33
508 // Need to divide, but can't downshift as the value
509 // isn't a power of 2. So multiply by 65536 / n
510 // and take the upper 16 bits.
// q13 holds kMult38_Div6: divides the sums by 6.
511 "vqrdmulh.s16 q0, q0, q13 \n"
513 // Align for table lookup, vtbl requires registers to
// d28/d29 are the two halves of q14 (kShuf38_2), used as byte indices.
517 "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
518 "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
// Write 8 bytes, then the remaining 4 as one 32-bit lane.
521 "vst1.8 {d3}, [%1]! \n"
523 "vst1.32 {d4[0]}, [%1]! \n"
525 : "+r"(src_ptr), // %0
527 "+r"(dst_width), // %2
528 "+r"(src_stride) // %3
529 : "r"(&kMult38_Div6), // %4
530 "r"(&kShuf38_2) // %5
531 : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
534 #endif //HAS_SCALEROWDOWN38_NEON
// Blends two source rows into dst_ptr, 16 bytes per iteration. Several loop
// variants are visible below (general weighted blend, two vrhadd-based
// blends in each direction, an even blend and a plain copy). The code that
// selects between them and sets up the weights is not visible in this
// excerpt; it presumably dispatches on source_y_fraction.
//   dst_ptr           - destination row (%0).
//   src_ptr           - first source row (presumably %1; that operand line
//                       is not visible here).
//   src_stride        - bound to %2 and used as the second-row address; the
//                       instruction that converts it is not visible here.
//   dst_width         - bytes remaining (%3); reduced by 16 per iteration.
//   source_y_fraction - bound to %4.
538 void ScaleFilterRows_NEON(uint8* dst_ptr,
539 const uint8* src_ptr, ptrdiff_t src_stride,
540 int dst_width, int source_y_fraction) {
555 // General purpose row blend.
// result = (row[%1] * d4 + row[%2] * d5 + 128) >> 8 per byte; d4/d5 are the
// per-row weights (their setup is not visible in this excerpt).
558 "vld1.8 {q0}, [%1]! \n"
560 "vld1.8 {q1}, [%2]! \n"
561 "subs %3, %3, #16 \n"
562 "vmull.u8 q13, d0, d4 \n"
563 "vmull.u8 q14, d1, d4 \n"
564 "vmlal.u8 q13, d2, d5 \n"
565 "vmlal.u8 q14, d3, d5 \n"
566 "vrshrn.u16 d0, q13, #8 \n"
567 "vrshrn.u16 d1, q14, #8 \n"
569 "vst1.8 {q0}, [%0]! \n"
// Two rounding halving adds against q1: roughly 1/4 of the [%1] row plus
// 3/4 of the [%2] row.
576 "vld1.8 {q0}, [%1]! \n"
578 "vld1.8 {q1}, [%2]! \n"
579 "subs %3, %3, #16 \n"
580 "vrhadd.u8 q0, q1 \n"
581 "vrhadd.u8 q0, q1 \n"
583 "vst1.8 {q0}, [%0]! \n"
// A single rounding halving add: an even mix of the two rows.
590 "vld1.8 {q0}, [%1]! \n"
592 "vld1.8 {q1}, [%2]! \n"
593 "subs %3, %3, #16 \n"
594 "vrhadd.u8 q0, q1 \n"
596 "vst1.8 {q0}, [%0]! \n"
// Register roles are swapped on load (q1 <- [%1], q0 <- [%2]), so the two
// halving adds give roughly 3/4 of the [%1] row plus 1/4 of the [%2] row.
603 "vld1.8 {q1}, [%1]! \n"
605 "vld1.8 {q0}, [%2]! \n"
606 "subs %3, %3, #16 \n"
607 "vrhadd.u8 q0, q1 \n"
608 "vrhadd.u8 q0, q1 \n"
610 "vst1.8 {q0}, [%0]! \n"
614 // Blend 100 / 0 - Copy row unchanged.
617 "vld1.8 {q0}, [%1]! \n"
618 "subs %3, %3, #16 \n"
620 "vst1.8 {q0}, [%0]! \n"
// Store the top byte of q0 (lane 7 of d1) at the current dst pointer, i.e.
// one byte past the last 16-byte store, repeating the final output byte.
625 "vst1.8 {d1[7]}, [%0] \n"
626 : "+r"(dst_ptr), // %0
628 "+r"(src_stride), // %2
629 "+r"(dst_width), // %3
630 "+r"(source_y_fraction) // %4
632 : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
637 #ifdef HAS_SCALEARGBROWDOWN2_NEON
// Point-samples a row of 32-bit pixels down to half width: each iteration
// reads 16 pixels and writes the 8 odd-indexed ones.
//   src_ptr    - source row; advanced 64 bytes per iteration.
//   src_stride - not referenced by any line visible in this excerpt.
//   dst        - destination row (stored through %1); 32 bytes per iteration.
//   dst_width  - output pixels remaining; reduced by 8 per iteration.
638 void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
639 uint8* dst, int dst_width) {
643 // load even pixels into q0, odd into q1
// vld2.32 de-interleaves 32-bit elements, so whole pixels are split into
// even (q0, q2) and odd (q1, q3) sets.
645 "vld2.32 {q0, q1}, [%0]! \n"
647 "vld2.32 {q2, q3}, [%0]! \n"
648 "subs %2, %2, #8 \n" // 8 processed per loop
650 "vst1.8 {q1}, [%1]! \n" // store odd pixels
652 "vst1.8 {q3}, [%1]! \n"
654 : "+r"(src_ptr), // %0
656 "+r"(dst_width) // %2
658 : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
661 #endif //HAS_SCALEARGBROWDOWN2_NEON
663 #ifdef HAS_SCALEARGBROWDOWN2_NEON
// Box-filters two rows of 4-byte pixels down to half width: every output
// channel byte is the rounded average of the matching channel in a 2x2
// block of source pixels.
//   src_ptr    - first source row (%0); 64 bytes read per iteration.
//   src_stride - bound to %1 and used as the second-row address; the
//                instruction that converts it is not visible in this excerpt.
//   dst        - destination row (stored through %2); 32 bytes per iteration.
//   dst_width  - output pixels remaining (%3); reduced by 8 per iteration.
664 void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
665 uint8* dst, int dst_width) {
667 // change the stride to row 2 pointer
// vld4.8 splits the 4 channels into planes. The two loads fill the low and
// high halves of q0..q3, so each q register holds 16 bytes of one channel.
672 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
674 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
675 "subs %3, %3, #8 \n" // 8 processed per loop.
// Sum horizontally adjacent pixels of each channel into 16-bit lanes.
676 "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
677 "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
678 "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
679 "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
// Same layout for the second row, loaded into q8..q11.
681 "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB pixels.
683 "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB pixels.
// Add the second row's pairwise sums onto the first row's.
684 "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts.
685 "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts.
686 "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts.
687 "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts.
// Each lane is a sum of 4 samples; >> 2 with rounding gives the average.
688 "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
689 "vrshrn.u16 d1, q1, #2 \n"
690 "vrshrn.u16 d2, q2, #2 \n"
691 "vrshrn.u16 d3, q3, #2 \n"
// vst4.8 re-interleaves the four channel planes into 8 output pixels.
693 "vst4.8 {d0, d1, d2, d3}, [%2]! \n"
695 : "+r"(src_ptr), // %0
696 "+r"(src_stride), // %1
698 "+r"(dst_width) // %3
700 : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
703 #endif //HAS_SCALEARGBROWDOWN2_NEON
705 #ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
706 // Reads 4 pixels at a time.
707 // Alignment requirement: src_argb 4 byte aligned.
// Point-samples every src_stepx-th 4-byte pixel from the source row.
//   src_argb   - source row; advanced by src_stepx * 4 bytes per pixel read.
//   src_stride - not referenced by any line visible in this excerpt.
//   src_stepx  - step between sampled pixels, in pixels.
//   dst_argb   - destination row; advanced 16 bytes per iteration.
//   dst_width  - output pixels remaining; reduced by 4 per iteration.
708 void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
709 int src_stepx, uint8* dst_argb, int dst_width) {
// r12 = src_stepx * 4: the step converted from pixels to bytes.
711 "mov r12, %3, lsl #2 \n"
// Load one 32-bit pixel into each lane of q0, post-incrementing by r12.
715 "vld1.32 {d0[0]}, [%0], r12 \n"
717 "vld1.32 {d0[1]}, [%0], r12 \n"
719 "vld1.32 {d1[0]}, [%0], r12 \n"
721 "vld1.32 {d1[1]}, [%0], r12 \n"
722 "subs %2, %2, #4 \n" // 4 pixels per loop.
724 "vst1.8 {q0}, [%1]! \n"
726 : "+r"(src_argb), // %0
727 "+r"(dst_argb), // %1
728 "+r"(dst_width) // %2
729 : "r"(src_stepx) // %3
730 : "memory", "cc", "r12", "q0"
733 #endif //HAS_SCALEARGBROWDOWNEVEN_NEON
735 #ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
736 // Reads 4 pixels at a time.
737 // Alignment requirement: src_argb 4 byte aligned.
// Box-filtered variant of the stepped sampler: at each sample position a
// 2x2 block of 4-byte pixels (2 pixels from each of two rows) is averaged
// into one output pixel.
//   src_argb   - first source row (%0); advanced by src_stepx * 4 bytes per
//                block read.
//   src_stride - bound to %1 and used as the second-row address; the
//                instruction that converts it is not visible in this excerpt.
//   src_stepx  - step between sampled blocks, in pixels (%4); its parameter
//                declaration is not visible in this excerpt.
//   dst_argb   - destination row (%2); advanced 16 bytes per iteration.
//   dst_width  - output pixels remaining (%3); reduced by 4 per iteration.
738 void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
740 uint8* dst_argb, int dst_width) {
// r12 = src_stepx * 4: the step converted from pixels to bytes.
742 "mov r12, %4, lsl #2 \n"
// Even d registers take 8 bytes (2 pixels) from row 0, odd ones from row 1.
747 "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1
749 "vld1.8 {d1}, [%1], r12 \n"
751 "vld1.8 {d2}, [%0], r12 \n"
753 "vld1.8 {d3}, [%1], r12 \n"
755 "vld1.8 {d4}, [%0], r12 \n"
757 "vld1.8 {d5}, [%1], r12 \n"
759 "vld1.8 {d6}, [%0], r12 \n"
761 "vld1.8 {d7}, [%1], r12 \n"
// Widening vertical add: row 0 + row 1 for each block, as 16-bit lanes.
762 "vaddl.u8 q0, d0, d1 \n"
763 "vaddl.u8 q1, d2, d3 \n"
764 "vaddl.u8 q2, d4, d5 \n"
765 "vaddl.u8 q3, d6, d7 \n"
// Regroup so the left pixels of two blocks share one q register and the
// right pixels share the other; the adds below then sum left + right.
766 "vswp.8 d1, d2 \n" // ab_cd -> ac_bd
767 "vswp.8 d5, d6 \n" // ef_gh -> eg_fh
768 "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d)
769 "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h)
// Each lane is a sum of 4 samples; >> 2 with rounding gives the average.
770 "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels.
771 "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels.
772 "subs %3, %3, #4 \n" // 4 pixels per loop.
774 "vst1.8 {q0}, [%2]! \n"
776 : "+r"(src_argb), // %0
777 "+r"(src_stride), // %1
778 "+r"(dst_argb), // %2
779 "+r"(dst_width) // %3
780 : "r"(src_stepx) // %4
781 : "memory", "cc", "r12", "q0", "q1", "q2", "q3"
784 #endif // HAS_SCALEARGBROWDOWNEVEN_NEON
785 #endif // __aarch64__
789 } // namespace libyuv