2 * ARM NEON optimizations for libjpeg-turbo
4 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
6 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
8 * This software is provided 'as-is', without any express or implied
9 * warranty. In no event will the authors be held liable for any damages
10 * arising from the use of this software.
12 * Permission is granted to anyone to use this software for any purpose,
13 * including commercial applications, and to alter it and redistribute it
14 * freely, subject to the following restrictions:
16 * 1. The origin of this software must not be misrepresented; you must not
17 * claim that you wrote the original software. If you use this software
18 * in a product, an acknowledgment in the product documentation would be
19 * appreciated but is not required.
20 * 2. Altered source versions must be plainly marked as such, and must not be
21 * misrepresented as being the original software.
22 * 3. This notice may not be removed or altered from any source distribution.
25 #if defined(__linux__) && defined(__ELF__)
26 .section .note.GNU-stack,"",%progbits /* mark stack as non-executable */
/* NOTE(review): the matching #endif is not visible in this chunk — confirm upstream. */
/* When nonzero, byte-lane stores are used instead of word stores in the
 * reduced-size IDCT store paths (see the __ARMEL__ check further below). */
36 #define RESPECT_STRICT_ALIGNMENT 1
38 /*****************************************************************************/
40 /* Supplementary macro for setting function attributes */
41 .macro asm_function fname
51 .type \fname, %function
/* NOTE(review): rest of the asm_function body (.global/.func and .endm) is
 * elided from this view. */
57 /* Transpose a block of 4x4 coefficients in four 64-bit registers */
58 .macro transpose_4x4 x0, x1, x2, x3
/* NOTE(review): transpose body (vtrn instructions) and .endm are elided here. */
/* Center value of the unsigned 8-bit sample range, added back after IDCT. */
65 #define CENTERJSAMPLE 128
67 /*****************************************************************************/
70 * Perform dequantization and inverse DCT on one block of coefficients.
73 * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block,
74 * JSAMPARRAY output_buf, JDIMENSION output_col)
/* Fixed-point multiplier constants: FIX(x) = round(x * 2^13),
 * e.g. 0.541196100 * 8192 ~= 4433. */
77 #define FIX_0_298631336 (2446)
78 #define FIX_0_390180644 (3196)
79 #define FIX_0_541196100 (4433)
80 #define FIX_0_765366865 (6270)
81 #define FIX_0_899976223 (7373)
82 #define FIX_1_175875602 (9633)
83 #define FIX_1_501321110 (12299)
84 #define FIX_1_847759065 (15137)
85 #define FIX_1_961570560 (16069)
86 #define FIX_2_053119869 (16819)
87 #define FIX_2_562915447 (20995)
88 #define FIX_3_072711026 (25172)
/* Pre-combined sums/differences; each result still fits in a signed 16-bit
 * value, so they can be stored as .short and used with vmull/vmlal.s16. */
90 #define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
91 #define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
92 #define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
93 #define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
94 #define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
95 #define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
96 #define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
97 #define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865)
100 * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
101 * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
 * NOTE(review): reference-only C macro documenting the algorithm; several
 * continuation lines are elided from this chunk, so it is not usable as
 * shown. Kept verbatim — no comments may be inserted between the
 * backslash-continued lines without terminating the macro.
103 #define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) \
105 DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
106 INT32 q1, q2, q3, q4, q5, q6, q7; \
107 INT32 tmp11_plus_tmp2, tmp11_minus_tmp2; \
109 /* 1-D iDCT input data */ \
121 q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
122 MULTIPLY(q4, FIX_1_175875602); \
123 q7 = MULTIPLY(q5, FIX_1_175875602) + \
124 MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
125 q2 = MULTIPLY(row2, FIX_0_541196100) + \
126 MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
128 q3 = ((INT32) row0 - (INT32) row4) << 13; \
129 q6 += MULTIPLY(row5, -FIX_2_562915447) + \
130 MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
131 /* now we can use q1 (reloadable constants have been used up) */ \
133 q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
134 MULTIPLY(row1, -FIX_0_899976223); \
137 q7 += MULTIPLY(row7, -FIX_0_899976223) + \
138 MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
140 /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
141 tmp11_plus_tmp2 = q1; \
145 q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
146 MULTIPLY(row3, -FIX_2_562915447); \
148 q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
149 MULTIPLY(row6, FIX_0_541196100); \
152 /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
153 tmp11_minus_tmp2 = q1; \
155 q1 = ((INT32) row0 + (INT32) row4) << 13; \
159 /* pick up the results */ \
162 tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
165 tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
/* Symbolic names for the NEON scalar lanes (d0-d2) that hold the constants
 * loaded from jsimd_idct_islow_neon_consts below. The lane assignments here
 * MUST stay in sync with the order of the .short table entries. */
170 #define XFIX_0_899976223 d0[0]
171 #define XFIX_0_541196100 d0[1]
172 #define XFIX_2_562915447 d0[2]
173 #define XFIX_0_298631336_MINUS_0_899976223 d0[3]
174 #define XFIX_1_501321110_MINUS_0_899976223 d1[0]
175 #define XFIX_2_053119869_MINUS_2_562915447 d1[1]
176 #define XFIX_0_541196100_PLUS_0_765366865 d1[2]
177 #define XFIX_1_175875602 d1[3]
178 #define XFIX_1_175875602_MINUS_0_390180644 d2[0]
179 #define XFIX_0_541196100_MINUS_1_847759065 d2[1]
180 #define XFIX_3_072711026_MINUS_2_562915447 d2[2]
181 #define XFIX_1_175875602_MINUS_1_961570560 d2[3]
/* NOTE(review): an alignment directive (.balign) likely precedes this table
 * in the full file — elided from this chunk; the vld1 below uses a :128
 * alignment hint, so the table must be 16-byte aligned. */
184 jsimd_idct_islow_neon_consts:
185 .short FIX_0_899976223 /* d0[0] */
186 .short FIX_0_541196100 /* d0[1] */
187 .short FIX_2_562915447 /* d0[2] */
188 .short FIX_0_298631336_MINUS_0_899976223 /* d0[3] */
189 .short FIX_1_501321110_MINUS_0_899976223 /* d1[0] */
190 .short FIX_2_053119869_MINUS_2_562915447 /* d1[1] */
191 .short FIX_0_541196100_PLUS_0_765366865 /* d1[2] */
192 .short FIX_1_175875602 /* d1[3] */
193 /* reloadable constants */
194 .short FIX_1_175875602_MINUS_0_390180644 /* d2[0] */
195 .short FIX_0_541196100_MINUS_1_847759065 /* d2[1] */
196 .short FIX_3_072711026_MINUS_2_562915447 /* d2[2] */
197 .short FIX_1_175875602_MINUS_1_961570560 /* d2[3] */
199 asm_function jsimd_idct_islow_neon
/* Accurate (ISLOW) 8x8 inverse DCT with dequantization; prototype shown in
 * the header comment above. NOTE(review): the register-name macros used
 * below (COEF_BLOCK, DCT_TABLE, OUTPUT_BUF, OUTPUT_COL, TMP1-TMP4,
 * ROW0L/ROW0R ... ROW7L/ROW7R) are defined in lines elided from this chunk. */
227 /* Load and dequantize coefficients into NEON registers
228 * with the following allocation:
231 * 0 | d16 | d17 ( q8 )
232 * 1 | d18 | d19 ( q9 )
233 * 2 | d20 | d21 ( q10 )
234 * 3 | d22 | d23 ( q11 )
235 * 4 | d24 | d25 ( q12 )
236 * 5 | d26 | d27 ( q13 )
237 * 6 | d28 | d29 ( q14 )
238 * 7 | d30 | d31 ( q15 )
/* Dequantization: each coefficient row is multiplied element-wise by the
 * corresponding quantization-table row; loads and multiplies interleaved
 * to hide load latency. */
240 adr ip, jsimd_idct_islow_neon_consts
241 vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
242 vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
243 vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
245 vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
247 vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
248 vmul.s16 q10, q10, q2
249 vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
250 vmul.s16 q11, q11, q3
251 vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]
252 vmul.s16 q12, q12, q0
253 vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
254 vmul.s16 q14, q14, q2
255 vmul.s16 q13, q13, q1
256 vld1.16 {d0, d1, d2, d3}, [ip, :128] /* load constants */
258 vmul.s16 q15, q15, q3
259 vpush {d8-d15} /* save NEON registers */
260 /* 1-D IDCT, pass 1, left 4x8 half */
261 vadd.s16 d4, ROW7L, ROW3L
262 vadd.s16 d5, ROW5L, ROW1L
263 vmull.s16 q6, d4, XFIX_1_175875602_MINUS_1_961570560
264 vmlal.s16 q6, d5, XFIX_1_175875602
265 vmull.s16 q7, d4, XFIX_1_175875602
266 /* Check for the zero coefficients in the right 4x8 half */
/* NOTE(review): the interleaved ldrd loads below read coefficient pairs from
 * the right 4x8 half; the ORR/CMP accumulation that feeds the 'beq 3f'
 * zero-test is elided from this chunk. */
268 vmlal.s16 q7, d5, XFIX_1_175875602_MINUS_0_390180644
269 vsubl.s16 q3, ROW0L, ROW4L
270 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
271 vmull.s16 q2, ROW2L, XFIX_0_541196100
272 vmlal.s16 q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
275 vmlsl.s16 q6, ROW5L, XFIX_2_562915447
276 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
277 vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
280 vmlsl.s16 q4, ROW1L, XFIX_0_899976223
283 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
287 vmlsl.s16 q7, ROW7L, XFIX_0_899976223
289 vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
290 vrshrn.s32 ROW1L, q1, #11
291 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
293 vmlal.s16 q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
295 vmlsl.s16 q5, ROW3L, XFIX_2_562915447
298 vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
299 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
300 vmlal.s16 q6, ROW6L, XFIX_0_541196100
303 vrshrn.s32 ROW6L, q1, #11
306 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
308 vaddl.s16 q5, ROW0L, ROW4L
310 vrshrn.s32 ROW2L, q1, #11
312 vrshrn.s32 ROW5L, q3, #11
313 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
315 vmlal.s16 q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
321 ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
/* Pass 1 descales with round-and-narrow by 11 bits (vrshrn #11). */
327 vrshrn.s32 ROW7L, q2, #11
328 vrshrn.s32 ROW3L, q5, #11
329 vrshrn.s32 ROW0L, q6, #11
330 vrshrn.s32 ROW4L, q3, #11
332 beq 3f /* Go to do some special handling for the sparse right 4x8 half */
334 /* 1-D IDCT, pass 1, right 4x8 half */
335 vld1.s16 {d2}, [ip, :64] /* reload constants */
336 vadd.s16 d10, ROW7R, ROW3R
337 vadd.s16 d8, ROW5R, ROW1R
338 /* Transpose left 4x8 half */
340 vmull.s16 q6, d10, XFIX_1_175875602_MINUS_1_961570560
341 vmlal.s16 q6, d8, XFIX_1_175875602
343 vmull.s16 q7, d10, XFIX_1_175875602
344 vmlal.s16 q7, d8, XFIX_1_175875602_MINUS_0_390180644
346 vsubl.s16 q3, ROW0R, ROW4R
347 vmull.s16 q2, ROW2R, XFIX_0_541196100
348 vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
351 vmlsl.s16 q6, ROW5R, XFIX_2_562915447
352 vmlal.s16 q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
355 vmlsl.s16 q4, ROW1R, XFIX_0_899976223
361 vmlsl.s16 q7, ROW7R, XFIX_0_899976223
362 vmlal.s16 q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
363 vrshrn.s32 ROW1R, q1, #11
366 vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
367 vmlsl.s16 q5, ROW3R, XFIX_2_562915447
369 vmull.s16 q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
370 vmlal.s16 q6, ROW6R, XFIX_0_541196100
372 vrshrn.s32 ROW6R, q1, #11
375 vaddl.s16 q5, ROW0R, ROW4R
376 vrshrn.s32 ROW2R, q1, #11
377 vrshrn.s32 ROW5R, q3, #11
379 vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
386 vrshrn.s32 ROW7R, q2, #11
387 vrshrn.s32 ROW3R, q5, #11
388 vrshrn.s32 ROW0R, q6, #11
389 vrshrn.s32 ROW4R, q3, #11
390 /* Transpose right 4x8 half */
/* After the (elided) transposes, "ROW" names refer to columns; the
 * '<->' comments below record which physical register actually holds
 * which logical row in pass 2. */
400 1: /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
401 vld1.s16 {d2}, [ip, :64] /* reload constants */
402 vmull.s16 q6, ROW1R, XFIX_1_175875602 /* ROW5L <-> ROW1R */
403 vmlal.s16 q6, ROW1L, XFIX_1_175875602
404 vmlal.s16 q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
405 vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
406 vmull.s16 q7, ROW3R, XFIX_1_175875602 /* ROW7L <-> ROW3R */
407 vmlal.s16 q7, ROW3L, XFIX_1_175875602
408 vmlal.s16 q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
409 vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
410 vsubl.s16 q3, ROW0L, ROW0R /* ROW4L <-> ROW0R */
411 vmull.s16 q2, ROW2L, XFIX_0_541196100
412 vmlal.s16 q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L <-> ROW2R */
414 vmlsl.s16 q6, ROW1R, XFIX_2_562915447 /* ROW5L <-> ROW1R */
415 vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
417 vmlsl.s16 q4, ROW1L, XFIX_0_899976223
421 vmlsl.s16 q7, ROW3R, XFIX_0_899976223 /* ROW7L <-> ROW3R */
422 vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
423 vshrn.s32 ROW1L, q1, #16
425 vmlal.s16 q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L <-> ROW1R */
426 vmlsl.s16 q5, ROW3L, XFIX_2_562915447
428 vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
429 vmlal.s16 q6, ROW2R, XFIX_0_541196100 /* ROW6L <-> ROW2R */
431 vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */
434 vaddl.s16 q5, ROW0L, ROW0R /* ROW4L <-> ROW0R */
435 vshrn.s32 ROW2L, q1, #16
436 vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */
438 vmlal.s16 q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L <-> ROW3R */
445 vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */
446 vshrn.s32 ROW3L, q5, #16
447 vshrn.s32 ROW0L, q6, #16
448 vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */
449 /* 1-D IDCT, pass 2, right 4x8 half */
450 vld1.s16 {d2}, [ip, :64] /* reload constants */
451 vmull.s16 q6, ROW5R, XFIX_1_175875602
452 vmlal.s16 q6, ROW5L, XFIX_1_175875602 /* ROW5L <-> ROW1R */
453 vmlal.s16 q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
454 vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
455 vmull.s16 q7, ROW7R, XFIX_1_175875602
456 vmlal.s16 q7, ROW7L, XFIX_1_175875602 /* ROW7L <-> ROW3R */
457 vmlal.s16 q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
458 vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
459 vsubl.s16 q3, ROW4L, ROW4R /* ROW4L <-> ROW0R */
460 vmull.s16 q2, ROW6L, XFIX_0_541196100 /* ROW6L <-> ROW2R */
461 vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
463 vmlsl.s16 q6, ROW5R, XFIX_2_562915447
464 vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L <-> ROW3R */
466 vmlsl.s16 q4, ROW5L, XFIX_0_899976223 /* ROW5L <-> ROW1R */
470 vmlsl.s16 q7, ROW7R, XFIX_0_899976223
471 vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L <-> ROW1R */
472 vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */
474 vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
475 vmlsl.s16 q5, ROW7L, XFIX_2_562915447 /* ROW7L <-> ROW3R */
477 vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L <-> ROW2R */
478 vmlal.s16 q6, ROW6R, XFIX_0_541196100
480 vshrn.s32 ROW6R, q1, #16
483 vaddl.s16 q5, ROW4L, ROW4R /* ROW4L <-> ROW0R */
484 vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */
485 vshrn.s32 ROW5R, q3, #16
487 vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
494 vshrn.s32 ROW7R, q2, #16
495 vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */
496 vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */
497 vshrn.s32 ROW4R, q3, #16
499 2: /* Descale to 8-bit and range limit */
500 vqrshrn.s16 d16, q8, #2
501 vqrshrn.s16 d17, q9, #2
502 vqrshrn.s16 d18, q10, #2
503 vqrshrn.s16 d19, q11, #2
504 vpop {d8-d15} /* restore NEON registers */
505 vqrshrn.s16 d20, q12, #2
506 /* Transpose the final 8-bit samples and do signed->unsigned conversion */
508 vqrshrn.s16 d21, q13, #2
509 vqrshrn.s16 d22, q14, #2
510 vmov.u8 q0, #(CENTERJSAMPLE)
511 vqrshrn.s16 d23, q15, #2
517 /* Store results to the output buffer */
/* Output rows are fetched pairwise from OUTPUT_BUF (a JSAMPARRAY of row
 * pointers) and offset by OUTPUT_COL; the vst1 store instructions between
 * these address computations are elided from this chunk. */
518 ldmia OUTPUT_BUF!, {TMP1, TMP2}
519 add TMP1, TMP1, OUTPUT_COL
520 add TMP2, TMP2, OUTPUT_COL
524 ldmia OUTPUT_BUF!, {TMP1, TMP2}
525 add TMP1, TMP1, OUTPUT_COL
526 add TMP2, TMP2, OUTPUT_COL
530 ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
531 add TMP1, TMP1, OUTPUT_COL
532 add TMP2, TMP2, OUTPUT_COL
533 add TMP3, TMP3, OUTPUT_COL
534 add TMP4, TMP4, OUTPUT_COL
543 3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
545 /* Transpose left 4x8 half */
550 vshl.s16 ROW0R, ROW0R, #2 /* PASS1_BITS */
557 beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */
559 /* Only row 0 is non-zero for the right 4x8 half */
/* Broadcast the surviving row-0 coefficients across all rows so the normal
 * (non-sparse) second pass can be reused unchanged. */
560 vdup.s16 ROW1R, ROW0R[1]
561 vdup.s16 ROW2R, ROW0R[2]
562 vdup.s16 ROW3R, ROW0R[3]
563 vdup.s16 ROW4R, ROW0R[0]
564 vdup.s16 ROW5R, ROW0R[1]
565 vdup.s16 ROW6R, ROW0R[2]
566 vdup.s16 ROW7R, ROW0R[3]
567 vdup.s16 ROW0R, ROW0R[0]
568 b 1b /* Go to 'normal' second pass */
570 4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
571 vld1.s16 {d2}, [ip, :64] /* reload constants */
572 vmull.s16 q6, ROW1L, XFIX_1_175875602
573 vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
574 vmull.s16 q7, ROW3L, XFIX_1_175875602
575 vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
576 vmull.s16 q2, ROW2L, XFIX_0_541196100
577 vshll.s16 q3, ROW0L, #13
579 vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
580 vmlsl.s16 q4, ROW1L, XFIX_0_899976223
583 vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
586 vmlsl.s16 q5, ROW3L, XFIX_2_562915447
587 vshrn.s32 ROW1L, q1, #16
589 vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
591 vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */
594 vshll.s16 q5, ROW0L, #13
595 vshrn.s32 ROW2L, q1, #16
596 vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */
603 vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */
604 vshrn.s32 ROW3L, q5, #16
605 vshrn.s32 ROW0L, q6, #16
606 vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */
607 /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
608 vld1.s16 {d2}, [ip, :64] /* reload constants */
609 vmull.s16 q6, ROW5L, XFIX_1_175875602
610 vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
611 vmull.s16 q7, ROW7L, XFIX_1_175875602
612 vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
613 vmull.s16 q2, ROW6L, XFIX_0_541196100
614 vshll.s16 q3, ROW4L, #13
616 vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
617 vmlsl.s16 q4, ROW5L, XFIX_0_899976223
620 vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
623 vmlsl.s16 q5, ROW7L, XFIX_2_562915447
624 vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */
626 vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
628 vshrn.s32 ROW6R, q1, #16
631 vshll.s16 q5, ROW4L, #13
632 vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */
633 vshrn.s32 ROW5R, q3, #16
640 vshrn.s32 ROW7R, q2, #16
641 vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */
642 vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */
643 vshrn.s32 ROW4R, q3, #16
644 b 2b /* Go to epilogue */
673 /*****************************************************************************/
676 * jsimd_idct_ifast_neon
678 * This function contains a fast, not so accurate integer implementation of
679 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
680 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
681 * function from jidctfst.c
683 * Normally 1-D AAN DCT needs 5 multiplications and 29 additions.
684 * But in ARM NEON case some extra additions are required because VQDMULH
685 * instruction can't handle the constants larger than 1. So the expressions
686 * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
687 * which introduces an extra addition. Overall, there are 6 extra additions
688 * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions.
/* Lane names for the four vqdmulh multiplier constants held in d0. */
691 #define XFIX_1_082392200 d0[0]
692 #define XFIX_1_414213562 d0[1]
693 #define XFIX_1_847759065 d0[2]
694 #define XFIX_2_613125930 d0[3]
/* Each entry stores only the fractional part (x - 1.0, or x - 2.0 for the
 * last one): N*128 approximates x*2^15, and 256*128 / 512*128 subtract the
 * integer part that vqdmulh cannot represent (see comment above). */
697 jsimd_idct_ifast_neon_consts:
698 .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
699 .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
700 .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
701 .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
703 asm_function jsimd_idct_ifast_neon
/* Fast (IFAST/AAN) 8x8 inverse DCT with dequantization. NOTE(review): the
 * register-name macros (COEF_BLOCK, DCT_TABLE, OUTPUT_BUF, OUTPUT_COL,
 * TMP1-TMP4) are defined in lines elided from this chunk. */
714 /* Load and dequantize coefficients into NEON registers
715 * with the following allocation:
718 * 0 | d16 | d17 ( q8 )
719 * 1 | d18 | d19 ( q9 )
720 * 2 | d20 | d21 ( q10 )
721 * 3 | d22 | d23 ( q11 )
722 * 4 | d24 | d25 ( q12 )
723 * 5 | d26 | d27 ( q13 )
724 * 6 | d28 | d29 ( q14 )
725 * 7 | d30 | d31 ( q15 )
727 adr ip, jsimd_idct_ifast_neon_consts
728 vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
729 vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
730 vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
732 vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
734 vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
735 vmul.s16 q10, q10, q2
736 vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
737 vmul.s16 q11, q11, q3
738 vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]
739 vmul.s16 q12, q12, q0
740 vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
741 vmul.s16 q14, q14, q2
742 vmul.s16 q13, q13, q1
743 vld1.16 {d0}, [ip, :64] /* load constants */
744 vmul.s16 q15, q15, q3
745 vpush {d8-d13} /* save NEON registers */
746 /* 1-D IDCT, pass 1 */
/* NOTE(review): several interleaved arithmetic lines of both passes (and the
 * transpose between them) are elided from this chunk; the instruction order
 * here is pipeline-scheduled — do not reorder. */
747 vsub.s16 q2, q10, q14
748 vadd.s16 q14, q10, q14
749 vsub.s16 q1, q11, q13
750 vadd.s16 q13, q11, q13
752 vadd.s16 q15, q9, q15
753 vqdmulh.s16 q4, q2, XFIX_1_414213562
754 vqdmulh.s16 q6, q1, XFIX_2_613125930
758 vqdmulh.s16 q4, q1, XFIX_1_847759065
759 vsub.s16 q2, q15, q13
761 vqdmulh.s16 q6, q2, XFIX_1_414213562
763 vqdmulh.s16 q4, q5, XFIX_1_082392200
764 vsub.s16 q10, q10, q14
767 vadd.s16 q12, q8, q12
770 vsub.s16 q10, q6, q10
771 vadd.s16 q6, q15, q13
772 vadd.s16 q8, q12, q14
774 vsub.s16 q12, q12, q14
783 vsub.s16 q13, q10, q2
784 vadd.s16 q10, q10, q2
787 vsub.s16 q11, q12, q1
789 vadd.s16 q12, q12, q1
798 /* 1-D IDCT, pass 2 */
799 vsub.s16 q2, q10, q14
801 vadd.s16 q14, q10, q14
803 vsub.s16 q1, q11, q13
804 vadd.s16 q13, q11, q13
806 vadd.s16 q15, q9, q15
807 vqdmulh.s16 q4, q2, XFIX_1_414213562
808 vqdmulh.s16 q6, q1, XFIX_2_613125930
812 vqdmulh.s16 q4, q1, XFIX_1_847759065
813 vsub.s16 q2, q15, q13
815 vqdmulh.s16 q6, q2, XFIX_1_414213562
817 vqdmulh.s16 q4, q5, XFIX_1_082392200
818 vsub.s16 q10, q10, q14
821 vadd.s16 q12, q8, q12
824 vsub.s16 q10, q6, q10
825 vadd.s16 q6, q15, q13
826 vadd.s16 q8, q12, q14
828 vsub.s16 q12, q12, q14
837 vsub.s16 q13, q10, q2
838 vpop {d8-d13} /* restore NEON registers */
839 vadd.s16 q10, q10, q2
840 vsub.s16 q11, q12, q1
841 vadd.s16 q12, q12, q1
842 /* Descale to 8-bit and range limit */
/* Saturating round-and-narrow by PASS1_BITS+3 = 5 bits. */
844 vqshrn.s16 d16, q8, #5
845 vqshrn.s16 d17, q9, #5
846 vqshrn.s16 d18, q10, #5
847 vqshrn.s16 d19, q11, #5
848 vqshrn.s16 d20, q12, #5
849 vqshrn.s16 d21, q13, #5
850 vqshrn.s16 d22, q14, #5
851 vqshrn.s16 d23, q15, #5
856 /* Transpose the final 8-bit samples */
863 /* Store results to the output buffer */
/* NOTE(review): the vst1 stores between these address computations are
 * elided from this chunk. */
864 ldmia OUTPUT_BUF!, {TMP1, TMP2}
865 add TMP1, TMP1, OUTPUT_COL
866 add TMP2, TMP2, OUTPUT_COL
869 ldmia OUTPUT_BUF!, {TMP1, TMP2}
870 add TMP1, TMP1, OUTPUT_COL
871 add TMP2, TMP2, OUTPUT_COL
875 ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
876 add TMP1, TMP1, OUTPUT_COL
877 add TMP2, TMP2, OUTPUT_COL
878 add TMP3, TMP3, OUTPUT_COL
879 add TMP4, TMP4, OUTPUT_COL
897 /*****************************************************************************/
900 * jsimd_idct_4x4_neon
902 * This function contains inverse-DCT code for getting reduced-size
903 * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations
904 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
905 * function from jpeg-6b (jidctred.c).
907 * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which
908 * requires much less arithmetic operations and hence should be faster.
909 * The primary purpose of this particular NEON optimized function is
910 * bit exact compatibility with jpeg-6b.
912 * TODO: a bit better instructions scheduling can be achieved by expanding
913 * idct_helper/transpose_4x4 macros and reordering instructions,
914 * but readability will suffer somewhat.
917 #define CONST_BITS 13
/* Fixed-point constants for the reduced-size IDCTs: FIX(x) = round(x * 2^13). */
919 #define FIX_0_211164243 (1730) /* FIX(0.211164243) */
920 #define FIX_0_509795579 (4176) /* FIX(0.509795579) */
921 #define FIX_0_601344887 (4926) /* FIX(0.601344887) */
922 #define FIX_0_720959822 (5906) /* FIX(0.720959822) */
923 #define FIX_0_765366865 (6270) /* FIX(0.765366865) */
924 #define FIX_0_850430095 (6967) /* FIX(0.850430095) */
925 #define FIX_0_899976223 (7373) /* FIX(0.899976223) */
926 #define FIX_1_061594337 (8697) /* FIX(1.061594337) */
927 #define FIX_1_272758580 (10426) /* FIX(1.272758580) */
928 #define FIX_1_451774981 (11893) /* FIX(1.451774981) */
929 #define FIX_1_847759065 (15137) /* FIX(1.847759065) */
930 #define FIX_2_172734803 (17799) /* FIX(2.172734803) */
931 #define FIX_2_562915447 (20995) /* FIX(2.562915447) */
932 #define FIX_3_624509785 (29692) /* FIX(3.624509785) */
/* Constant table loaded into d0-d2; entry order must match the d-register
 * lane comments and the lane indices used inside idct_helper below. */
935 jsimd_idct_4x4_neon_consts:
936 .short FIX_1_847759065 /* d0[0] */
937 .short -FIX_0_765366865 /* d0[1] */
938 .short -FIX_0_211164243 /* d0[2] */
939 .short FIX_1_451774981 /* d0[3] */
940 .short -FIX_2_172734803 /* d1[0] */
941 .short FIX_1_061594337 /* d1[1] */
942 .short -FIX_0_509795579 /* d1[2] */
943 .short -FIX_0_601344887 /* d1[3] */
944 .short FIX_0_899976223 /* d2[0] */
945 .short FIX_2_562915447 /* d2[1] */
946 .short 1 << (CONST_BITS+1) /* d2[2] */
/* One 1-D 4-point IDCT step on four coefficient vectors (x4..x16 are input
 * d-registers holding rows 0-7 of a column group except row 4; y26..y29
 * receive the four output rows, descaled by \shift).
 * NOTE(review): the .if \shift > 16 / .else / .endif scaffolding that selects
 * between the vrshr+narrow and vrshrn forms below is elided from this chunk,
 * as is the closing .endm. */
949 .macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
950 vmull.s16 q14, \x4, d2[2]
951 vmlal.s16 q14, \x8, d0[0]
952 vmlal.s16 q14, \x14, d0[1]
954 vmull.s16 q13, \x16, d1[2]
955 vmlal.s16 q13, \x12, d1[3]
956 vmlal.s16 q13, \x10, d2[0]
957 vmlal.s16 q13, \x6, d2[1]
959 vmull.s16 q15, \x4, d2[2]
960 vmlsl.s16 q15, \x8, d0[0]
961 vmlsl.s16 q15, \x14, d0[1]
963 vmull.s16 q12, \x16, d0[2]
964 vmlal.s16 q12, \x12, d0[3]
965 vmlal.s16 q12, \x10, d1[0]
966 vmlal.s16 q12, \x6, d1[1]
968 vadd.s32 q10, q14, q13
969 vsub.s32 q14, q14, q13
972 vrshr.s32 q10, q10, #\shift
973 vrshr.s32 q14, q14, #\shift
977 vrshrn.s32 \y26, q10, #\shift
978 vrshrn.s32 \y29, q14, #\shift
981 vadd.s32 q10, q15, q12
982 vsub.s32 q15, q15, q12
985 vrshr.s32 q10, q10, #\shift
986 vrshr.s32 q15, q15, #\shift
990 vrshrn.s32 \y27, q10, #\shift
991 vrshrn.s32 \y28, q15, #\shift
996 asm_function jsimd_idct_4x4_neon
/* Reduced-size 4x4 IDCT; bit-exact with jpeg-6b jpeg_idct_4x4 (see header
 * comment above). NOTE(review): the register-name macros (COEF_BLOCK,
 * DCT_TABLE, OUTPUT_BUF, OUTPUT_COL, TMP1-TMP4) are defined in elided lines. */
1009 /* Load constants (d3 is just used for padding) */
1010 adr TMP4, jsimd_idct_4x4_neon_consts
1011 vld1.16 {d0, d1, d2, d3}, [TMP4, :128]
1013 /* Load all COEF_BLOCK into NEON registers with the following allocation:
1015 * ---------+--------
/* Row 4 is skipped (the 'add #16' below) — it does not contribute to the
 * 4x4 output. Coefficients are dequantized by the matching table rows. */
1025 vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
1026 vld1.16 {d8, d9, d10, d11}, [COEF_BLOCK, :128]!
1027 add COEF_BLOCK, COEF_BLOCK, #16
1028 vld1.16 {d12, d13, d14, d15}, [COEF_BLOCK, :128]!
1029 vld1.16 {d16, d17}, [COEF_BLOCK, :128]!
1031 vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]!
1033 vld1.16 {d22, d23, d24, d25}, [DCT_TABLE, :128]!
1034 vmul.s16 q3, q3, q10
1035 vmul.s16 q4, q4, q11
1036 add DCT_TABLE, DCT_TABLE, #16
1037 vld1.16 {d26, d27, d28, d29}, [DCT_TABLE, :128]!
1038 vmul.s16 q5, q5, q12
1039 vmul.s16 q6, q6, q13
1040 vld1.16 {d30, d31}, [DCT_TABLE, :128]!
1041 vmul.s16 q7, q7, q14
1042 vmul.s16 q8, q8, q15
/* Pass 1: column IDCT, descale by CONST_BITS-1 = 12; pass 2: row IDCT,
 * descale by 19. */
1045 idct_helper d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10
1046 transpose_4x4 d4, d6, d8, d10
1047 idct_helper d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11
1048 transpose_4x4 d5, d7, d9, d11
1051 idct_helper d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29
1052 transpose_4x4 d26, d27, d28, d29
/* Add CENTERJSAMPLE (q15 — setup elided) and saturate to unsigned 8-bit. */
1056 vadd.s16 q13, q13, q15
1057 vadd.s16 q14, q14, q15
1058 vqmovun.s16 d26, q13
1059 vqmovun.s16 d27, q14
1061 /* Store results to the output buffer */
1062 ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
1063 add TMP1, TMP1, OUTPUT_COL
1064 add TMP2, TMP2, OUTPUT_COL
1065 add TMP3, TMP3, OUTPUT_COL
1066 add TMP4, TMP4, OUTPUT_COL
1068 #if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
1069 /* We can use much less instructions on little endian systems if the
1070 * OS kernel is not configured to trap unaligned memory accesses
1072 vst1.32 {d26[0]}, [TMP1]!
1073 vst1.32 {d27[0]}, [TMP3]!
1074 vst1.32 {d26[1]}, [TMP2]!
1075 vst1.32 {d27[1]}, [TMP4]!
/* Strict-alignment fallback: store the same 4 bytes per row one lane at a
 * time. NOTE(review): the #else/#endif lines are elided from this chunk. */
1077 vst1.8 {d26[0]}, [TMP1]!
1078 vst1.8 {d27[0]}, [TMP3]!
1079 vst1.8 {d26[1]}, [TMP1]!
1080 vst1.8 {d27[1]}, [TMP3]!
1081 vst1.8 {d26[2]}, [TMP1]!
1082 vst1.8 {d27[2]}, [TMP3]!
1083 vst1.8 {d26[3]}, [TMP1]!
1084 vst1.8 {d27[3]}, [TMP3]!
1086 vst1.8 {d26[4]}, [TMP2]!
1087 vst1.8 {d27[4]}, [TMP4]!
1088 vst1.8 {d26[5]}, [TMP2]!
1089 vst1.8 {d27[5]}, [TMP4]!
1090 vst1.8 {d26[6]}, [TMP2]!
1091 vst1.8 {d27[6]}, [TMP4]!
1092 vst1.8 {d26[7]}, [TMP2]!
1093 vst1.8 {d27[7]}, [TMP4]!
1111 /*****************************************************************************/
1114 * jsimd_idct_2x2_neon
1116 * This function contains inverse-DCT code for getting reduced-size
1117 * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations
1118 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
1119 * function from jpeg-6b (jidctred.c).
1121 * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which
1122 * requires much less arithmetic operations and hence should be faster.
1123 * The primary purpose of this particular NEON optimized function is
1124 * bit exact compatibility with jpeg-6b.
/* Four multiplier constants loaded into d0; the FIX_* values are the
 * 2^13-scaled constants defined above for the 4x4 IDCT. */
1128 jsimd_idct_2x2_neon_consts:
1129 .short -FIX_0_720959822 /* d0[0] */
1130 .short FIX_0_850430095 /* d0[1] */
1131 .short -FIX_1_272758580 /* d0[2] */
1132 .short FIX_3_624509785 /* d0[3] */
/* One 1-D 2-point IDCT step: \x4 is the DC term (shifted up by 15), the odd
 * terms \x6..\x16 are folded in via four multiply-accumulates; \y26/\y27
 * receive the two outputs, descaled by \shift.
 * NOTE(review): the .if \shift > 16 / .else / .endif selection between the
 * vrshr+narrow and vrshrn forms, and the closing .endm, are elided here. */
1134 .macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
1135 vshll.s16 q14, \x4, #15
1136 vmull.s16 q13, \x6, d0[3]
1137 vmlal.s16 q13, \x10, d0[2]
1138 vmlal.s16 q13, \x12, d0[1]
1139 vmlal.s16 q13, \x16, d0[0]
1141 vadd.s32 q10, q14, q13
1142 vsub.s32 q14, q14, q13
1145 vrshr.s32 q10, q10, #\shift
1146 vrshr.s32 q14, q14, #\shift
1150 vrshrn.s32 \y26, q10, #\shift
1151 vrshrn.s32 \y27, q14, #\shift
1156 asm_function jsimd_idct_2x2_neon
/* Reduced-size 2x2 IDCT; bit-exact with jpeg-6b jpeg_idct_2x2 (see header
 * comment above). NOTE(review): register-name macros (COEF_BLOCK, DCT_TABLE,
 * OUTPUT_BUF, OUTPUT_COL, TMP1/TMP2) are defined in elided lines. */
1167 /* Load constants */
1168 adr TMP2, jsimd_idct_2x2_neon_consts
1169 vld1.16 {d0}, [TMP2, :64]
1171 /* Load all COEF_BLOCK into NEON registers with the following allocation:
1173 * ---------+--------
/* Only rows 0, 1, 3, 5 and 7 contribute to the 2x2 output; the even rows
 * 2, 4 and 6 are skipped with the 'add #16' adjustments. */
1183 vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
1184 add COEF_BLOCK, COEF_BLOCK, #16
1185 vld1.16 {d10, d11}, [COEF_BLOCK, :128]!
1186 add COEF_BLOCK, COEF_BLOCK, #16
1187 vld1.16 {d12, d13}, [COEF_BLOCK, :128]!
1188 add COEF_BLOCK, COEF_BLOCK, #16
1189 vld1.16 {d16, d17}, [COEF_BLOCK, :128]!
1191 vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]!
1193 vmul.s16 q3, q3, q10
1194 add DCT_TABLE, DCT_TABLE, #16
1195 vld1.16 {d24, d25}, [DCT_TABLE, :128]!
1196 vmul.s16 q5, q5, q12
1197 add DCT_TABLE, DCT_TABLE, #16
1198 vld1.16 {d26, d27}, [DCT_TABLE, :128]!
1199 vmul.s16 q6, q6, q13
1200 add DCT_TABLE, DCT_TABLE, #16
1201 vld1.16 {d30, d31}, [DCT_TABLE, :128]!
1202 vmul.s16 q8, q8, q15
/* Pass 1: column IDCT on both 4-column halves, descale by CONST_BITS = 13. */
1206 idct_helper d4, d6, d10, d12, d16, 13, d4, d6
1207 transpose_4x4 d4, d6, d8, d10
1208 idct_helper d5, d7, d11, d13, d17, 13, d5, d7
1209 transpose_4x4 d5, d7, d9, d11
/* Inlined column IDCT for the remaining half (same math as idct_helper). */
1211 vmull.s16 q13, d6, d0[3]
1212 vmlal.s16 q13, d10, d0[2]
1213 vmlal.s16 q13, d12, d0[1]
1214 vmlal.s16 q13, d16, d0[0]
1215 vmull.s16 q12, d7, d0[3]
1216 vmlal.s16 q12, d11, d0[2]
1217 vmlal.s16 q12, d13, d0[1]
1218 vmlal.s16 q12, d17, d0[0]
1219 vshll.s16 q14, d4, #15
1220 vshll.s16 q15, d5, #15
1221 vadd.s32 q10, q14, q13
1222 vsub.s32 q14, q14, q13
1223 vrshrn.s32 d4, q10, #13
1224 vrshrn.s32 d6, q14, #13
1225 vadd.s32 q10, q15, q12
1226 vsub.s32 q14, q15, q12
1227 vrshrn.s32 d5, q10, #13
1228 vrshrn.s32 d7, q14, #13
/* Pass 2: row IDCT, descale by 20. */
1234 idct_helper d4, d6, d10, d7, d11, 20, d26, d27
/* Add CENTERJSAMPLE (q15 — setup elided) and saturate to unsigned 8-bit. */
1238 vadd.s16 q13, q13, q15
1239 vqmovun.s16 d26, q13
1240 vqmovun.s16 d27, q13
1242 /* Store results to the output buffer */
1243 ldmia OUTPUT_BUF, {TMP1, TMP2}
1244 add TMP1, TMP1, OUTPUT_COL
1245 add TMP2, TMP2, OUTPUT_COL
/* 2 bytes per output row, stored one lane at a time. */
1247 vst1.8 {d26[0]}, [TMP1]!
1248 vst1.8 {d27[4]}, [TMP1]!
1249 vst1.8 {d26[1]}, [TMP2]!
1250 vst1.8 {d27[5]}, [TMP2]!
1265 /*****************************************************************************/
1268 * jsimd_ycc_extrgb_convert_neon
1269 * jsimd_ycc_extbgr_convert_neon
1270 * jsimd_ycc_extrgbx_convert_neon
1271 * jsimd_ycc_extbgrx_convert_neon
1272 * jsimd_ycc_extxbgr_convert_neon
1273 * jsimd_ycc_extxrgb_convert_neon
1275 * Colorspace conversion YCbCr -> RGB
/* NOTE(review): the lines below belong to a 'do_load' macro whose .macro
 * header and the .if/.elseif size-selection scaffolding are elided from this
 * chunk. Fast path: 8 pixels loaded with one 64-bit load per plane. */
1281 vld1.8 {d4}, [U, :64]!
1282 vld1.8 {d5}, [V, :64]!
1283 vld1.8 {d0}, [Y, :64]!
/* Tail paths: remaining 4 / 2 / 1 pixels loaded one lane at a time. */
1288 vld1.8 {d4[0]}, [U]!
1289 vld1.8 {d4[1]}, [U]!
1290 vld1.8 {d4[2]}, [U]!
1291 vld1.8 {d4[3]}, [U]!
1292 vld1.8 {d5[0]}, [V]!
1293 vld1.8 {d5[1]}, [V]!
1294 vld1.8 {d5[2]}, [V]!
1295 vld1.8 {d5[3]}, [V]!
1296 vld1.8 {d0[0]}, [Y]!
1297 vld1.8 {d0[1]}, [Y]!
1298 vld1.8 {d0[2]}, [Y]!
1299 vld1.8 {d0[3]}, [Y]!
1301 vld1.8 {d4[4]}, [U]!
1302 vld1.8 {d4[5]}, [U]!
1303 vld1.8 {d5[4]}, [V]!
1304 vld1.8 {d5[5]}, [V]!
1305 vld1.8 {d0[4]}, [Y]!
1306 vld1.8 {d0[5]}, [Y]!
1308 vld1.8 {d4[6]}, [U]!
1309 vld1.8 {d5[6]}, [V]!
1310 vld1.8 {d0[6]}, [Y]!
1312 .error unsupported macroblock size
/* Store \size converted pixels to [RGB]. \bpp selects interleaved 3-byte
 * (vst3) or 4-byte (vst4) pixel stores.
 * NOTE(review): the .if \bpp == 24 / .elseif \size == N scaffolding and the
 * closing .endm are elided from this chunk; the .error directives are the
 * fall-through branches of those elided conditionals. */
1316 .macro do_store bpp, size
1319 vst3.8 {d10, d11, d12}, [RGB]!
1321 vst3.8 {d10[0], d11[0], d12[0]}, [RGB]!
1322 vst3.8 {d10[1], d11[1], d12[1]}, [RGB]!
1323 vst3.8 {d10[2], d11[2], d12[2]}, [RGB]!
1324 vst3.8 {d10[3], d11[3], d12[3]}, [RGB]!
1326 vst3.8 {d10[4], d11[4], d12[4]}, [RGB]!
1327 vst3.8 {d10[5], d11[5], d12[5]}, [RGB]!
1329 vst3.8 {d10[6], d11[6], d12[6]}, [RGB]!
1331 .error unsupported macroblock size
1335 vst4.8 {d10, d11, d12, d13}, [RGB]!
1337 vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
1338 vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
1339 vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
1340 vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
1342 vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
1343 vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
1345 vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
1347 .error unsupported macroblock size
1350 .error unsupported bpp
/* Generates one jsimd_ycc_<colorid>_convert_neon function for the pixel
 * layout described by \bpp and the \r_offs/\g_offs/\b_offs byte offsets. */
1354 .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs
1357 * 2 stage pipelined YCbCr->RGB conversion
/* Stage 1: bias Cb/Cr by -128 (q1 is preloaded with -128 in every lane,
 * see the -128 rows of the constants table) and start the widening
 * multiplies by the fixed-point coefficients held in
 * d1 = { 22971, -11277, -23401, 29033 }. */
1360 .macro do_yuv_to_rgb_stage1
1361 vaddw.u8 q3, q1, d4 /* q3 = u - 128 */
1362 vaddw.u8 q4, q1, d5 /* q4 = v - 128 */
1363 vmull.s16 q10, d6, d1[1] /* multiply by -11277 */
1364 vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */
1365 vmull.s16 q11, d7, d1[1] /* multiply by -11277 */
1366 vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */
1367 vmull.s16 q12, d8, d1[0] /* multiply by 22971 */
1368 vmull.s16 q13, d9, d1[0] /* multiply by 22971 */
1369 vmull.s16 q14, d6, d1[3] /* multiply by 29033 */
1370 vmull.s16 q15, d7, d1[3] /* multiply by 29033 */
/* Stage 2: narrow the 32-bit chroma accumulators with rounding, add the
 * widened Y samples (d0), then saturate-narrow each channel into its
 * destination byte register d1\offs (one of d10..d13, selected by the
 * per-layout R/G/B offsets). */
1373 .macro do_yuv_to_rgb_stage2
1374 vrshrn.s32 d20, q10, #15
1375 vrshrn.s32 d21, q11, #15
1376 vrshrn.s32 d24, q12, #14
1377 vrshrn.s32 d25, q13, #14
1378 vrshrn.s32 d28, q14, #14
1379 vrshrn.s32 d29, q15, #14
1380 vaddw.u8 q10, q10, d0 /* += Y */
1381 vaddw.u8 q12, q12, d0 /* += Y */
1382 vaddw.u8 q14, q14, d0 /* += Y */
1383 vqmovun.s16 d1\g_offs, q10 /* green, clamped to [0,255] */
1384 vqmovun.s16 d1\r_offs, q12 /* red,   clamped to [0,255] */
1385 vqmovun.s16 d1\b_offs, q14 /* blue,  clamped to [0,255] */
/* Software-pipelined loop body: stage 2 of the current pixel group is
 * interleaved with the loads for the next group and its stage 1, hiding
 * memory latency behind the multiply-accumulate chain.
 * NOTE(review): a few lines (including the store of the just-converted
 * group, original lines 1403-1405) fall outside this view. */
1388 .macro do_yuv_to_rgb_stage2_store_load_stage1
1389 vld1.8 {d4}, [U, :64]!              /* next group: 8 Cb samples */
1390 vrshrn.s32 d20, q10, #15
1391 vrshrn.s32 d21, q11, #15
1392 vrshrn.s32 d24, q12, #14
1393 vrshrn.s32 d25, q13, #14
1394 vrshrn.s32 d28, q14, #14
1395 vld1.8 {d5}, [V, :64]!              /* next group: 8 Cr samples */
1396 vrshrn.s32 d29, q15, #14
1397 vaddw.u8 q10, q10, d0 /* += Y */
1398 vaddw.u8 q12, q12, d0 /* += Y */
1399 vaddw.u8 q14, q14, d0 /* += Y */
1400 vqmovun.s16 d1\g_offs, q10
1401 vld1.8 {d0}, [Y, :64]!              /* next group: 8 Y samples */
1402 vqmovun.s16 d1\r_offs, q12
1406 vqmovun.s16 d1\b_offs, q14
/* Stage 1 for the next group (same sequence as do_yuv_to_rgb_stage1) */
1407 vaddw.u8 q3, q1, d4 /* q3 = u - 128 */
1408 vaddw.u8 q4, q1, d5 /* q4 = v - 128 */
1410 vmull.s16 q10, d6, d1[1] /* multiply by -11277 */
1411 vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */
1412 vmull.s16 q11, d7, d1[1] /* multiply by -11277 */
1413 vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */
1414 vmull.s16 q12, d8, d1[0] /* multiply by 22971 */
1415 vmull.s16 q13, d9, d1[0] /* multiply by 22971 */
1416 vmull.s16 q14, d6, d1[3] /* multiply by 29033 */
1417 vmull.s16 q15, d7, d1[3] /* multiply by 29033 */
/* Non-pipelined fallback: convert one pixel group in two back-to-back
 * stages (used where pipelining with the next group is not possible). */
1420 .macro do_yuv_to_rgb
1421 do_yuv_to_rgb_stage1
1422 do_yuv_to_rgb_stage2
1425 /* Apple gas crashes on adrl, work around that by using adr.
1426 * But this requires a copy of these constants for each function.
/* Per-function constants: d1 = fixed-point YCbCr->RGB coefficients,
 * d2/d3 (= q1) = -128 chroma bias in every lane.  The first (padding)
 * row of the table, loaded into d0, lies outside this view. */
1430 jsimd_ycc_\colorid\()_neon_consts:
1432 .short 22971, -11277, -23401, 29033
1433 .short -128, -128, -128, -128
1434 .short -128, -128, -128, -128
/* void jsimd_ycc_<colorid>_convert_neon(JDIMENSION output_width,
 *        JSAMPIMAGE input_buf, JDIMENSION input_row,
 *        JSAMPARRAY output_buf, int num_rows)
 * NOTE(review): several register aliases and loop labels/branches are
 * outside this view. */
1436 asm_function jsimd_ycc_\colorid\()_convert_neon
1437 OUTPUT_WIDTH .req r0
1445 INPUT_BUF2 .req INPUT_BUF
1453 /* Load constants to d1, d2, d3 (d0 is just used for padding) */
1454 adr ip, jsimd_ycc_\colorid\()_neon_consts
1455 vld1.16 {d0, d1, d2, d3}, [ip, :128]
1457 /* Save ARM registers and handle input arguments */
1458 push {r4, r5, r6, r7, r8, r9, r10, lr}
1459 ldr NUM_ROWS, [sp, #(4 * 8)]        /* 5th arg: above the 8 pushed regs */
1460 ldr INPUT_BUF0, [INPUT_BUF]         /* Y plane row-pointer array */
1461 ldr INPUT_BUF1, [INPUT_BUF, #4]     /* Cb plane row-pointer array */
1462 ldr INPUT_BUF2, [INPUT_BUF, #8]     /* Cr plane row-pointer array */
1465 /* Save NEON registers */
1468 /* Initially set d10, d11, d12, d13 to 0xFF */
1472 /* Outer loop over scanlines */
1476 ldr Y, [INPUT_BUF0, INPUT_ROW, lsl #2]   /* row pointers are 4 bytes */
1477 ldr U, [INPUT_BUF1, INPUT_ROW, lsl #2]
1479 ldr V, [INPUT_BUF2, INPUT_ROW, lsl #2]
1480 add INPUT_ROW, INPUT_ROW, #1
1481 ldr RGB, [OUTPUT_BUF], #4
1483 /* Inner loop over pixels */
1487 do_yuv_to_rgb_stage1
1491 do_yuv_to_rgb_stage2_store_load_stage1   /* pipelined steady state */
1495 do_yuv_to_rgb_stage2
1525 subs NUM_ROWS, NUM_ROWS, #1
1528 /* Restore all registers and return */
1530 pop {r4, r5, r6, r7, r8, r9, r10, pc}
/* Discard the helper macros so the next instantiation can redefine them */
1546 .purgem do_yuv_to_rgb
1547 .purgem do_yuv_to_rgb_stage1
1548 .purgem do_yuv_to_rgb_stage2
1549 .purgem do_yuv_to_rgb_stage2_store_load_stage1
/* Instantiate one YCbCr->RGB converter per extended pixel layout */
1553 /*--------------------------------- id ----- bpp R G B */
1554 generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, 1, 2
1555 generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, 1, 0
1556 generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2
1557 generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0
1558 generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1
1559 generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3
1564 /*****************************************************************************/
1567 * jsimd_extrgb_ycc_convert_neon
1568 * jsimd_extbgr_ycc_convert_neon
1569 * jsimd_extrgbx_ycc_convert_neon
1570 * jsimd_extbgrx_ycc_convert_neon
1571 * jsimd_extxbgr_ycc_convert_neon
1572 * jsimd_extxrgb_ycc_convert_neon
1574 * Colorspace conversion RGB -> YCbCr
/* Store \size converted samples to each of the Y, U and V output
 * pointers; d20/d21/d22 hold the Y/Cb/Cr bytes.  Partial groups are
 * stored one lane at a time.  NOTE(review): the full-group (size == 8)
 * stores and the .if size selectors fall outside this view. */
1577 .macro do_store size
/* 4 samples per component (lanes 0-3) */
1583 vst1.8 {d20[0]}, [Y]!
1584 vst1.8 {d20[1]}, [Y]!
1585 vst1.8 {d20[2]}, [Y]!
1586 vst1.8 {d20[3]}, [Y]!
1587 vst1.8 {d21[0]}, [U]!
1588 vst1.8 {d21[1]}, [U]!
1589 vst1.8 {d21[2]}, [U]!
1590 vst1.8 {d21[3]}, [U]!
1591 vst1.8 {d22[0]}, [V]!
1592 vst1.8 {d22[1]}, [V]!
1593 vst1.8 {d22[2]}, [V]!
1594 vst1.8 {d22[3]}, [V]!
/* Two more samples per component (lanes 4-5) */
1596 vst1.8 {d20[4]}, [Y]!
1597 vst1.8 {d20[5]}, [Y]!
1598 vst1.8 {d21[4]}, [U]!
1599 vst1.8 {d21[5]}, [U]!
1600 vst1.8 {d22[4]}, [V]!
1601 vst1.8 {d22[5]}, [V]!
/* One more sample per component (lane 6) */
1603 vst1.8 {d20[6]}, [Y]!
1604 vst1.8 {d21[6]}, [U]!
1605 vst1.8 {d22[6]}, [V]!
1607 .error unsupported macroblock size
/* Load a group of \size source pixels from RGB into the d10..d13 byte
 * planes: bpp == 24 uses 3-way deinterleaving loads (vld3.8),
 * bpp == 32 uses 4-way (vld4.8).  NOTE(review): the .if bpp / .if size
 * selectors between these groups fall outside this view. */
1611 .macro do_load bpp, size
/* 24 bpp, full group of 8 pixels */
1614 vld3.8 {d10, d11, d12}, [RGB]!
/* 24 bpp, partial group: one pixel (one lane per plane) at a time */
1617 vld3.8 {d10[0], d11[0], d12[0]}, [RGB]!
1618 vld3.8 {d10[1], d11[1], d12[1]}, [RGB]!
1619 vld3.8 {d10[2], d11[2], d12[2]}, [RGB]!
1620 vld3.8 {d10[3], d11[3], d12[3]}, [RGB]!
1622 vld3.8 {d10[4], d11[4], d12[4]}, [RGB]!
1623 vld3.8 {d10[5], d11[5], d12[5]}, [RGB]!
1625 vld3.8 {d10[6], d11[6], d12[6]}, [RGB]!
1627 .error unsupported macroblock size
/* 32 bpp, full group of 8 pixels */
1631 vld4.8 {d10, d11, d12, d13}, [RGB]!
/* 32 bpp, partial group: one pixel at a time */
1634 vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
1635 vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
1636 vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
1637 vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
1639 vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
1640 vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
1642 vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
1644 .error unsupported macroblock size
1647 .error unsupported bpp
/* Generates one jsimd_<colorid>_ycc_convert_neon function for the pixel
 * layout described by \bpp and the \r_offs/\g_offs/\b_offs byte offsets. */
1651 .macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs
1654 * 2 stage pipelined RGB->YCbCr conversion
/* Stage 1: widen the R/G/B bytes to 16 bits and accumulate the
 * fixed-point products.  Coefficients come from the constants table:
 * d0 = { 19595, 38470, 7471, 11059 }, d1 = { 21709, 32768, 27439, 5329 }.
 * q7/q8 accumulate Y; q9/q13 accumulate Cb; q14/q15 accumulate Cr.
 * NOTE(review): q9/q13 and q14/q15 are pre-initialized (rounding/bias)
 * on lines outside this view — confirm against the full macro. */
1657 .macro do_rgb_to_yuv_stage1
1658 vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */
1659 vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */
1660 vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */
1661 vmull.u16 q7, d4, d0[0]             /* Y += r * 19595 */
1662 vmlal.u16 q7, d6, d0[1]             /* Y += g * 38470 */
1663 vmlal.u16 q7, d8, d0[2]             /* Y += b * 7471 */
1664 vmull.u16 q8, d5, d0[0]
1665 vmlal.u16 q8, d7, d0[1]
1666 vmlal.u16 q8, d9, d0[2]
1669 vmlsl.u16 q9, d4, d0[3]             /* Cb -= r * 11059 */
1670 vmlsl.u16 q9, d6, d1[0]             /* Cb -= g * 21709 */
1671 vmlal.u16 q9, d8, d1[1]             /* Cb += b * 32768 */
1672 vmlsl.u16 q13, d5, d0[3]
1673 vmlsl.u16 q13, d7, d1[0]
1674 vmlal.u16 q13, d9, d1[1]
1677 vmlal.u16 q14, d4, d1[1]            /* Cr += r * 32768 */
1678 vmlsl.u16 q14, d6, d1[2]            /* Cr -= g * 27439 */
1679 vmlsl.u16 q14, d8, d1[3]            /* Cr -= b * 5329 */
1680 vmlal.u16 q15, d5, d1[1]
1681 vmlsl.u16 q15, d7, d1[2]
1682 vmlsl.u16 q15, d9, d1[3]
/* Stage 2: shift the 32-bit accumulators down by 16 and narrow the
 * results to bytes.  Y uses a rounding shift (vrshrn); Cb/Cr use a plain
 * shift (vshrn) — presumably their rounding/bias was folded into the
 * accumulators' initial values (set outside this view; confirm). */
1685 .macro do_rgb_to_yuv_stage2
1686 vrshrn.u32 d20, q7, #16
1687 vrshrn.u32 d21, q8, #16
1688 vshrn.u32 d22, q9, #16
1689 vshrn.u32 d23, q13, #16
1690 vshrn.u32 d24, q14, #16
1691 vshrn.u32 d25, q15, #16
1692 vmovn.u16 d20, q10 /* d20 = y */
1693 vmovn.u16 d21, q11 /* d21 = u */
1694 vmovn.u16 d22, q12 /* d22 = v */
/* Non-pipelined fallback: one full conversion group in two stages */
1697 .macro do_rgb_to_yuv
1698 do_rgb_to_yuv_stage1
1699 do_rgb_to_yuv_stage2
/* Software-pipelined loop body: stage 2 of the current pixel group is
 * interleaved with the widening loads and stage-1 multiplies of the next
 * group.  NOTE(review): the store and next-group load lines sit on
 * original lines outside this view. */
1702 .macro do_rgb_to_yuv_stage2_store_load_stage1
1703 vrshrn.u32 d20, q7, #16
1704 vrshrn.u32 d21, q8, #16
1705 vshrn.u32 d22, q9, #16
1707 vshrn.u32 d23, q13, #16
1709 vshrn.u32 d24, q14, #16
1710 vshrn.u32 d25, q15, #16
1712 vmovn.u16 d20, q10 /* d20 = y */
1713 vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */
1714 vmovn.u16 d21, q11 /* d21 = u */
1715 vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */
1716 vmovn.u16 d22, q12 /* d22 = v */
1717 vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */
/* Stage 1 for the next group (same sequence as do_rgb_to_yuv_stage1) */
1718 vmull.u16 q7, d4, d0[0]
1719 vmlal.u16 q7, d6, d0[1]
1720 vmlal.u16 q7, d8, d0[2]
1722 vmull.u16 q8, d5, d0[0]
1723 vmlal.u16 q8, d7, d0[1]
1724 vmlal.u16 q8, d9, d0[2]
1725 vmlsl.u16 q9, d4, d0[3]
1726 vmlsl.u16 q9, d6, d1[0]
1727 vmlal.u16 q9, d8, d1[1]
1729 vmlsl.u16 q13, d5, d0[3]
1730 vmlsl.u16 q13, d7, d1[0]
1731 vmlal.u16 q13, d9, d1[1]
1734 vmlal.u16 q14, d4, d1[1]
1735 vmlsl.u16 q14, d6, d1[2]
1736 vmlsl.u16 q14, d8, d1[3]
1738 vmlal.u16 q15, d5, d1[1]
1739 vmlsl.u16 q15, d7, d1[2]
1740 vmlsl.u16 q15, d9, d1[3]
/* Per-function constants (adr has limited range; see the ycc->rgb note):
 * d0/d1 = fixed-point RGB->YCbCr coefficients,
 * d2/d3 = { 32767, 128, ... } rounding/bias pattern. */
1744 jsimd_\colorid\()_ycc_neon_consts:
1745 .short 19595, 38470, 7471, 11059
1746 .short 21709, 32768, 27439, 5329
1747 .short 32767, 128, 32767, 128
1748 .short 32767, 128, 32767, 128
/* void jsimd_<colorid>_ycc_convert_neon(JDIMENSION output_width,
 *        JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
 *        JDIMENSION output_row, int num_rows)
 * NOTE(review): several register aliases and loop labels/branches are
 * outside this view. */
1750 asm_function jsimd_\colorid\()_ycc_convert_neon
1751 OUTPUT_WIDTH .req r0
1759 OUTPUT_BUF2 .req OUTPUT_BUF
1767 /* Load constants to d0, d1, d2, d3 */
1768 adr ip, jsimd_\colorid\()_ycc_neon_consts
1769 vld1.16 {d0, d1, d2, d3}, [ip, :128]
1771 /* Save ARM registers and handle input arguments */
1772 push {r4, r5, r6, r7, r8, r9, r10, lr}
1773 ldr NUM_ROWS, [sp, #(4 * 8)]        /* 5th arg: above the 8 pushed regs */
1774 ldr OUTPUT_BUF0, [OUTPUT_BUF]       /* Y plane row-pointer array */
1775 ldr OUTPUT_BUF1, [OUTPUT_BUF, #4]   /* Cb plane row-pointer array */
1776 ldr OUTPUT_BUF2, [OUTPUT_BUF, #8]   /* Cr plane row-pointer array */
1779 /* Save NEON registers */
1782 /* Outer loop over scanlines */
1786 ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2]  /* row pointers are 4 bytes */
1787 ldr U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2]
1789 ldr V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2]
1790 add OUTPUT_ROW, OUTPUT_ROW, #1
1791 ldr RGB, [INPUT_BUF], #4
1793 /* Inner loop over pixels */
1797 do_rgb_to_yuv_stage1
1801 do_rgb_to_yuv_stage2_store_load_stage1   /* pipelined steady state */
1805 do_rgb_to_yuv_stage2
1835 subs NUM_ROWS, NUM_ROWS, #1
1838 /* Restore all registers and return */
1840 pop {r4, r5, r6, r7, r8, r9, r10, pc}
/* Discard the helper macros so the next instantiation can redefine them */
1856 .purgem do_rgb_to_yuv
1857 .purgem do_rgb_to_yuv_stage1
1858 .purgem do_rgb_to_yuv_stage2
1859 .purgem do_rgb_to_yuv_stage2_store_load_stage1
/* Instantiate one RGB->YCbCr converter per extended pixel layout */
1863 /*--------------------------------- id ----- bpp R G B */
1864 generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2
1865 generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0
1866 generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
1867 generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
1868 generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
1869 generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3
1874 /*****************************************************************************/
1877 * Load data into workspace, applying unsigned->signed conversion
1879 * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
1880 * rid of VST1.16 instructions
/* Convert 8x8 sample rows to a signed 16-bit workspace: each 8-byte row
 * (row pointer + START_COL) is widened and has d0 subtracted — d0 is
 * presumably preloaded with CENTERJSAMPLE (128) on a line outside this
 * view; confirm.  Loads/stores are interleaved for scheduling.
 * NOTE(review): register aliases and the return are outside this view. */
1883 asm_function jsimd_convsamp_neon
1895 ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}  /* first 4 row pointers */
1896 add TMP1, TMP1, START_COL
1897 add TMP2, TMP2, START_COL
1898 add TMP3, TMP3, START_COL
1899 add TMP4, TMP4, START_COL
1900 vld1.8 {d16}, [TMP1]
1901 vsubl.u8 q8, d16, d0                /* widen and subtract center */
1902 vld1.8 {d18}, [TMP2]
1903 vsubl.u8 q9, d18, d0
1904 vld1.8 {d20}, [TMP3]
1905 vsubl.u8 q10, d20, d0
1906 vld1.8 {d22}, [TMP4]
1907 ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}  /* last 4 row pointers */
1908 vsubl.u8 q11, d22, d0
1909 vst1.16 {d16, d17, d18, d19}, [WORKSPACE, :128]!
1910 add TMP1, TMP1, START_COL
1911 add TMP2, TMP2, START_COL
1912 vst1.16 {d20, d21, d22, d23}, [WORKSPACE, :128]!
1913 add TMP3, TMP3, START_COL
1914 add TMP4, TMP4, START_COL
1915 vld1.8 {d24}, [TMP1]
1916 vsubl.u8 q12, d24, d0
1917 vld1.8 {d26}, [TMP2]
1918 vsubl.u8 q13, d26, d0
1919 vld1.8 {d28}, [TMP3]
1920 vsubl.u8 q14, d28, d0
1921 vld1.8 {d30}, [TMP4]
1922 vsubl.u8 q15, d30, d0
1923 vst1.16 {d24, d25, d26, d27}, [WORKSPACE, :128]!
1924 vst1.16 {d28, d29, d30, d31}, [WORKSPACE, :128]!
1937 /*****************************************************************************/
1940 * jsimd_fdct_ifast_neon
1942 * This function contains a fast, not so accurate integer implementation of
1943 * the forward DCT (Discrete Cosine Transform). It uses the same calculations
1944 * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
1945 * function from jfdctfst.c
1947 * TODO: can be combined with 'jsimd_convsamp_neon' to get
1948 * rid of a bunch of VLD1.16 instructions
/* Fixed-point multipliers for the ifast FDCT, kept in lanes of d0 */
1951 #define XFIX_0_382683433 d0[0]
1952 #define XFIX_0_541196100 d0[1]
1953 #define XFIX_0_707106781 d0[2]
1954 #define XFIX_1_306562965 d0[3]
/* Constants scaled by 128 to suit vqdmulh (doubling high-half multiply) */
1957 jsimd_fdct_ifast_neon_consts:
1958 .short (98 * 128) /* XFIX_0_382683433 */
1959 .short (139 * 128) /* XFIX_0_541196100 */
1960 .short (181 * 128) /* XFIX_0_707106781 */
1961 .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */
/* void jsimd_fdct_ifast_neon(DCTELEM *data) — in-place forward DCT.
 * NOTE(review): transpose and parts of the pass sequence fall on lines
 * outside this view. */
1963 asm_function jsimd_fdct_ifast_neon
1970 /* Load constants */
1971 adr TMP, jsimd_fdct_ifast_neon_consts
1972 vld1.16 {d0}, [TMP, :64]
1974 /* Load all DATA into NEON registers with the following allocation:
1976 * ---------+--------
1977 * 0 | d16 | d17 | q8
1978 * 1 | d18 | d19 | q9
1979 * 2 | d20 | d21 | q10
1980 * 3 | d22 | d23 | q11
1981 * 4 | d24 | d25 | q12
1982 * 5 | d26 | d27 | q13
1983 * 6 | d28 | d29 | q14
1984 * 7 | d30 | d31 | q15
1987 vld1.16 {d16, d17, d18, d19}, [DATA, :128]!
1988 vld1.16 {d20, d21, d22, d23}, [DATA, :128]!
1989 vld1.16 {d24, d25, d26, d27}, [DATA, :128]!
1990 vld1.16 {d28, d29, d30, d31}, [DATA, :128]
1991 sub DATA, DATA, #(128 - 32)         /* rewind past the 3 post-increments */
/* Butterfly stage: sums/differences of mirrored row pairs */
2008 vadd.s16 q2, q11, q12
2010 vsub.s16 q12, q11, q12
2011 vsub.s16 q6, q10, q13
2012 vadd.s16 q10, q10, q13
2013 vsub.s16 q7, q9, q14
2014 vadd.s16 q9, q9, q14
2015 vsub.s16 q1, q8, q15
2016 vadd.s16 q8, q8, q15
2017 vsub.s16 q4, q9, q10
2019 vadd.s16 q3, q9, q10
/* Rotations by the XFIX_* constants (vqdmulh = doubling high multiply) */
2022 vqdmulh.s16 q4, q4, XFIX_0_707106781
2023 vadd.s16 q11, q12, q6
2025 vsub.s16 q12, q2, q3
2028 vqdmulh.s16 q3, q3, XFIX_0_707106781
2029 vsub.s16 q6, q11, q7
2030 vadd.s16 q10, q5, q4
2031 vqdmulh.s16 q6, q6, XFIX_0_382683433
2032 vsub.s16 q14, q5, q4
2033 vqdmulh.s16 q11, q11, XFIX_0_541196100
2034 vqdmulh.s16 q5, q7, XFIX_1_306562965
2038 vadd.s16 q11, q11, q6
2040 vadd.s16 q13, q3, q11
2041 vsub.s16 q11, q3, q11
2043 vsub.s16 q15, q4, q7
/* Store the transformed block back in place */
2048 vst1.16 {d16, d17, d18, d19}, [DATA, :128]!
2049 vst1.16 {d20, d21, d22, d23}, [DATA, :128]!
2050 vst1.16 {d24, d25, d26, d27}, [DATA, :128]!
2051 vst1.16 {d28, d29, d30, d31}, [DATA, :128]
2060 /*****************************************************************************/
2064 * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM * divisors,
2065 * DCTELEM * workspace);
2067 * Note: the code uses 2 stage pipelining in order to improve instructions
2068 * scheduling and eliminate stalls (this provides ~15% better
2069 * performance for this function on both ARM Cortex-A8 and
2070 * ARM Cortex-A9 when compared to the non-pipelined variant).
2071 * The instructions which belong to the second stage use different
2072 * indentation for better readability.
/* Quantize 64 DCT coefficients: |coef| + correction, multiply by the
 * 16-bit reciprocal, shift, then restore the sign (eor/sub pair).
 * Two-stage pipelined as described in the comment block above.
 * NOTE(review): register aliases, the vabs step producing q12/q13, and
 * the loop label/branch fall on lines outside this view. */
2074 asm_function jsimd_quantize_neon
2080 RECIPROCAL .req DIVISORS            /* reciprocal table is at offset 0 */
/* --- first iteration, stage 1 --- */
2085 vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]!
2087 add CORRECTION, DIVISORS, #(64 * 2) /* 2nd 64-entry table: correction */
2088 add SHIFT, DIVISORS, #(64 * 6)      /* 4th 64-entry table: shift */
2089 vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]!
2091 vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]!
2092 vadd.u16 q12, q12, q10 /* add correction */
2093 vadd.u16 q13, q13, q11
2094 vmull.u16 q10, d24, d16 /* multiply by reciprocal */
2095 vmull.u16 q11, d25, d17
2096 vmull.u16 q8, d26, d18
2097 vmull.u16 q9, d27, d19
2098 vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]!
2099 vshrn.u32 d20, q10, #16
2100 vshrn.u32 d21, q11, #16
2101 vshrn.u32 d22, q8, #16
2102 vshrn.u32 d23, q9, #16
2105 vshr.s16 q2, q0, #15 /* extract sign */
2106 vshr.s16 q3, q1, #15
2107 vshl.u16 q14, q10, q12 /* shift */
2108 vshl.u16 q15, q11, q13
/* --- steady-state loop body: finish current block, start next --- */
2113 vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]!
2114 veor.u16 q14, q14, q2 /* restore sign */
2116 vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]!
2118 veor.u16 q15, q15, q3
2119 vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]!
2120 vadd.u16 q12, q12, q10 /* add correction */
2121 vadd.u16 q13, q13, q11
2122 vmull.u16 q10, d24, d16 /* multiply by reciprocal */
2123 vmull.u16 q11, d25, d17
2124 vmull.u16 q8, d26, d18
2125 vmull.u16 q9, d27, d19
2126 vsub.u16 q14, q14, q2               /* complete two's-complement negate */
2127 vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]!
2128 vsub.u16 q15, q15, q3
2129 vshrn.u32 d20, q10, #16
2130 vshrn.u32 d21, q11, #16
2131 vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
2132 vshrn.u32 d22, q8, #16
2133 vshrn.u32 d23, q9, #16
2136 vshr.s16 q2, q0, #15 /* extract sign */
2137 vshr.s16 q3, q1, #15
2138 vshl.u16 q14, q10, q12 /* shift */
2139 vshl.u16 q15, q11, q13
2140 subs LOOP_COUNT, LOOP_COUNT, #1
/* --- epilogue: flush the final pipelined block --- */
2144 veor.u16 q14, q14, q2 /* restore sign */
2145 veor.u16 q15, q15, q3
2146 vsub.u16 q14, q14, q2
2147 vsub.u16 q15, q15, q3
2148 vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
2161 /*****************************************************************************/
2165 * jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor,
2166 * JDIMENSION downsampled_width,
2167 * JSAMPARRAY input_data,
2168 * JSAMPARRAY * output_data_ptr);
2170 * Note: the use of unaligned writes is the main remaining bottleneck in
2171 * this code; solving it could potentially yield up to tens
2172 * of percent performance improvement on Cortex-A8/Cortex-A9.
2176 * Upsample 16 source pixels to 32 destination pixels. The new 16 source
2177 * pixels are loaded to q0. The previous 16 source pixels are in q1. The
2178 * shifted-by-one source pixels are constructed in q2 by using q0 and q1.
2179 * Register d28 is used for multiplication by 3. Register q15 is used
2180 * for adding +1 bias.
/* See the block comment above: q1 = previous 16 pixels, d28 = x3
 * multiplier, q15 = +1 bias.  q8/q9 are pre-initialized on lines
 * outside this view — confirm against the full file.  The vrshrn
 * (rounding) vs vshrn split gives the two different rounding biases of
 * jpeglib's "fancy" (triangular) h2v1 upsampling filter. */
2182 .macro upsample16 OUTPTR, INPTR
2183 vld1.8 {q0}, [\INPTR]!              /* q0 = next 16 source pixels */
2185 vext.8 q2, q1, q0, #15              /* q2 = pixels shifted by one */
2187 vaddw.u8 q10, q15, d4
2188 vaddw.u8 q11, q15, d5
2189 vmlal.u8 q8, d4, d28                /* += 3 * shifted pixels */
2190 vmlal.u8 q9, d5, d28
2191 vmlal.u8 q10, d0, d28               /* += 3 * current pixels */
2192 vmlal.u8 q11, d1, d28
2193 vmov q1, q0 /* backup source pixels to q1 */
2194 vrshrn.u16 d6, q8, #2
2195 vrshrn.u16 d7, q9, #2
2196 vshrn.u16 d8, q10, #2
2197 vshrn.u16 d9, q11, #2
2198 vst2.8 {d6, d7, d8, d9}, [\OUTPTR]! /* interleave into 32 outputs */
2202 * Upsample 32 source pixels to 64 destination pixels. Compared to 'upsample16'
2203 * macro, the roles of q0 and q1 registers are reversed for even and odd
2204 * groups of 16 pixels, that's why "vmov q1, q0" instructions are not needed.
2205 * Also this unrolling allows to reorder loads and stores to compensate
2206 * multiplication latency and reduce stalls.
/* Unrolled x2 variant of upsample16 (see the block comment above): the
 * even group reads the previous pixels from q1, the odd group from q0,
 * so no vmov backup is needed, and the second store is delayed past the
 * odd group's multiplies to hide latency.  NOTE(review): the q8/q9
 * initializations sit on lines outside this view. */
2208 .macro upsample32 OUTPTR, INPTR
2209 /* even 16 pixels group */
2210 vld1.8 {q0}, [\INPTR]!
2212 vext.8 q2, q1, q0, #15
2214 vaddw.u8 q10, q15, d4
2215 vaddw.u8 q11, q15, d5
2216 vmlal.u8 q8, d4, d28
2217 vmlal.u8 q9, d5, d28
2218 vmlal.u8 q10, d0, d28
2219 vmlal.u8 q11, d1, d28
2220 /* odd 16 pixels group */
2221 vld1.8 {q1}, [\INPTR]!
2222 vrshrn.u16 d6, q8, #2
2223 vrshrn.u16 d7, q9, #2
2224 vshrn.u16 d8, q10, #2
2225 vshrn.u16 d9, q11, #2
2227 vext.8 q2, q0, q1, #15
2229 vaddw.u8 q10, q15, d4
2230 vaddw.u8 q11, q15, d5
2231 vmlal.u8 q8, d4, d28
2232 vmlal.u8 q9, d5, d28
2233 vmlal.u8 q10, d2, d28
2234 vmlal.u8 q11, d3, d28
2235 vst2.8 {d6, d7, d8, d9}, [\OUTPTR]! /* store the even group */
2236 vrshrn.u16 d6, q8, #2
2237 vrshrn.u16 d7, q9, #2
2238 vshrn.u16 d8, q10, #2
2239 vshrn.u16 d9, q11, #2
2240 vst2.8 {d6, d7, d8, d9}, [\OUTPTR]! /* store the odd group */
2244 * Upsample a row of WIDTH pixels from INPTR to OUTPTR.
/* Upsample one full row: first/last pixels are copied specially, then
 * bulk 32- and 16-pixel groups, then a 1-15 pixel tail.
 * NOTE(review): the conditional branches between sections and the
 * remaining .if/size selectors fall on lines outside this view. */
2246 .macro upsample_row OUTPTR, INPTR, WIDTH, TMP1
2247 /* special case for the first and last pixels */
2248 sub \WIDTH, \WIDTH, #1
2249 add \OUTPTR, \OUTPTR, #1
2250 ldrb \TMP1, [\INPTR, \WIDTH]        /* replicate the last source pixel */
2251 strb \TMP1, [\OUTPTR, \WIDTH, asl #1]
2252 ldrb \TMP1, [\INPTR], #1            /* replicate the first source pixel */
2253 strb \TMP1, [\OUTPTR, #-1]
2256 subs \WIDTH, \WIDTH, #32
2258 0: /* process 32 pixels per iteration */
2259 upsample32 \OUTPTR, \INPTR
2260 subs \WIDTH, \WIDTH, #32
2263 adds \WIDTH, \WIDTH, #16
2265 0: /* process 16 pixels if needed */
2266 upsample16 \OUTPTR, \INPTR
2267 subs \WIDTH, \WIDTH, #16
2269 adds \WIDTH, \WIDTH, #16
2272 /* load the remaining 1-15 pixels */
2273 add \INPTR, \INPTR, \WIDTH
/* Tail loads: fetch the trailing pixels into d0/q0, lane by lane or as
 * a whole 8-byte group, walking backwards from the end of the row */
2276 sub \INPTR, \INPTR, #1
2277 vld1.8 {d0[0]}, [\INPTR]
2281 vext.8 d0, d0, d0, #6               /* rotate loaded lanes into place */
2282 sub \INPTR, \INPTR, #1
2283 vld1.8 {d0[1]}, [\INPTR]
2284 sub \INPTR, \INPTR, #1
2285 vld1.8 {d0[0]}, [\INPTR]
2290 sub \INPTR, \INPTR, #1
2291 vld1.8 {d0[3]}, [\INPTR]
2292 sub \INPTR, \INPTR, #1
2293 vld1.8 {d0[2]}, [\INPTR]
2294 sub \INPTR, \INPTR, #1
2295 vld1.8 {d0[1]}, [\INPTR]
2296 sub \INPTR, \INPTR, #1
2297 vld1.8 {d0[0]}, [\INPTR]
2302 sub \INPTR, \INPTR, #8
2303 vld1.8 {d0}, [\INPTR]               /* 8 trailing pixels in one load */
2304 2: /* upsample the remaining pixels */
2306 vext.8 q2, q1, q0, #15
2308 vaddw.u8 q10, q15, d4
2309 vaddw.u8 q11, q15, d5
2310 vmlal.u8 q8, d4, d28
2311 vmlal.u8 q9, d5, d28
2312 vmlal.u8 q10, d0, d28
2313 vmlal.u8 q11, d1, d28
2314 vrshrn.u16 d10, q8, #2
2315 vrshrn.u16 d12, q9, #2
2316 vshrn.u16 d11, q10, #2
2317 vshrn.u16 d13, q11, #2
2320 /* store the remaining pixels */
2323 vst1.8 {d10, d11}, [\OUTPTR]!       /* 16 output pixels */
2328 vst1.8 {d10}, [\OUTPTR]!            /* 8 output pixels */
2333 vst1.8 {d10[0]}, [\OUTPTR]!         /* then lane-by-lane tails */
2334 vst1.8 {d10[1]}, [\OUTPTR]!
2335 vst1.8 {d10[2]}, [\OUTPTR]!
2336 vst1.8 {d10[3]}, [\OUTPTR]!
2337 vext.8 d10, d10, d10, #4
2341 vst1.8 {d10[0]}, [\OUTPTR]!
2342 vst1.8 {d10[1]}, [\OUTPTR]!
/* void jsimd_h2v1_fancy_upsample_neon(int max_v_samp_factor,
 *        JDIMENSION downsampled_width, JSAMPARRAY input_data,
 *        JSAMPARRAY *output_data_ptr)
 * Upsamples max_v_samp_factor rows via upsample_row.
 * NOTE(review): some register aliases, the constant initialization and
 * the loop label/branches fall on lines outside this view. */
2347 asm_function jsimd_h2v1_fancy_upsample_neon
2349 MAX_V_SAMP_FACTOR .req r0
2350 DOWNSAMPLED_WIDTH .req r1
2352 OUTPUT_DATA_PTR .req r3
2353 OUTPUT_DATA .req OUTPUT_DATA_PTR
2360 push {r4, r5, r6, lr}
2363 ldr OUTPUT_DATA, [OUTPUT_DATA_PTR]  /* dereference output_data_ptr */
2364 cmp MAX_V_SAMP_FACTOR, #0           /* nothing to do for <= 0 rows */
2367 /* initialize constants */
/* Per-row loop: fetch next input/output row pointers and upsample */
2371 ldr INPTR, [INPUT_DATA], #4
2372 ldr OUTPTR, [OUTPUT_DATA], #4
2373 mov WIDTH, DOWNSAMPLED_WIDTH
2374 upsample_row OUTPTR, INPTR, WIDTH, TMP
2375 subs MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1
2380 pop {r4, r5, r6, pc}
/* Release register aliases and helper macros */
2382 .unreq MAX_V_SAMP_FACTOR
2383 .unreq DOWNSAMPLED_WIDTH
2385 .unreq OUTPUT_DATA_PTR
2397 .purgem upsample_row