/*
 * ARMv7 NEON optimizations for libjpeg-turbo
 *
 * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
 * Author:  Siarhei Siamashka <siarhei.siamashka@nokia.com>
 * Copyright (C) 2014, Siarhei Siamashka.  All Rights Reserved.
 * Copyright (C) 2014, Linaro Limited.  All Rights Reserved.
 * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
 * Copyright (C) 2015-2016, 2018, Matthieu Darbois.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software.  If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack, "", %progbits  /* mark stack as non-executable */
#endif

#define RESPECT_STRICT_ALIGNMENT  1
/*****************************************************************************/

/* Supplementary macro for setting function attributes */
.macro asm_function fname
#ifdef __APPLE__
    .private_extern _\fname
    .globl _\fname
_\fname:
#else
    .global \fname
#ifdef __ELF__
    .hidden \fname
    .type \fname, %function
#endif
\fname:
#endif
.endm

/* Transpose a block of 4x4 coefficients in four 64-bit registers */
.macro transpose_4x4 x0, x1, x2, x3
    vtrn.16         \x0, \x1
    vtrn.16         \x2, \x3
    vtrn.32         \x0, \x2
    vtrn.32         \x1, \x3
.endm

#define CENTERJSAMPLE  128

/*****************************************************************************/
/*
 * Perform dequantization and inverse DCT on one block of coefficients.
 *
 * GLOBAL(void)
 * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
 *                       JSAMPARRAY output_buf, JDIMENSION output_col)
 */
#define FIX_0_298631336  (2446)
#define FIX_0_390180644  (3196)
#define FIX_0_541196100  (4433)
#define FIX_0_765366865  (6270)
#define FIX_0_899976223  (7373)
#define FIX_1_175875602  (9633)
#define FIX_1_501321110  (12299)
#define FIX_1_847759065  (15137)
#define FIX_1_961570560  (16069)
#define FIX_2_053119869  (16819)
#define FIX_2_562915447  (20995)
#define FIX_3_072711026  (25172)

#define FIX_1_175875602_MINUS_1_961570560  (FIX_1_175875602 - FIX_1_961570560)
#define FIX_1_175875602_MINUS_0_390180644  (FIX_1_175875602 - FIX_0_390180644)
#define FIX_0_541196100_MINUS_1_847759065  (FIX_0_541196100 - FIX_1_847759065)
#define FIX_3_072711026_MINUS_2_562915447  (FIX_3_072711026 - FIX_2_562915447)
#define FIX_0_298631336_MINUS_0_899976223  (FIX_0_298631336 - FIX_0_899976223)
#define FIX_1_501321110_MINUS_0_899976223  (FIX_1_501321110 - FIX_0_899976223)
#define FIX_2_053119869_MINUS_2_562915447  (FIX_2_053119869 - FIX_2_562915447)
#define FIX_0_541196100_PLUS_0_765366865   (FIX_0_541196100 + FIX_0_765366865)
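
/*
 * The FIX_* values above are the ISLOW IDCT cosine constants in 13-bit
 * fixed point (CONST_BITS = 13), following the FIX() convention of
 * jidctint.c.  A compile-time-disabled C sketch (not part of the original
 * code) showing how such values can be regenerated:
 */
#if 0
#include <stdio.h>

#define CONST_BITS  13
#define FIX(x)  ((int)((x) * (1 << CONST_BITS) + 0.5))

int main(void)
{
  /* prints "2446 25172", matching FIX_0_298631336 and FIX_3_072711026 */
  printf("%d %d\n", FIX(0.298631336), FIX(3.072711026));
  return 0;
}
#endif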
/*
 * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
 * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
 */
#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) { \
  DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
  JLONG q1, q2, q3, q4, q5, q6, q7; \
  JLONG tmp11_plus_tmp2, tmp11_minus_tmp2; \
  \
  /* 1-D iDCT input data */ \
  \
  q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
       MULTIPLY(q4, FIX_1_175875602); \
  q7 = MULTIPLY(q5, FIX_1_175875602) + \
       MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
  q2 = MULTIPLY(row2, FIX_0_541196100) + \
       MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
  q3 = ((JLONG)row0 - (JLONG)row4) << 13; \
  q6 += MULTIPLY(row5, -FIX_2_562915447) + \
        MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
  /* now we can use q1 (reloadable constants have been used up) */ \
  q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
        MULTIPLY(row1, -FIX_0_899976223); \
  q7 += MULTIPLY(row7, -FIX_0_899976223) + \
        MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
  /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
  tmp11_plus_tmp2 = q1; \
  q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
        MULTIPLY(row3, -FIX_2_562915447); \
  q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
       MULTIPLY(row6, FIX_0_541196100); \
  /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
  tmp11_minus_tmp2 = q1; \
  q1 = ((JLONG)row0 + (JLONG)row4) << 13; \
  \
  /* pick up the results */ \
  tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
  tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
}
#define XFIX_0_899976223                    d0[0]
#define XFIX_0_541196100                    d0[1]
#define XFIX_2_562915447                    d0[2]
#define XFIX_0_298631336_MINUS_0_899976223  d0[3]
#define XFIX_1_501321110_MINUS_0_899976223  d1[0]
#define XFIX_2_053119869_MINUS_2_562915447  d1[1]
#define XFIX_0_541196100_PLUS_0_765366865   d1[2]
#define XFIX_1_175875602                    d1[3]
#define XFIX_1_175875602_MINUS_0_390180644  d2[0]
#define XFIX_0_541196100_MINUS_1_847759065  d2[1]
#define XFIX_3_072711026_MINUS_2_562915447  d2[2]
#define XFIX_1_175875602_MINUS_1_961570560  d2[3]
.balign 16
jsimd_idct_islow_neon_consts:
    .short FIX_0_899976223                    /* d0[0] */
    .short FIX_0_541196100                    /* d0[1] */
    .short FIX_2_562915447                    /* d0[2] */
    .short FIX_0_298631336_MINUS_0_899976223  /* d0[3] */
    .short FIX_1_501321110_MINUS_0_899976223  /* d1[0] */
    .short FIX_2_053119869_MINUS_2_562915447  /* d1[1] */
    .short FIX_0_541196100_PLUS_0_765366865   /* d1[2] */
    .short FIX_1_175875602                    /* d1[3] */
    /* reloadable constants */
    .short FIX_1_175875602_MINUS_0_390180644  /* d2[0] */
    .short FIX_0_541196100_MINUS_1_847759065  /* d2[1] */
    .short FIX_3_072711026_MINUS_2_562915447  /* d2[2] */
    .short FIX_1_175875602_MINUS_1_961570560  /* d2[3] */
asm_function jsimd_idct_islow_neon

    /* Load and dequantize coefficients into NEON registers
     * with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d16     | d17    ( q8  )
     *   1 | d18     | d19    ( q9  )
     *   2 | d20     | d21    ( q10 )
     *   3 | d22     | d23    ( q11 )
     *   4 | d24     | d25    ( q12 )
     *   5 | d26     | d27    ( q13 )
     *   6 | d28     | d29    ( q14 )
     *   7 | d30     | d31    ( q15 )
     */
    adr             ip, jsimd_idct_islow_neon_consts
    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
    vmul.s16        q10, q10, q2
    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vmul.s16        q11, q11, q3
    vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
    vmul.s16        q12, q12, q0
    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16        q14, q14, q2
    vmul.s16        q13, q13, q1
    vld1.16         {d0, d1, d2, d3}, [ip, :128]  /* load constants */
    vmul.s16        q15, q15, q3
    vpush           {d8-d15}                      /* save NEON registers */
    /* 1-D IDCT, pass 1, left 4x8 half */
    vadd.s16        d4, ROW7L, ROW3L
    vadd.s16        d5, ROW5L, ROW1L
    vmull.s16       q6, d4, XFIX_1_175875602_MINUS_1_961570560
    vmlal.s16       q6, d5, XFIX_1_175875602
    vmull.s16       q7, d4, XFIX_1_175875602
    /* Check for the zero coefficients in the right 4x8 half */
    vmlal.s16       q7, d5, XFIX_1_175875602_MINUS_0_390180644
    vsubl.s16       q3, ROW0L, ROW4L
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
    vmull.s16       q2, ROW2L, XFIX_0_541196100
    vmlal.s16       q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
    vmlsl.s16       q6, ROW5L, XFIX_2_562915447
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
    vmlsl.s16       q7, ROW7L, XFIX_0_899976223
    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
    vrshrn.s32      ROW1L, q1, #11
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
    vmlal.s16       q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
    vmlal.s16       q6, ROW6L, XFIX_0_541196100
    vrshrn.s32      ROW6L, q1, #11
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
    vaddl.s16       q5, ROW0L, ROW4L
    vrshrn.s32      ROW2L, q1, #11
    vrshrn.s32      ROW5L, q3, #11
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
    vmlal.s16       q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
    vrshrn.s32      ROW7L, q2, #11
    vrshrn.s32      ROW3L, q5, #11
    vrshrn.s32      ROW0L, q6, #11
    vrshrn.s32      ROW4L, q3, #11
    beq             3f  /* Go to do some special handling for the sparse
                           right 4x8 half */
    /* 1-D IDCT, pass 1, right 4x8 half */
    vld1.s16        {d2}, [ip, :64]  /* reload constants */
    vadd.s16        d10, ROW7R, ROW3R
    vadd.s16        d8, ROW5R, ROW1R
    /* Transpose left 4x8 half */
    vmull.s16       q6, d10, XFIX_1_175875602_MINUS_1_961570560
    vmlal.s16       q6, d8, XFIX_1_175875602
    vmull.s16       q7, d10, XFIX_1_175875602
    vmlal.s16       q7, d8, XFIX_1_175875602_MINUS_0_390180644
    vsubl.s16       q3, ROW0R, ROW4R
    vmull.s16       q2, ROW2R, XFIX_0_541196100
    vmlal.s16       q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
    vmlsl.s16       q6, ROW5R, XFIX_2_562915447
    vmlal.s16       q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
    vmlsl.s16       q4, ROW1R, XFIX_0_899976223
    vmlsl.s16       q7, ROW7R, XFIX_0_899976223
    vmlal.s16       q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
    vrshrn.s32      ROW1R, q1, #11
    vmlal.s16       q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
    vmlsl.s16       q5, ROW3R, XFIX_2_562915447
    vmull.s16       q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
    vmlal.s16       q6, ROW6R, XFIX_0_541196100
    vrshrn.s32      ROW6R, q1, #11
    vaddl.s16       q5, ROW0R, ROW4R
    vrshrn.s32      ROW2R, q1, #11
    vrshrn.s32      ROW5R, q3, #11
    vmlal.s16       q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
    vrshrn.s32      ROW7R, q2, #11
    vrshrn.s32      ROW3R, q5, #11
    vrshrn.s32      ROW0R, q6, #11
    vrshrn.s32      ROW4R, q3, #11
    /* Transpose right 4x8 half */
1:  /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
    vld1.s16        {d2}, [ip, :64]  /* reload constants */
    vmull.s16       q6, ROW1R, XFIX_1_175875602                  /* ROW5L <-> ROW1R */
    vmlal.s16       q6, ROW1L, XFIX_1_175875602
    vmlal.s16       q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
    vmlal.s16       q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
    vmull.s16       q7, ROW3R, XFIX_1_175875602                  /* ROW7L <-> ROW3R */
    vmlal.s16       q7, ROW3L, XFIX_1_175875602
    vmlal.s16       q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
    vmlal.s16       q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
    vsubl.s16       q3, ROW0L, ROW0R                             /* ROW4L <-> ROW0R */
    vmull.s16       q2, ROW2L, XFIX_0_541196100
    vmlal.s16       q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065  /* ROW6L <-> ROW2R */
    vmlsl.s16       q6, ROW1R, XFIX_2_562915447                  /* ROW5L <-> ROW1R */
    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
    vmlsl.s16       q7, ROW3R, XFIX_0_899976223                  /* ROW7L <-> ROW3R */
    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
    vshrn.s32       ROW1L, q1, #16
    vmlal.s16       q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447  /* ROW5L <-> ROW1R */
    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
    vmlal.s16       q6, ROW2R, XFIX_0_541196100                  /* ROW6L <-> ROW2R */
    vshrn.s32       ROW2R, q1, #16                               /* ROW6L <-> ROW2R */
    vaddl.s16       q5, ROW0L, ROW0R                             /* ROW4L <-> ROW0R */
    vshrn.s32       ROW2L, q1, #16
    vshrn.s32       ROW1R, q3, #16                               /* ROW5L <-> ROW1R */
    vmlal.s16       q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223  /* ROW7L <-> ROW3R */
    vshrn.s32       ROW3R, q2, #16                               /* ROW7L <-> ROW3R */
    vshrn.s32       ROW3L, q5, #16
    vshrn.s32       ROW0L, q6, #16
    vshrn.s32       ROW0R, q3, #16                               /* ROW4L <-> ROW0R */
    /* 1-D IDCT, pass 2, right 4x8 half */
    vld1.s16        {d2}, [ip, :64]  /* reload constants */
    vmull.s16       q6, ROW5R, XFIX_1_175875602
    vmlal.s16       q6, ROW5L, XFIX_1_175875602                  /* ROW5L <-> ROW1R */
    vmlal.s16       q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
    vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
    vmull.s16       q7, ROW7R, XFIX_1_175875602
    vmlal.s16       q7, ROW7L, XFIX_1_175875602                  /* ROW7L <-> ROW3R */
    vmlal.s16       q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
    vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
    vsubl.s16       q3, ROW4L, ROW4R                             /* ROW4L <-> ROW0R */
    vmull.s16       q2, ROW6L, XFIX_0_541196100                  /* ROW6L <-> ROW2R */
    vmlal.s16       q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
    vmlsl.s16       q6, ROW5R, XFIX_2_562915447
    vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447  /* ROW7L <-> ROW3R */
    vmlsl.s16       q4, ROW5L, XFIX_0_899976223                  /* ROW5L <-> ROW1R */
    vmlsl.s16       q7, ROW7R, XFIX_0_899976223
    vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223  /* ROW5L <-> ROW1R */
    vshrn.s32       ROW5L, q1, #16                               /* ROW5L <-> ROW1R */
    vmlal.s16       q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
    vmlsl.s16       q5, ROW7L, XFIX_2_562915447                  /* ROW7L <-> ROW3R */
    vmull.s16       q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865  /* ROW6L <-> ROW2R */
    vmlal.s16       q6, ROW6R, XFIX_0_541196100
    vshrn.s32       ROW6R, q1, #16
    vaddl.s16       q5, ROW4L, ROW4R                             /* ROW4L <-> ROW0R */
    vshrn.s32       ROW6L, q1, #16                               /* ROW6L <-> ROW2R */
    vshrn.s32       ROW5R, q3, #16
    vmlal.s16       q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
    vshrn.s32       ROW7R, q2, #16
    vshrn.s32       ROW7L, q5, #16                               /* ROW7L <-> ROW3R */
    vshrn.s32       ROW4L, q6, #16                               /* ROW4L <-> ROW0R */
    vshrn.s32       ROW4R, q3, #16
2:  /* Descale to 8-bit and range limit */
    vqrshrn.s16     d16, q8, #2
    vqrshrn.s16     d17, q9, #2
    vqrshrn.s16     d18, q10, #2
    vqrshrn.s16     d19, q11, #2
    vpop            {d8-d15}  /* restore NEON registers */
    vqrshrn.s16     d20, q12, #2
    /* Transpose the final 8-bit samples and do signed->unsigned conversion */
    vqrshrn.s16     d21, q13, #2
    vqrshrn.s16     d22, q14, #2
    vmov.u8         q0, #(CENTERJSAMPLE)
    vqrshrn.s16     d23, q15, #2

    /* Store results to the output buffer */
    ldmia           OUTPUT_BUF!, {TMP1, TMP2}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    ldmia           OUTPUT_BUF!, {TMP1, TMP2}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    add             TMP3, TMP3, OUTPUT_COL
    add             TMP4, TMP4, OUTPUT_COL
3:  /* Left 4x8 half is done, right 4x8 half contains mostly zeros */

    /* Transpose left 4x8 half */
    vshl.s16        ROW0R, ROW0R, #2  /* PASS1_BITS */
    beq             4f  /* Right 4x8 half has all zeros, go to 'sparse' second
                           pass */

    /* Only row 0 is non-zero for the right 4x8 half */
    vdup.s16        ROW1R, ROW0R[1]
    vdup.s16        ROW2R, ROW0R[2]
    vdup.s16        ROW3R, ROW0R[3]
    vdup.s16        ROW4R, ROW0R[0]
    vdup.s16        ROW5R, ROW0R[1]
    vdup.s16        ROW6R, ROW0R[2]
    vdup.s16        ROW7R, ROW0R[3]
    vdup.s16        ROW0R, ROW0R[0]
    b               1b  /* Go to 'normal' second pass */
4:  /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
    vld1.s16        {d2}, [ip, :64]  /* reload constants */
    vmull.s16       q6, ROW1L, XFIX_1_175875602
    vmlal.s16       q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
    vmull.s16       q7, ROW3L, XFIX_1_175875602
    vmlal.s16       q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
    vmull.s16       q2, ROW2L, XFIX_0_541196100
    vshll.s16       q3, ROW0L, #13
    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
    vshrn.s32       ROW1L, q1, #16
    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
    vshrn.s32       ROW2R, q1, #16  /* ROW6L <-> ROW2R */
    vshll.s16       q5, ROW0L, #13
    vshrn.s32       ROW2L, q1, #16
    vshrn.s32       ROW1R, q3, #16  /* ROW5L <-> ROW1R */
    vshrn.s32       ROW3R, q2, #16  /* ROW7L <-> ROW3R */
    vshrn.s32       ROW3L, q5, #16
    vshrn.s32       ROW0L, q6, #16
    vshrn.s32       ROW0R, q3, #16  /* ROW4L <-> ROW0R */
    /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
    vld1.s16        {d2}, [ip, :64]  /* reload constants */
    vmull.s16       q6, ROW5L, XFIX_1_175875602
    vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
    vmull.s16       q7, ROW7L, XFIX_1_175875602
    vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
    vmull.s16       q2, ROW6L, XFIX_0_541196100
    vshll.s16       q3, ROW4L, #13
    vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
    vmlsl.s16       q4, ROW5L, XFIX_0_899976223
    vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
    vmlsl.s16       q5, ROW7L, XFIX_2_562915447
    vshrn.s32       ROW5L, q1, #16  /* ROW5L <-> ROW1R */
    vmull.s16       q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
    vshrn.s32       ROW6R, q1, #16
    vshll.s16       q5, ROW4L, #13
    vshrn.s32       ROW6L, q1, #16  /* ROW6L <-> ROW2R */
    vshrn.s32       ROW5R, q3, #16
    vshrn.s32       ROW7R, q2, #16
    vshrn.s32       ROW7L, q5, #16  /* ROW7L <-> ROW3R */
    vshrn.s32       ROW4L, q6, #16  /* ROW4L <-> ROW0R */
    vshrn.s32       ROW4R, q3, #16
    b               2b  /* Go to epilogue */

/*****************************************************************************/
/*
 * jsimd_idct_ifast_neon
 *
 * This function contains a fast, not so accurate integer implementation of
 * the inverse DCT (Discrete Cosine Transform).  It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
 * function from jidctfst.c.
 *
 * Normally a 1-D AAN DCT needs 5 multiplications and 29 additions, but in
 * the ARM NEON case some extra additions are required because the VQDMULH
 * instruction can't handle constants larger than 1.  So expressions like
 * "x * 1.082392200" have to be converted to "x * 0.082392200 + x", which
 * introduces an extra addition.  Overall, there are 6 extra additions per
 * 1-D IDCT pass, for a total of 5 VQDMULH and 35 VADD/VSUB instructions.
 */
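
/*
 * A compile-time-disabled C sketch (not part of the original code) of the
 * constant splitting described above.  VQDMULH.S16 computes roughly
 * (a * b * 2) >> 16, i.e. it multiplies by b / 2^15, so only factors below
 * 1.0 fit directly; larger factors are split into a fractional multiply
 * plus a plain addition of the operand itself:
 */
#if 0
#include <stdint.h>

/* Scalar model of VQDMULH.S16 (saturation case a == b == -32768 omitted) */
static int16_t vqdmulh_s16(int16_t a, int16_t b)
{
  return (int16_t)(((int32_t)a * b * 2) >> 16);
}

/* "x * 1.082392200" expressed as "x * 0.082392200 + x" */
static int16_t mul_1_082392200(int16_t x)
{
  const int16_t XFIX_1_082392200 = 277 * 128 - 256 * 128;  /* (277/256 - 1) in Q15 */
  return (int16_t)(vqdmulh_s16(x, XFIX_1_082392200) + x);
}
#endif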
#define XFIX_1_082392200  d0[0]
#define XFIX_1_414213562  d0[1]
#define XFIX_1_847759065  d0[2]
#define XFIX_2_613125930  d0[3]

.balign 16
jsimd_idct_ifast_neon_consts:
    .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200 */
    .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562 */
    .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065 */
    .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930 */
asm_function jsimd_idct_ifast_neon

    /* Load and dequantize coefficients into NEON registers
     * with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d16     | d17    ( q8  )
     *   1 | d18     | d19    ( q9  )
     *   2 | d20     | d21    ( q10 )
     *   3 | d22     | d23    ( q11 )
     *   4 | d24     | d25    ( q12 )
     *   5 | d26     | d27    ( q13 )
     *   6 | d28     | d29    ( q14 )
     *   7 | d30     | d31    ( q15 )
     */
    adr             ip, jsimd_idct_ifast_neon_consts
    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
    vmul.s16        q10, q10, q2
    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vmul.s16        q11, q11, q3
    vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
    vmul.s16        q12, q12, q0
    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16        q14, q14, q2
    vmul.s16        q13, q13, q1
    vld1.16         {d0}, [ip, :64]  /* load constants */
    vmul.s16        q15, q15, q3
    vpush           {d8-d13}         /* save NEON registers */
    /* 1-D IDCT, pass 1 */
    vsub.s16        q2, q10, q14
    vadd.s16        q14, q10, q14
    vsub.s16        q1, q11, q13
    vadd.s16        q13, q11, q13
    vadd.s16        q15, q9, q15
    vqdmulh.s16     q4, q2, XFIX_1_414213562
    vqdmulh.s16     q6, q1, XFIX_2_613125930
    vqdmulh.s16     q4, q1, XFIX_1_847759065
    vsub.s16        q2, q15, q13
    vqdmulh.s16     q6, q2, XFIX_1_414213562
    vqdmulh.s16     q4, q5, XFIX_1_082392200
    vsub.s16        q10, q10, q14
    vadd.s16        q12, q8, q12
    vsub.s16        q10, q6, q10
    vadd.s16        q6, q15, q13
    vadd.s16        q8, q12, q14
    vsub.s16        q12, q12, q14
    vsub.s16        q13, q10, q2
    vadd.s16        q10, q10, q2
    vsub.s16        q11, q12, q1
    vadd.s16        q12, q12, q1
    /* 1-D IDCT, pass 2 */
    vsub.s16        q2, q10, q14
    vadd.s16        q14, q10, q14
    vsub.s16        q1, q11, q13
    vadd.s16        q13, q11, q13
    vadd.s16        q15, q9, q15
    vqdmulh.s16     q4, q2, XFIX_1_414213562
    vqdmulh.s16     q6, q1, XFIX_2_613125930
    vqdmulh.s16     q4, q1, XFIX_1_847759065
    vsub.s16        q2, q15, q13
    vqdmulh.s16     q6, q2, XFIX_1_414213562
    vqdmulh.s16     q4, q5, XFIX_1_082392200
    vsub.s16        q10, q10, q14
    vadd.s16        q12, q8, q12
    vsub.s16        q10, q6, q10
    vadd.s16        q6, q15, q13
    vadd.s16        q8, q12, q14
    vsub.s16        q12, q12, q14
    vsub.s16        q13, q10, q2
    vpop            {d8-d13}  /* restore NEON registers */
    vadd.s16        q10, q10, q2
    vsub.s16        q11, q12, q1
    vadd.s16        q12, q12, q1
    /* Descale to 8-bit and range limit */
    vqshrn.s16      d16, q8, #5
    vqshrn.s16      d17, q9, #5
    vqshrn.s16      d18, q10, #5
    vqshrn.s16      d19, q11, #5
    vqshrn.s16      d20, q12, #5
    vqshrn.s16      d21, q13, #5
    vqshrn.s16      d22, q14, #5
    vqshrn.s16      d23, q15, #5
    /* Transpose the final 8-bit samples */

    /* Store results to the output buffer */
    ldmia           OUTPUT_BUF!, {TMP1, TMP2}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    ldmia           OUTPUT_BUF!, {TMP1, TMP2}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    add             TMP3, TMP3, OUTPUT_COL
    add             TMP4, TMP4, OUTPUT_COL

/*****************************************************************************/
/*
 * jsimd_idct_4x4_neon
 *
 * This function contains inverse-DCT code for getting reduced-size
 * 4x4 pixel output from an 8x8 DCT block.  It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of the 4x4 inverse DCT, which
 *       requires many fewer arithmetic operations and hence should be faster.
 *       The primary purpose of this particular NEON optimized function is
 *       bit-exact compatibility with jpeg-6b.
 *
 * TODO: slightly better instruction scheduling can be achieved by expanding
 *       the idct_helper/transpose_4x4 macros and reordering instructions,
 *       but readability will suffer somewhat.
 */
#define CONST_BITS  13

#define FIX_0_211164243  (1730)   /* FIX(0.211164243) */
#define FIX_0_509795579  (4176)   /* FIX(0.509795579) */
#define FIX_0_601344887  (4926)   /* FIX(0.601344887) */
#define FIX_0_720959822  (5906)   /* FIX(0.720959822) */
#define FIX_0_765366865  (6270)   /* FIX(0.765366865) */
#define FIX_0_850430095  (6967)   /* FIX(0.850430095) */
#define FIX_0_899976223  (7373)   /* FIX(0.899976223) */
#define FIX_1_061594337  (8697)   /* FIX(1.061594337) */
#define FIX_1_272758580  (10426)  /* FIX(1.272758580) */
#define FIX_1_451774981  (11893)  /* FIX(1.451774981) */
#define FIX_1_847759065  (15137)  /* FIX(1.847759065) */
#define FIX_2_172734803  (17799)  /* FIX(2.172734803) */
#define FIX_2_562915447  (20995)  /* FIX(2.562915447) */
#define FIX_3_624509785  (29692)  /* FIX(3.624509785) */
.balign 16
jsimd_idct_4x4_neon_consts:
    .short FIX_1_847759065        /* d0[0] */
    .short -FIX_0_765366865       /* d0[1] */
    .short -FIX_0_211164243       /* d0[2] */
    .short FIX_1_451774981        /* d0[3] */
    .short -FIX_2_172734803       /* d1[0] */
    .short FIX_1_061594337        /* d1[1] */
    .short -FIX_0_509795579       /* d1[2] */
    .short -FIX_0_601344887       /* d1[3] */
    .short FIX_0_899976223        /* d2[0] */
    .short FIX_2_562915447        /* d2[1] */
    .short 1 << (CONST_BITS + 1)  /* d2[2] */
    .short 0                      /* d2[3] */
.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
    vmull.s16       q14, \x4, d2[2]
    vmlal.s16       q14, \x8, d0[0]
    vmlal.s16       q14, \x14, d0[1]

    vmull.s16       q13, \x16, d1[2]
    vmlal.s16       q13, \x12, d1[3]
    vmlal.s16       q13, \x10, d2[0]
    vmlal.s16       q13, \x6, d2[1]

    vmull.s16       q15, \x4, d2[2]
    vmlsl.s16       q15, \x8, d0[0]
    vmlsl.s16       q15, \x14, d0[1]

    vmull.s16       q12, \x16, d0[2]
    vmlal.s16       q12, \x12, d0[3]
    vmlal.s16       q12, \x10, d1[0]
    vmlal.s16       q12, \x6, d1[1]

    vadd.s32        q10, q14, q13
    vsub.s32        q14, q14, q13

  .if \shift > 16
    vrshr.s32       q10, q10, #\shift
    vrshr.s32       q14, q14, #\shift
    vmovn.s32       \y26, q10
    vmovn.s32       \y29, q14
  .else
    vrshrn.s32      \y26, q10, #\shift
    vrshrn.s32      \y29, q14, #\shift
  .endif

    vadd.s32        q10, q15, q12
    vsub.s32        q15, q15, q12

  .if \shift > 16
    vrshr.s32       q10, q10, #\shift
    vrshr.s32       q15, q15, #\shift
    vmovn.s32       \y27, q10
    vmovn.s32       \y28, q15
  .else
    vrshrn.s32      \y27, q10, #\shift
    vrshrn.s32      \y28, q15, #\shift
  .endif
.endm
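
/*
 * Compile-time-disabled C sketch (not part of the original code) of the
 * arithmetic performed by one idct_helper pass, with rows mapped per the
 * macro arguments (x4, x6, x8, x10, x12, x14, x16 = rows 0, 1, 2, 3, 5, 6,
 * 7; row 4 does not contribute).  Cf. 'jpeg_idct_4x4' in jidctred.c.
 */
#if 0
#include <stdint.h>

static void idct_4x4_1d(const int16_t x[8], int shift, int16_t out[4])
{
  int32_t even0 = x[0] * 16384 + 15137 * x[2] - 6270 * x[6];  /* 1 << (CONST_BITS+1) */
  int32_t even1 = x[0] * 16384 - 15137 * x[2] + 6270 * x[6];
  int32_t odd0 = -1730 * x[7] + 11893 * x[5] - 17799 * x[3] + 8697 * x[1];
  int32_t odd1 = -4176 * x[7] - 4926 * x[5] + 7373 * x[3] + 20995 * x[1];
  int32_t half = 1 << (shift - 1);  /* rounding, as done by VRSHR/VRSHRN */

  out[0] = (int16_t)((even0 + odd1 + half) >> shift);
  out[1] = (int16_t)((even1 + odd0 + half) >> shift);
  out[2] = (int16_t)((even1 - odd0 + half) >> shift);
  out[3] = (int16_t)((even0 - odd1 + half) >> shift);
}
#endif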
asm_function jsimd_idct_4x4_neon

    /* Load constants (d3 is just used for padding) */
    adr             TMP4, jsimd_idct_4x4_neon_consts
    vld1.16         {d0, d1, d2, d3}, [TMP4, :128]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d4      | d5
     *   1 | d6      | d7
     *   2 | d8      | d9
     *   3 | d10     | d11
     *   4 | -       | -
     *   5 | d12     | d13
     *   6 | d14     | d15
     *   7 | d16     | d17
     */
    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
    vld1.16         {d8, d9, d10, d11}, [COEF_BLOCK, :128]!
    add             COEF_BLOCK, COEF_BLOCK, #16
    vld1.16         {d12, d13, d14, d15}, [COEF_BLOCK, :128]!
    vld1.16         {d16, d17}, [COEF_BLOCK, :128]!
    /* dequantize */
    vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]!
    vmul.s16        q2, q2, q9
    vld1.16         {d22, d23, d24, d25}, [DCT_TABLE, :128]!
    vmul.s16        q3, q3, q10
    vmul.s16        q4, q4, q11
    add             DCT_TABLE, DCT_TABLE, #16
    vld1.16         {d26, d27, d28, d29}, [DCT_TABLE, :128]!
    vmul.s16        q5, q5, q12
    vmul.s16        q6, q6, q13
    vld1.16         {d30, d31}, [DCT_TABLE, :128]!
    vmul.s16        q7, q7, q14
    vmul.s16        q8, q8, q15
    /* Pass 1 */
    idct_helper     d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10
    transpose_4x4   d4, d6, d8, d10
    idct_helper     d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11
    transpose_4x4   d5, d7, d9, d11

    /* Pass 2 */
    idct_helper     d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29
    transpose_4x4   d26, d27, d28, d29

    /* Range limit */
    vmov.u16        q15, #0x80
    vadd.s16        q13, q13, q15
    vadd.s16        q14, q14, q15
    vqmovun.s16     d26, q13
    vqmovun.s16     d27, q14
    /* Store results to the output buffer */
    ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    add             TMP3, TMP3, OUTPUT_COL
    add             TMP4, TMP4, OUTPUT_COL

#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
    /* We can use far fewer instructions on little-endian systems if the
     * OS kernel is not configured to trap unaligned memory accesses
     */
    vst1.32         {d26[0]}, [TMP1]!
    vst1.32         {d27[0]}, [TMP3]!
    vst1.32         {d26[1]}, [TMP2]!
    vst1.32         {d27[1]}, [TMP4]!
#else
    vst1.8          {d26[0]}, [TMP1]!
    vst1.8          {d27[0]}, [TMP3]!
    vst1.8          {d26[1]}, [TMP1]!
    vst1.8          {d27[1]}, [TMP3]!
    vst1.8          {d26[2]}, [TMP1]!
    vst1.8          {d27[2]}, [TMP3]!
    vst1.8          {d26[3]}, [TMP1]!
    vst1.8          {d27[3]}, [TMP3]!

    vst1.8          {d26[4]}, [TMP2]!
    vst1.8          {d27[4]}, [TMP4]!
    vst1.8          {d26[5]}, [TMP2]!
    vst1.8          {d27[5]}, [TMP4]!
    vst1.8          {d26[6]}, [TMP2]!
    vst1.8          {d27[6]}, [TMP4]!
    vst1.8          {d26[7]}, [TMP2]!
    vst1.8          {d27[7]}, [TMP4]!
#endif
/*****************************************************************************/

/*
 * jsimd_idct_2x2_neon
 *
 * This function contains inverse-DCT code for getting reduced-size
 * 2x2 pixel output from an 8x8 DCT block.  It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse DCT, which
 *       requires many fewer arithmetic operations and hence should be faster.
 *       The primary purpose of this particular NEON optimized function is
 *       bit-exact compatibility with jpeg-6b.
 */
.balign 8
jsimd_idct_2x2_neon_consts:
    .short -FIX_0_720959822  /* d0[0] */
    .short FIX_0_850430095   /* d0[1] */
    .short -FIX_1_272758580  /* d0[2] */
    .short FIX_3_624509785   /* d0[3] */
.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
    vshll.s16       q14, \x4, #15
    vmull.s16       q13, \x6, d0[3]
    vmlal.s16       q13, \x10, d0[2]
    vmlal.s16       q13, \x12, d0[1]
    vmlal.s16       q13, \x16, d0[0]

    vadd.s32        q10, q14, q13
    vsub.s32        q14, q14, q13

  .if \shift > 16
    vrshr.s32       q10, q10, #\shift
    vrshr.s32       q14, q14, #\shift
    vmovn.s32       \y26, q10
    vmovn.s32       \y27, q14
  .else
    vrshrn.s32      \y26, q10, #\shift
    vrshrn.s32      \y27, q14, #\shift
  .endif
.endm
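
/*
 * Compile-time-disabled C sketch (not part of the original code) of one
 * 1-D pass of this reduced 2x2 IDCT (cf. 'jpeg_idct_2x2' in jidctred.c):
 * only rows 0, 1, 3, 5 and 7 of the coefficient block contribute.
 */
#if 0
#include <stdint.h>

static void idct_2x2_1d(const int16_t x[8], int32_t out[2])
{
  int32_t even = (int32_t)x[0] << 15;
  int32_t odd = 29692 * x[1] - 10426 * x[3] + 6967 * x[5] - 5906 * x[7];

  out[0] = even + odd;  /* descaling is done by the idct_helper caller */
  out[1] = even - odd;
}
#endif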
asm_function jsimd_idct_2x2_neon

    /* Load constants */
    adr             TMP2, jsimd_idct_2x2_neon_consts
    vld1.16         {d0}, [TMP2, :64]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d4      | d5
     *   1 | d6      | d7
     *   2 | -       | -
     *   3 | d10     | d11
     *   4 | -       | -
     *   5 | d12     | d13
     *   6 | -       | -
     *   7 | d16     | d17
     */
    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
    add             COEF_BLOCK, COEF_BLOCK, #16
    vld1.16         {d10, d11}, [COEF_BLOCK, :128]!
    add             COEF_BLOCK, COEF_BLOCK, #16
    vld1.16         {d12, d13}, [COEF_BLOCK, :128]!
    add             COEF_BLOCK, COEF_BLOCK, #16
    vld1.16         {d16, d17}, [COEF_BLOCK, :128]!
    /* dequantize */
    vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]!
    vmul.s16        q2, q2, q9
    vmul.s16        q3, q3, q10
    add             DCT_TABLE, DCT_TABLE, #16
    vld1.16         {d24, d25}, [DCT_TABLE, :128]!
    vmul.s16        q5, q5, q12
    add             DCT_TABLE, DCT_TABLE, #16
    vld1.16         {d26, d27}, [DCT_TABLE, :128]!
    vmul.s16        q6, q6, q13
    add             DCT_TABLE, DCT_TABLE, #16
    vld1.16         {d30, d31}, [DCT_TABLE, :128]!
    vmul.s16        q8, q8, q15
    /* Pass 1 */
    idct_helper     d4, d6, d10, d12, d16, 13, d4, d6
    transpose_4x4   d4, d6, d8, d10
    idct_helper     d5, d7, d11, d13, d17, 13, d5, d7
    transpose_4x4   d5, d7, d9, d11

    /* Pass 2 */
    vmull.s16       q13, d6, d0[3]
    vmlal.s16       q13, d10, d0[2]
    vmlal.s16       q13, d12, d0[1]
    vmlal.s16       q13, d16, d0[0]
    vmull.s16       q12, d7, d0[3]
    vmlal.s16       q12, d11, d0[2]
    vmlal.s16       q12, d13, d0[1]
    vmlal.s16       q12, d17, d0[0]
    vshll.s16       q14, d4, #15
    vshll.s16       q15, d5, #15
    vadd.s32        q10, q14, q13
    vsub.s32        q14, q14, q13
    vrshrn.s32      d4, q10, #13
    vrshrn.s32      d6, q14, #13
    vadd.s32        q10, q15, q12
    vsub.s32        q14, q15, q12
    vrshrn.s32      d5, q10, #13
    vrshrn.s32      d7, q14, #13

    idct_helper     d4, d6, d10, d7, d11, 20, d26, d27

    /* Range limit */
    vmov.u16        q15, #0x80
    vadd.s16        q13, q13, q15
    vqmovun.s16     d26, q13
    vqmovun.s16     d27, q13
    /* Store results to the output buffer */
    ldmia           OUTPUT_BUF, {TMP1, TMP2}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL

    vst1.8          {d26[0]}, [TMP1]!
    vst1.8          {d27[4]}, [TMP1]!
    vst1.8          {d26[1]}, [TMP2]!
    vst1.8          {d27[5]}, [TMP2]!
/*****************************************************************************/

/*
 * jsimd_ycc_extrgb_convert_neon
 * jsimd_ycc_extbgr_convert_neon
 * jsimd_ycc_extrgbx_convert_neon
 * jsimd_ycc_extbgrx_convert_neon
 * jsimd_ycc_extxbgr_convert_neon
 * jsimd_ycc_extxrgb_convert_neon
 *
 * Colorspace conversion YCbCr -> RGB
 */
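
/*
 * Compile-time-disabled scalar C sketch (not part of the original code) of
 * the per-pixel math implemented below; jdcolor.c is the authoritative
 * version.  The fixed-point constants used by the NEON code are
 * 22971 ~= 1.40200 * 2^14, -11277 ~= -0.34414 * 2^15,
 * -23401 ~= -0.71414 * 2^15 and 29033 ~= 1.77200 * 2^14.
 */
#if 0
#include <stdint.h>

static uint8_t clamp255(int x)
{
  return (uint8_t)(x < 0 ? 0 : x > 255 ? 255 : x);
}

static void ycc_to_rgb(uint8_t y, uint8_t cb, uint8_t cr,
                       uint8_t *r, uint8_t *g, uint8_t *b)
{
  int u = cb - 128, v = cr - 128;

  *r = clamp255(y + ((22971 * v + (1 << 13)) >> 14));
  *g = clamp255(y + ((-11277 * u - 23401 * v + (1 << 14)) >> 15));
  *b = clamp255(y + ((29033 * u + (1 << 13)) >> 14));
}
#endif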
.macro do_load size
  .if \size == 8
    vld1.8          {d4}, [U, :64]!
    vld1.8          {d5}, [V, :64]!
    vld1.8          {d0}, [Y, :64]!
  .elseif \size == 4
    vld1.8          {d4[0]}, [U]!
    vld1.8          {d4[1]}, [U]!
    vld1.8          {d4[2]}, [U]!
    vld1.8          {d4[3]}, [U]!
    vld1.8          {d5[0]}, [V]!
    vld1.8          {d5[1]}, [V]!
    vld1.8          {d5[2]}, [V]!
    vld1.8          {d5[3]}, [V]!
    vld1.8          {d0[0]}, [Y]!
    vld1.8          {d0[1]}, [Y]!
    vld1.8          {d0[2]}, [Y]!
    vld1.8          {d0[3]}, [Y]!
  .elseif \size == 2
    vld1.8          {d4[4]}, [U]!
    vld1.8          {d4[5]}, [U]!
    vld1.8          {d5[4]}, [V]!
    vld1.8          {d5[5]}, [V]!
    vld1.8          {d0[4]}, [Y]!
    vld1.8          {d0[5]}, [Y]!
  .elseif \size == 1
    vld1.8          {d4[6]}, [U]!
    vld1.8          {d5[6]}, [V]!
    vld1.8          {d0[6]}, [Y]!
  .else
    .error unsupported macroblock size
  .endif
.endm
.macro do_store bpp, size
  .if \bpp == 24
    .if \size == 8
    vst3.8          {d10, d11, d12}, [RGB]!
    .elseif \size == 4
    vst3.8          {d10[0], d11[0], d12[0]}, [RGB]!
    vst3.8          {d10[1], d11[1], d12[1]}, [RGB]!
    vst3.8          {d10[2], d11[2], d12[2]}, [RGB]!
    vst3.8          {d10[3], d11[3], d12[3]}, [RGB]!
    .elseif \size == 2
    vst3.8          {d10[4], d11[4], d12[4]}, [RGB]!
    vst3.8          {d10[5], d11[5], d12[5]}, [RGB]!
    .elseif \size == 1
    vst3.8          {d10[6], d11[6], d12[6]}, [RGB]!
    .else
    .error unsupported macroblock size
    .endif
  .elseif \bpp == 32
    .if \size == 8
    vst4.8          {d10, d11, d12, d13}, [RGB]!
    .elseif \size == 4
    vst4.8          {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
    vst4.8          {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
    vst4.8          {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
    vst4.8          {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
    .elseif \size == 2
    vst4.8          {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
    vst4.8          {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
    .elseif \size == 1
    vst4.8          {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
    .else
    .error unsupported macroblock size
    .endif
  .elseif \bpp == 16
    .if \size == 8
    vst1.16         {q15}, [RGB]!
    .elseif \size == 4
    vst1.16         {d30}, [RGB]!
    .elseif \size == 2
    vst1.16         {d31[0]}, [RGB]!
    vst1.16         {d31[1]}, [RGB]!
    .elseif \size == 1
    vst1.16         {d31[2]}, [RGB]!
    .else
    .error unsupported macroblock size
    .endif
  .else
    .error unsupported bpp
  .endif
.endm
.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs

/*
 * 2-stage pipelined YCbCr->RGB conversion
 */
.macro do_yuv_to_rgb_stage1
    vaddw.u8        q3, q1, d4      /* q3 = u - 128 */
    vaddw.u8        q4, q1, d5      /* q4 = v - 128 */
    vmull.s16       q10, d6, d1[1]  /* multiply by -11277 */
    vmlal.s16       q10, d8, d1[2]  /* multiply by -23401 */
    vmull.s16       q11, d7, d1[1]  /* multiply by -11277 */
    vmlal.s16       q11, d9, d1[2]  /* multiply by -23401 */
    vmull.s16       q12, d8, d1[0]  /* multiply by 22971 */
    vmull.s16       q13, d9, d1[0]  /* multiply by 22971 */
    vmull.s16       q14, d6, d1[3]  /* multiply by 29033 */
    vmull.s16       q15, d7, d1[3]  /* multiply by 29033 */
.endm
.macro do_yuv_to_rgb_stage2
    vrshrn.s32      d20, q10, #15
    vrshrn.s32      d21, q11, #15
    vrshrn.s32      d24, q12, #14
    vrshrn.s32      d25, q13, #14
    vrshrn.s32      d28, q14, #14
    vrshrn.s32      d29, q15, #14
    vaddw.u8        q11, q10, d0
    vaddw.u8        q12, q12, d0
    vaddw.u8        q14, q14, d0
  .if \bpp != 16
    vqmovun.s16     d1\g_offs, q11
    vqmovun.s16     d1\r_offs, q12
    vqmovun.s16     d1\b_offs, q14
  .else  /* rgb565 */
    vqshlu.s16      q13, q11, #8
    vqshlu.s16      q15, q12, #8
    vqshlu.s16      q14, q14, #8
    vsri.u16        q15, q13, #5
    vsri.u16        q15, q14, #11
  .endif
.endm
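
/*
 * Compile-time-disabled C sketch (not part of the original code) of the
 * RGB565 packing done above: each channel is saturated while shifting left
 * by 8 (VQSHLU), then green and blue are merged in with shift-right-insert
 * (VSRI) by 5 and 11 bit positions.  The net effect per pixel:
 */
#if 0
#include <stdint.h>

static uint16_t pack_rgb565(uint8_t r, uint8_t g, uint8_t b)
{
  return (uint16_t)(((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3));
}
#endif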
.macro do_yuv_to_rgb_stage2_store_load_stage1
    /* "do_yuv_to_rgb_stage2" and "store" */
    vrshrn.s32      d20, q10, #15
    /* "load" and "do_yuv_to_rgb_stage1" */
    vrshrn.s32      d21, q11, #15
    vrshrn.s32      d24, q12, #14
    vrshrn.s32      d25, q13, #14
    vld1.8          {d4}, [U, :64]!
    vrshrn.s32      d28, q14, #14
    vld1.8          {d5}, [V, :64]!
    vrshrn.s32      d29, q15, #14
    vaddw.u8        q3, q1, d4      /* q3 = u - 128 */
    vaddw.u8        q4, q1, d5      /* q4 = v - 128 */
    vaddw.u8        q11, q10, d0
    vmull.s16       q10, d6, d1[1]  /* multiply by -11277 */
    vmlal.s16       q10, d8, d1[2]  /* multiply by -23401 */
    vaddw.u8        q12, q12, d0
    vaddw.u8        q14, q14, d0
  .if \bpp != 16  /**************** rgb24/rgb32 ******************************/
    vqmovun.s16     d1\g_offs, q11
    vqmovun.s16     d1\r_offs, q12
    vld1.8          {d0}, [Y, :64]!
    vqmovun.s16     d1\b_offs, q14
    vmull.s16       q11, d7, d1[1]  /* multiply by -11277 */
    vmlal.s16       q11, d9, d1[2]  /* multiply by -23401 */
    do_store        \bpp, 8
    vmull.s16       q12, d8, d1[0]  /* multiply by 22971 */
    vmull.s16       q13, d9, d1[0]  /* multiply by 22971 */
    vmull.s16       q14, d6, d1[3]  /* multiply by 29033 */
    vmull.s16       q15, d7, d1[3]  /* multiply by 29033 */
  .else  /**************************** rgb565 ********************************/
    vqshlu.s16      q13, q11, #8
    vqshlu.s16      q15, q12, #8
    vqshlu.s16      q14, q14, #8
    vld1.8          {d0}, [Y, :64]!
    vmull.s16       q11, d7, d1[1]
    vmlal.s16       q11, d9, d1[2]
    vsri.u16        q15, q13, #5
    vmull.s16       q12, d8, d1[0]
    vsri.u16        q15, q14, #11
    vmull.s16       q13, d9, d1[0]
    vmull.s16       q14, d6, d1[3]
    do_store        \bpp, 8
    vmull.s16       q15, d7, d1[3]
  .endif
.endm
.macro do_yuv_to_rgb
    do_yuv_to_rgb_stage1
    do_yuv_to_rgb_stage2
.endm
/* Apple gas crashes on adrl, work around that by using adr.
 * But this requires a copy of these constants for each function.
 */

.balign 16
jsimd_ycc_\colorid\()_neon_consts:
    .short 0,     0,      0,      0
    .short 22971, -11277, -23401, 29033
    .short -128,  -128,   -128,   -128
    .short -128,  -128,   -128,   -128
asm_function jsimd_ycc_\colorid\()_convert_neon
    OUTPUT_WIDTH    .req r0
    INPUT_BUF2      .req INPUT_BUF

    /* Load constants to d1, d2, d3 (d0 is just used for padding) */
    adr             ip, jsimd_ycc_\colorid\()_neon_consts
    vld1.16         {d0, d1, d2, d3}, [ip, :128]

    /* Save ARM registers and handle input arguments */
    push            {r4, r5, r6, r7, r8, r9, r10, lr}
    ldr             NUM_ROWS, [sp, #(4 * 8)]
    ldr             INPUT_BUF0, [INPUT_BUF]
    ldr             INPUT_BUF1, [INPUT_BUF, #4]
    ldr             INPUT_BUF2, [INPUT_BUF, #8]

    /* Save NEON registers */

    /* Initially set d10, d11, d12, d13 to 0xFF */

    /* Outer loop over scanlines */
    ldr             Y, [INPUT_BUF0, INPUT_ROW, lsl #2]
    ldr             U, [INPUT_BUF1, INPUT_ROW, lsl #2]
    ldr             V, [INPUT_BUF2, INPUT_ROW, lsl #2]
    add             INPUT_ROW, INPUT_ROW, #1
    ldr             RGB, [OUTPUT_BUF], #4

    /* Inner loop over pixels */
    do_yuv_to_rgb_stage1
    do_yuv_to_rgb_stage2_store_load_stage1
    do_yuv_to_rgb_stage2

    subs            NUM_ROWS, NUM_ROWS, #1

    /* Restore all registers and return */
    pop             {r4, r5, r6, r7, r8, r9, r10, pc}
.purgem do_yuv_to_rgb
.purgem do_yuv_to_rgb_stage1
.purgem do_yuv_to_rgb_stage2
.purgem do_yuv_to_rgb_stage2_store_load_stage1
.endm
/*--------------------------------- id ----- bpp R  G  B */
generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, 1, 2
generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, 1, 0
generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2
generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0
generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1
generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3
generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, 0, 0
/*****************************************************************************/

/*
 * jsimd_extrgb_ycc_convert_neon
 * jsimd_extbgr_ycc_convert_neon
 * jsimd_extrgbx_ycc_convert_neon
 * jsimd_extbgrx_ycc_convert_neon
 * jsimd_extxbgr_ycc_convert_neon
 * jsimd_extxrgb_ycc_convert_neon
 *
 * Colorspace conversion RGB -> YCbCr
 */
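
/*
 * Compile-time-disabled scalar C sketch (not part of the original code) of
 * the fixed-point RGB -> YCbCr math implemented below; jccolor.c is the
 * authoritative version.  The Q16 constants appear in the tables emitted by
 * this macro: 19595 ~= 0.29900 * 2^16, 38470 ~= 0.58700 * 2^16, etc.  For
 * Cb/Cr, the +128 level shift and a +32767 truncation bias are folded into
 * the accumulators (the "32767, 128" rows of the constant table).
 */
#if 0
#include <stdint.h>

static void rgb_to_ycc(uint8_t r, uint8_t g, uint8_t b,
                       uint8_t *y, uint8_t *cb, uint8_t *cr)
{
  *y  = (uint8_t)((19595 * r + 38470 * g + 7471 * b + 32768) >> 16);
  *cb = (uint8_t)((-11059 * r - 21709 * g + 32768 * b +
                   (128 << 16) + 32767) >> 16);
  *cr = (uint8_t)((32768 * r - 27439 * g - 5329 * b +
                   (128 << 16) + 32767) >> 16);
}
#endif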
.macro do_store size
  .if \size == 8
    vst1.8          {d20}, [Y]!
    vst1.8          {d21}, [U]!
    vst1.8          {d22}, [V]!
  .elseif \size == 4
    vst1.8          {d20[0]}, [Y]!
    vst1.8          {d20[1]}, [Y]!
    vst1.8          {d20[2]}, [Y]!
    vst1.8          {d20[3]}, [Y]!
    vst1.8          {d21[0]}, [U]!
    vst1.8          {d21[1]}, [U]!
    vst1.8          {d21[2]}, [U]!
    vst1.8          {d21[3]}, [U]!
    vst1.8          {d22[0]}, [V]!
    vst1.8          {d22[1]}, [V]!
    vst1.8          {d22[2]}, [V]!
    vst1.8          {d22[3]}, [V]!
  .elseif \size == 2
    vst1.8          {d20[4]}, [Y]!
    vst1.8          {d20[5]}, [Y]!
    vst1.8          {d21[4]}, [U]!
    vst1.8          {d21[5]}, [U]!
    vst1.8          {d22[4]}, [V]!
    vst1.8          {d22[5]}, [V]!
  .elseif \size == 1
    vst1.8          {d20[6]}, [Y]!
    vst1.8          {d21[6]}, [U]!
    vst1.8          {d22[6]}, [V]!
  .else
    .error unsupported macroblock size
  .endif
.endm
.macro do_load bpp, size
  .if \bpp == 24
    .if \size == 8
    vld3.8          {d10, d11, d12}, [RGB]!
    .elseif \size == 4
    vld3.8          {d10[0], d11[0], d12[0]}, [RGB]!
    vld3.8          {d10[1], d11[1], d12[1]}, [RGB]!
    vld3.8          {d10[2], d11[2], d12[2]}, [RGB]!
    vld3.8          {d10[3], d11[3], d12[3]}, [RGB]!
    .elseif \size == 2
    vld3.8          {d10[4], d11[4], d12[4]}, [RGB]!
    vld3.8          {d10[5], d11[5], d12[5]}, [RGB]!
    .elseif \size == 1
    vld3.8          {d10[6], d11[6], d12[6]}, [RGB]!
    .else
    .error unsupported macroblock size
    .endif
  .elseif \bpp == 32
    .if \size == 8
    vld4.8          {d10, d11, d12, d13}, [RGB]!
    .elseif \size == 4
    vld4.8          {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
    vld4.8          {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
    vld4.8          {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
    vld4.8          {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
    .elseif \size == 2
    vld4.8          {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
    vld4.8          {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
    .elseif \size == 1
    vld4.8          {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
    .else
    .error unsupported macroblock size
    .endif
  .else
    .error unsupported bpp
  .endif
.endm
.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs

/*
 * 2-stage pipelined RGB->YCbCr conversion
 */
.macro do_rgb_to_yuv_stage1
    vmovl.u8        q2, d1\r_offs  /* r = { d4, d5 } */
    vmovl.u8        q3, d1\g_offs  /* g = { d6, d7 } */
    vmovl.u8        q4, d1\b_offs  /* b = { d8, d9 } */
    vmull.u16       q7, d4, d0[0]
    vmlal.u16       q7, d6, d0[1]
    vmlal.u16       q7, d8, d0[2]
    vmull.u16       q8, d5, d0[0]
    vmlal.u16       q8, d7, d0[1]
    vmlal.u16       q8, d9, d0[2]

    vmlsl.u16       q9, d4, d0[3]
    vmlsl.u16       q9, d6, d1[0]
    vmlal.u16       q9, d8, d1[1]
    vmlsl.u16       q13, d5, d0[3]
    vmlsl.u16       q13, d7, d1[0]
    vmlal.u16       q13, d9, d1[1]

    vmlal.u16       q14, d4, d1[1]
    vmlsl.u16       q14, d6, d1[2]
    vmlsl.u16       q14, d8, d1[3]
    vmlal.u16       q15, d5, d1[1]
    vmlsl.u16       q15, d7, d1[2]
    vmlsl.u16       q15, d9, d1[3]
.endm
.macro do_rgb_to_yuv_stage2
    vrshrn.u32      d20, q7, #16
    vrshrn.u32      d21, q8, #16
    vshrn.u32       d22, q9, #16
    vshrn.u32       d23, q13, #16
    vshrn.u32       d24, q14, #16
    vshrn.u32       d25, q15, #16
    vmovn.u16       d20, q10  /* d20 = y */
    vmovn.u16       d21, q11  /* d21 = u */
    vmovn.u16       d22, q12  /* d22 = v */
.endm

.macro do_rgb_to_yuv
    do_rgb_to_yuv_stage1
    do_rgb_to_yuv_stage2
.endm
.macro do_rgb_to_yuv_stage2_store_load_stage1
    vrshrn.u32      d20, q7, #16
    vrshrn.u32      d21, q8, #16
    vshrn.u32       d22, q9, #16
    vshrn.u32       d23, q13, #16
    vshrn.u32       d24, q14, #16
    vshrn.u32       d25, q15, #16
    do_load         \bpp, 8
    vmovn.u16       d20, q10       /* d20 = y */
    vmovl.u8        q2, d1\r_offs  /* r = { d4, d5 } */
    vmovn.u16       d21, q11       /* d21 = u */
    vmovl.u8        q3, d1\g_offs  /* g = { d6, d7 } */
    vmovn.u16       d22, q12       /* d22 = v */
    vmovl.u8        q4, d1\b_offs  /* b = { d8, d9 } */
    vmull.u16       q7, d4, d0[0]
    vmlal.u16       q7, d6, d0[1]
    vmlal.u16       q7, d8, d0[2]
    vmull.u16       q8, d5, d0[0]
    vmlal.u16       q8, d7, d0[1]
    vmlal.u16       q8, d9, d0[2]
    vmlsl.u16       q9, d4, d0[3]
    vmlsl.u16       q9, d6, d1[0]
    vmlal.u16       q9, d8, d1[1]
    vmlsl.u16       q13, d5, d0[3]
    vmlsl.u16       q13, d7, d1[0]
    vmlal.u16       q13, d9, d1[1]

    vmlal.u16       q14, d4, d1[1]
    vmlsl.u16       q14, d6, d1[2]
    vmlsl.u16       q14, d8, d1[3]
    vmlal.u16       q15, d5, d1[1]
    vmlsl.u16       q15, d7, d1[2]
    vmlsl.u16       q15, d9, d1[3]
.endm

.balign 16
jsimd_\colorid\()_ycc_neon_consts:
    .short 19595, 38470, 7471,  11059
    .short 21709, 32768, 27439, 5329
    .short 32767, 128,   32767, 128
    .short 32767, 128,   32767, 128
asm_function jsimd_\colorid\()_ycc_convert_neon
    OUTPUT_WIDTH    .req r0
    OUTPUT_BUF2     .req OUTPUT_BUF

    /* Load constants to d0, d1, d2, d3 */
    adr             ip, jsimd_\colorid\()_ycc_neon_consts
    vld1.16         {d0, d1, d2, d3}, [ip, :128]

    /* Save ARM registers and handle input arguments */
    push            {r4, r5, r6, r7, r8, r9, r10, lr}
    ldr             NUM_ROWS, [sp, #(4 * 8)]
    ldr             OUTPUT_BUF0, [OUTPUT_BUF]
    ldr             OUTPUT_BUF1, [OUTPUT_BUF, #4]
    ldr             OUTPUT_BUF2, [OUTPUT_BUF, #8]

    /* Save NEON registers */

    /* Outer loop over scanlines */
    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2]
    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2]
    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2]
    add             OUTPUT_ROW, OUTPUT_ROW, #1
    ldr             RGB, [INPUT_BUF], #4

    /* Inner loop over pixels */
    do_rgb_to_yuv_stage1
    do_rgb_to_yuv_stage2_store_load_stage1
    do_rgb_to_yuv_stage2

    subs            NUM_ROWS, NUM_ROWS, #1

    /* Restore all registers and return */
    pop             {r4, r5, r6, r7, r8, r9, r10, pc}
.purgem do_rgb_to_yuv
.purgem do_rgb_to_yuv_stage1
.purgem do_rgb_to_yuv_stage2
.purgem do_rgb_to_yuv_stage2_store_load_stage1
.endm
/*--------------------------------- id ----- bpp R  G  B */
generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2
generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0
generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3

/*****************************************************************************/

/*
 * Load data into workspace, applying unsigned->signed conversion
 *
 * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
 *       rid of VST1.16 instructions
 */
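
/*
 * Compile-time-disabled scalar C sketch (not part of the original code):
 * each sample is level-shifted from unsigned [0, 255] to signed [-128, 127]
 * while being widened to 16 bits, exactly what the VSUBL.U8 instructions
 * below do with d0 holding CENTERJSAMPLE.
 */
#if 0
#include <stdint.h>

static void convsamp(const uint8_t *sample_rows[8], unsigned start_col,
                     int16_t workspace[64])
{
  for (int row = 0; row < 8; row++)
    for (int col = 0; col < 8; col++)
      workspace[row * 8 + col] =
        (int16_t)sample_rows[row][start_col + col] - 128;  /* CENTERJSAMPLE */
}
#endif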
asm_function jsimd_convsamp_neon

    ldmia           SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
    add             TMP1, TMP1, START_COL
    add             TMP2, TMP2, START_COL
    add             TMP3, TMP3, START_COL
    add             TMP4, TMP4, START_COL
    vld1.8          {d16}, [TMP1]
    vsubl.u8        q8, d16, d0
    vld1.8          {d18}, [TMP2]
    vsubl.u8        q9, d18, d0
    vld1.8          {d20}, [TMP3]
    vsubl.u8        q10, d20, d0
    vld1.8          {d22}, [TMP4]
    ldmia           SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
    vsubl.u8        q11, d22, d0
    vst1.16         {d16, d17, d18, d19}, [WORKSPACE, :128]!
    add             TMP1, TMP1, START_COL
    add             TMP2, TMP2, START_COL
    vst1.16         {d20, d21, d22, d23}, [WORKSPACE, :128]!
    add             TMP3, TMP3, START_COL
    add             TMP4, TMP4, START_COL
    vld1.8          {d24}, [TMP1]
    vsubl.u8        q12, d24, d0
    vld1.8          {d26}, [TMP2]
    vsubl.u8        q13, d26, d0
    vld1.8          {d28}, [TMP3]
    vsubl.u8        q14, d28, d0
    vld1.8          {d30}, [TMP4]
    vsubl.u8        q15, d30, d0
    vst1.16         {d24, d25, d26, d27}, [WORKSPACE, :128]!
    vst1.16         {d28, d29, d30, d31}, [WORKSPACE, :128]!
/*****************************************************************************/

/*
 * jsimd_fdct_ifast_neon
 *
 * This function contains a fast, not so accurate integer implementation of
 * the forward DCT (Discrete Cosine Transform).  It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
 * function from jfdctfst.c.
 *
 * TODO: can be combined with 'jsimd_convsamp_neon' to get
 *       rid of a bunch of VLD1.16 instructions
 */
#define XFIX_0_382683433  d0[0]
#define XFIX_0_541196100  d0[1]
#define XFIX_0_707106781  d0[2]
#define XFIX_1_306562965  d0[3]

.balign 16
jsimd_fdct_ifast_neon_consts:
    .short (98 * 128)               /* XFIX_0_382683433 */
    .short (139 * 128)              /* XFIX_0_541196100 */
    .short (181 * 128)              /* XFIX_0_707106781 */
    .short (334 * 128 - 256 * 128)  /* XFIX_1_306562965 */
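
/*
 * Compile-time-disabled C sketch (not part of the original code) of one
 * 1-D pass of the AAN forward DCT, mirroring jfdctfst.c but using the
 * VQDMULH constant encoding above (multiply by c / 2^15, and express the
 * 1.306562965 factor as x * 0.306562965 + x):
 */
#if 0
#include <stdint.h>

static int16_t mul_q15(int16_t a, int32_t c)  /* scalar model of VQDMULH.S16 */
{
  return (int16_t)(((int32_t)a * c * 2) >> 16);
}

static void fdct_ifast_1d(int16_t d[8])
{
  int16_t tmp0 = d[0] + d[7], tmp7 = d[0] - d[7];
  int16_t tmp1 = d[1] + d[6], tmp6 = d[1] - d[6];
  int16_t tmp2 = d[2] + d[5], tmp5 = d[2] - d[5];
  int16_t tmp3 = d[3] + d[4], tmp4 = d[3] - d[4];

  /* even part */
  int16_t tmp10 = tmp0 + tmp3, tmp13 = tmp0 - tmp3;
  int16_t tmp11 = tmp1 + tmp2, tmp12 = tmp1 - tmp2;

  d[0] = tmp10 + tmp11;
  d[4] = tmp10 - tmp11;

  int16_t z1 = mul_q15(tmp12 + tmp13, 181 * 128);  /* 0.707106781 */
  d[2] = tmp13 + z1;
  d[6] = tmp13 - z1;

  /* odd part */
  tmp10 = tmp4 + tmp5;
  tmp11 = tmp5 + tmp6;
  tmp12 = tmp6 + tmp7;

  int16_t z5 = mul_q15(tmp10 - tmp12, 98 * 128);   /* 0.382683433 */
  int16_t z2 = mul_q15(tmp10, 139 * 128) + z5;     /* 0.541196100 */
  int16_t z4 = mul_q15(tmp12, 334 * 128 - 256 * 128) + tmp12 + z5;  /* 1.306562965 */
  int16_t z3 = mul_q15(tmp11, 181 * 128);          /* 0.707106781 */

  int16_t z11 = tmp7 + z3, z13 = tmp7 - z3;

  d[5] = z13 + z2;
  d[3] = z13 - z2;
  d[1] = z11 + z4;
  d[7] = z11 - z4;
}
#endif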
asm_function jsimd_fdct_ifast_neon

    /* Load constants */
    adr             TMP, jsimd_fdct_ifast_neon_consts
    vld1.16         {d0}, [TMP, :64]

    /* Load all DATA into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d16     | d17    | q8
     *   1 | d18     | d19    | q9
     *   2 | d20     | d21    | q10
     *   3 | d22     | d23    | q11
     *   4 | d24     | d25    | q12
     *   5 | d26     | d27    | q13
     *   6 | d28     | d29    | q14
     *   7 | d30     | d31    | q15
     */
    vld1.16         {d16, d17, d18, d19}, [DATA, :128]!
    vld1.16         {d20, d21, d22, d23}, [DATA, :128]!
    vld1.16         {d24, d25, d26, d27}, [DATA, :128]!
    vld1.16         {d28, d29, d30, d31}, [DATA, :128]
    sub             DATA, DATA, #(128 - 32)
    vadd.s16        q2, q11, q12
    vsub.s16        q12, q11, q12
    vsub.s16        q6, q10, q13
    vadd.s16        q10, q10, q13
    vsub.s16        q7, q9, q14
    vadd.s16        q9, q9, q14
    vsub.s16        q1, q8, q15
    vadd.s16        q8, q8, q15
    vsub.s16        q4, q9, q10
    vadd.s16        q3, q9, q10
    vqdmulh.s16     q4, q4, XFIX_0_707106781
    vadd.s16        q11, q12, q6
    vsub.s16        q12, q2, q3
    vqdmulh.s16     q3, q3, XFIX_0_707106781
    vsub.s16        q6, q11, q7
    vadd.s16        q10, q5, q4
    vqdmulh.s16     q6, q6, XFIX_0_382683433
    vsub.s16        q14, q5, q4
    vqdmulh.s16     q11, q11, XFIX_0_541196100
    vqdmulh.s16     q5, q7, XFIX_1_306562965
    vadd.s16        q11, q11, q6
    vadd.s16        q13, q3, q11
    vsub.s16        q11, q3, q11
    vsub.s16        q15, q4, q7
    vst1.16         {d16, d17, d18, d19}, [DATA, :128]!
    vst1.16         {d20, d21, d22, d23}, [DATA, :128]!
    vst1.16         {d24, d25, d26, d27}, [DATA, :128]!
    vst1.16         {d28, d29, d30, d31}, [DATA, :128]
/*****************************************************************************/

/*
 * GLOBAL(void)
 * jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
 *                     DCTELEM *workspace);
 *
 * Note: this code uses 2-stage pipelining in order to improve instruction
 *       scheduling and eliminate stalls (this provides ~15% better
 *       performance for this function on both ARM Cortex-A8 and
 *       ARM Cortex-A9 when compared to the non-pipelined variant).
 *       The instructions which belong to the second stage use different
 *       indentation for better readability.
 */
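
/*
 * Compile-time-disabled C sketch (not part of the original code) of the
 * per-coefficient quantization scheme implemented below.  The divisors
 * table supplies, for each coefficient, a 16-bit reciprocal, an additive
 * correction, and a shift count (negative shift counts, as with VSHL by
 * register, mean shift right).  Parts of the absolute-value setup are
 * elided from this listing, so treat this strictly as a sketch:
 */
#if 0
#include <stdint.h>

static int16_t quantize_one(int16_t coef, uint16_t recip, uint16_t corr,
                            int16_t shift)
{
  uint16_t s = (uint16_t)(coef >> 15);                  /* 0 or 0xFFFF */
  uint16_t mag = (uint16_t)(((uint16_t)coef ^ s) - s);  /* abs(coef) */

  mag = (uint16_t)(((uint32_t)(mag + corr) * recip) >> 16);
  mag = (uint16_t)(shift >= 0 ? mag << shift : mag >> -shift);

  return (int16_t)((uint16_t)((mag ^ s) - s));          /* restore sign */
}
#endif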
asm_function jsimd_quantize_neon
    RECIPROCAL      .req DIVISORS

    vld1.16         {d0, d1, d2, d3}, [WORKSPACE, :128]!
    add             CORRECTION, DIVISORS, #(64 * 2)
    add             SHIFT, DIVISORS, #(64 * 6)
    vld1.16         {d20, d21, d22, d23}, [CORRECTION, :128]!
    vld1.16         {d16, d17, d18, d19}, [RECIPROCAL, :128]!
    vadd.u16        q12, q12, q10  /* add correction */
    vadd.u16        q13, q13, q11
    vmull.u16       q10, d24, d16  /* multiply by reciprocal */
    vmull.u16       q11, d25, d17
    vmull.u16       q8, d26, d18
    vmull.u16       q9, d27, d19
    vld1.16         {d24, d25, d26, d27}, [SHIFT, :128]!
    vshrn.u32       d20, q10, #16
    vshrn.u32       d21, q11, #16
    vshrn.u32       d22, q8, #16
    vshrn.u32       d23, q9, #16
    vshr.s16        q2, q0, #15    /* extract sign */
    vshr.s16        q3, q1, #15
    vshl.u16        q14, q10, q12  /* shift */
    vshl.u16        q15, q11, q13
    vld1.16         {d0, d1, d2, d3}, [WORKSPACE, :128]!
      veor.u16        q14, q14, q2  /* restore sign */
    vld1.16         {d20, d21, d22, d23}, [CORRECTION, :128]!
      veor.u16        q15, q15, q3
    vld1.16         {d16, d17, d18, d19}, [RECIPROCAL, :128]!
    vadd.u16        q12, q12, q10  /* add correction */
    vadd.u16        q13, q13, q11
    vmull.u16       q10, d24, d16  /* multiply by reciprocal */
    vmull.u16       q11, d25, d17
    vmull.u16       q8, d26, d18
    vmull.u16       q9, d27, d19
      vsub.u16        q14, q14, q2
    vld1.16         {d24, d25, d26, d27}, [SHIFT, :128]!
      vsub.u16        q15, q15, q3
    vshrn.u32       d20, q10, #16
    vshrn.u32       d21, q11, #16
      vst1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
    vshrn.u32       d22, q8, #16
    vshrn.u32       d23, q9, #16
    vshr.s16        q2, q0, #15    /* extract sign */
    vshr.s16        q3, q1, #15
    vshl.u16        q14, q10, q12  /* shift */
    vshl.u16        q15, q11, q13
    subs            LOOP_COUNT, LOOP_COUNT, #1

    veor.u16        q14, q14, q2   /* restore sign */
    veor.u16        q15, q15, q3
    vsub.u16        q14, q14, q2
    vsub.u16        q15, q15, q3
    vst1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
/*****************************************************************************/

/*
 * GLOBAL(void)
 * jsimd_h2v1_fancy_upsample_neon(int max_v_samp_factor,
 *                                JDIMENSION downsampled_width,
 *                                JSAMPARRAY input_data,
 *                                JSAMPARRAY *output_data_ptr);
 *
 * Note: the use of unaligned writes is the main remaining bottleneck in
 *       this code, and solving it could potentially yield a performance
 *       improvement of tens of percent on Cortex-A8/Cortex-A9.
 */
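
/*
 * Compile-time-disabled scalar C sketch (not part of the original code) of
 * the h2v1 fancy upsampling rule implemented by the upsample16/upsample32
 * macros below (cf. h2v1_fancy_upsample in jdsample.c): each input pixel
 * expands to two output pixels, blended 3:1 with the left and right
 * neighbour.  At the row edges the blend degenerates to a plain copy, which
 * is why the code below special-cases the first and last pixels.
 */
#if 0
#include <stdint.h>

static void h2v1_fancy_row(const uint8_t *in, uint8_t *out, int width)
{
  for (int i = 0; i < width; i++) {
    int prev = in[i > 0 ? i - 1 : 0];
    int next = in[i < width - 1 ? i + 1 : width - 1];

    out[2 * i]     = (uint8_t)((in[i] * 3 + prev + 1) >> 2);
    out[2 * i + 1] = (uint8_t)((in[i] * 3 + next + 2) >> 2);
  }
}
#endif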
/*
 * Upsample 16 source pixels to 32 destination pixels.  The new 16 source
 * pixels are loaded to q0.  The previous 16 source pixels are in q1.  The
 * shifted-by-one source pixels are constructed in q2 by using q0 and q1.
 * Register d28 is used for multiplication by 3.  Register q15 is used
 * for adding +1 bias.
 */
.macro upsample16 OUTPTR, INPTR
    vld1.8          {q0}, [\INPTR]!
    vext.8          q2, q1, q0, #15
    vaddw.u8        q10, q15, d4
    vaddw.u8        q11, q15, d5
    vmlal.u8        q8, d4, d28
    vmlal.u8        q9, d5, d28
    vmlal.u8        q10, d0, d28
    vmlal.u8        q11, d1, d28
    vmov            q1, q0  /* backup source pixels to q1 */
    vrshrn.u16      d6, q8, #2
    vrshrn.u16      d7, q9, #2
    vshrn.u16       d8, q10, #2
    vshrn.u16       d9, q11, #2
    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
.endm
/*
 * Upsample 32 source pixels to 64 destination pixels.  Compared to the
 * 'upsample16' macro, the roles of the q0 and q1 registers are reversed for
 * even and odd groups of 16 pixels, which is why the "vmov q1, q0"
 * instruction is not needed.  This unrolling also makes it possible to
 * reorder loads and stores to compensate for multiplication latency and
 * reduce stalls.
 */
.macro upsample32 OUTPTR, INPTR
    /* even 16 pixels group */
    vld1.8          {q0}, [\INPTR]!
    vext.8          q2, q1, q0, #15
    vaddw.u8        q10, q15, d4
    vaddw.u8        q11, q15, d5
    vmlal.u8        q8, d4, d28
    vmlal.u8        q9, d5, d28
    vmlal.u8        q10, d0, d28
    vmlal.u8        q11, d1, d28
    /* odd 16 pixels group */
    vld1.8          {q1}, [\INPTR]!
    vrshrn.u16      d6, q8, #2
    vrshrn.u16      d7, q9, #2
    vshrn.u16       d8, q10, #2
    vshrn.u16       d9, q11, #2
    vext.8          q2, q0, q1, #15
    vaddw.u8        q10, q15, d4
    vaddw.u8        q11, q15, d5
    vmlal.u8        q8, d4, d28
    vmlal.u8        q9, d5, d28
    vmlal.u8        q10, d2, d28
    vmlal.u8        q11, d3, d28
    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
    vrshrn.u16      d6, q8, #2
    vrshrn.u16      d7, q9, #2
    vshrn.u16       d8, q10, #2
    vshrn.u16       d9, q11, #2
    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
.endm
/*
 * Upsample a row of WIDTH pixels from INPTR to OUTPTR.
 */
.macro upsample_row OUTPTR, INPTR, WIDTH, TMP1
    /* special case for the first and last pixels */
    sub             \WIDTH, \WIDTH, #1
    add             \OUTPTR, \OUTPTR, #1
    ldrb            \TMP1, [\INPTR, \WIDTH]
    strb            \TMP1, [\OUTPTR, \WIDTH, asl #1]
    ldrb            \TMP1, [\INPTR], #1
    strb            \TMP1, [\OUTPTR, #-1]
    subs            \WIDTH, \WIDTH, #32
0:  /* process 32 pixels per iteration */
    upsample32      \OUTPTR, \INPTR
    subs            \WIDTH, \WIDTH, #32
    adds            \WIDTH, \WIDTH, #16
0:  /* process 16 pixels if needed */
    upsample16      \OUTPTR, \INPTR
    subs            \WIDTH, \WIDTH, #16
    adds            \WIDTH, \WIDTH, #16

    /* load the remaining 1-15 pixels */
    add             \INPTR, \INPTR, \WIDTH
    sub             \INPTR, \INPTR, #1
    vld1.8          {d0[0]}, [\INPTR]
    vext.8          d0, d0, d0, #6
    sub             \INPTR, \INPTR, #1
    vld1.8          {d0[1]}, [\INPTR]
    sub             \INPTR, \INPTR, #1
    vld1.8          {d0[0]}, [\INPTR]
    sub             \INPTR, \INPTR, #1
    vld1.8          {d0[3]}, [\INPTR]
    sub             \INPTR, \INPTR, #1
    vld1.8          {d0[2]}, [\INPTR]
    sub             \INPTR, \INPTR, #1
    vld1.8          {d0[1]}, [\INPTR]
    sub             \INPTR, \INPTR, #1
    vld1.8          {d0[0]}, [\INPTR]
    sub             \INPTR, \INPTR, #8
    vld1.8          {d0}, [\INPTR]
2:  /* upsample the remaining pixels */
    vext.8          q2, q1, q0, #15
    vaddw.u8        q10, q15, d4
    vaddw.u8        q11, q15, d5
    vmlal.u8        q8, d4, d28
    vmlal.u8        q9, d5, d28
    vmlal.u8        q10, d0, d28
    vmlal.u8        q11, d1, d28
    vrshrn.u16      d10, q8, #2
    vrshrn.u16      d12, q9, #2
    vshrn.u16       d11, q10, #2
    vshrn.u16       d13, q11, #2

    /* store the remaining pixels */
    vst1.8          {d10, d11}, [\OUTPTR]!
    vst1.8          {d10}, [\OUTPTR]!
    vst1.8          {d10[0]}, [\OUTPTR]!
    vst1.8          {d10[1]}, [\OUTPTR]!
    vst1.8          {d10[2]}, [\OUTPTR]!
    vst1.8          {d10[3]}, [\OUTPTR]!
    vext.8          d10, d10, d10, #4
    vst1.8          {d10[0]}, [\OUTPTR]!
    vst1.8          {d10[1]}, [\OUTPTR]!
.endm
asm_function jsimd_h2v1_fancy_upsample_neon
    MAX_V_SAMP_FACTOR  .req r0
    DOWNSAMPLED_WIDTH  .req r1
    OUTPUT_DATA_PTR    .req r3
    OUTPUT_DATA        .req OUTPUT_DATA_PTR

    push            {r4, r5, r6, lr}
    ldr             OUTPUT_DATA, [OUTPUT_DATA_PTR]
    cmp             MAX_V_SAMP_FACTOR, #0

    /* initialize constants */
    ldr             INPTR, [INPUT_DATA], #4
    ldr             OUTPTR, [OUTPUT_DATA], #4
    mov             WIDTH, DOWNSAMPLED_WIDTH
    upsample_row    OUTPTR, INPTR, WIDTH, TMP
    subs            MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1

    pop             {r4, r5, r6, pc}

    .unreq          MAX_V_SAMP_FACTOR
    .unreq          DOWNSAMPLED_WIDTH
    .unreq          OUTPUT_DATA_PTR

.purgem upsample_row
/*****************************************************************************/

/*
 * GLOBAL(JOCTET *)
 * jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer,
 *                             JCOEFPTR block, int last_dc_val,
 *                             c_derived_tbl *dctbl, c_derived_tbl *actbl)
 */
.macro emit_byte BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP
    sub             \PUT_BITS, \PUT_BITS, #0x8
    lsr             \TMP, \PUT_BUFFER, \PUT_BITS
    strb            \TMP, [\BUFFER, #1]!
    strbeq          \ZERO, [\BUFFER, #1]!
.endm

.macro put_bits PUT_BUFFER, PUT_BITS, CODE, SIZE
    /*lsl             \PUT_BUFFER, \PUT_BUFFER, \SIZE*/
    add             \PUT_BITS, \SIZE
    /*orr             \PUT_BUFFER, \PUT_BUFFER, \CODE*/
    orr             \PUT_BUFFER, \CODE, \PUT_BUFFER, lsl \SIZE
.endm

.macro checkbuf15 BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP
    cmp             \PUT_BITS, #0x10
    eor             \ZERO, \ZERO, \ZERO
    emit_byte       \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
    emit_byte       \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
.endm
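
/*
 * Compile-time-disabled C sketch (not part of the original code) of the bit
 * buffering scheme modelled by put_bits/emit_byte/checkbuf15 above: Huffman
 * codes are appended to the low bits of a 32-bit accumulator and flushed one
 * byte at a time, stuffing a zero byte after every emitted 0xFF as required
 * by JPEG.  (The asm works with a pre-decremented buffer pointer; this
 * sketch uses plain post-increment.)
 */
#if 0
#include <stdint.h>

static void put_bits_c(uint32_t *put_buffer, int *put_bits,
                       uint32_t code, int size)
{
  *put_bits += size;
  *put_buffer = (*put_buffer << size) | code;
}

static void emit_byte_c(uint8_t **buffer, uint32_t put_buffer, int *put_bits)
{
  *put_bits -= 8;
  uint8_t b = (uint8_t)(put_buffer >> *put_bits);

  *(*buffer)++ = b;
  if (b == 0xff)  /* JPEG byte stuffing */
    *(*buffer)++ = 0;
}

static void checkbuf15_c(uint8_t **buffer, uint32_t put_buffer, int *put_bits)
{
  if (*put_bits >= 16) {  /* flush two bytes when 16+ bits are pending */
    emit_byte_c(buffer, put_buffer, put_bits);
    emit_byte_c(buffer, put_buffer, put_bits);
  }
}
#endif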
.balign 16
jsimd_huff_encode_one_block_neon_consts:

asm_function jsimd_huff_encode_one_block_neon
    push            {r4, r5, r6, r7, r8, r9, r10, r11, lr}
    mov             sp, r4           /* align sp on 32 bytes */
    vst1.64         {d8, d9, d10, d11}, [r4, :128]!
    vst1.64         {d12, d13, d14, d15}, [r4, :128]
    sub             sp, #0x140       /* reserve 320 bytes */
    str             r0, [sp, #0x18]  /* working state > sp + 0x18 */
    add             r4, sp, #0x20    /* r4 = t1 */
    ldr             lr, [r7, #0x8]   /* lr = dctbl */
    sub             r10, r1, #0x1    /* r10 = buffer-- */
    adr             r5, jsimd_huff_encode_one_block_neon_consts
    vld1.8          {d26}, [r5, :64]
    vld1.16         {d2[0]}, [r9, :16]
    vld1.16         {d4[0]}, [r8, :16]
    vld1.16         {d6[0]}, [r3, :16]
    vld1.16         {d0[1]}, [r1, :16]
    vld1.16         {d2[1]}, [r9, :16]
    vld1.16         {d4[1]}, [r8, :16]
    vld1.16         {d6[1]}, [r3, :16]
    vld1.16         {d0[2]}, [r1, :16]
    vld1.16         {d2[2]}, [r9, :16]
    vld1.16         {d4[2]}, [r8, :16]
    vld1.16         {d6[2]}, [r3, :16]
    vld1.16         {d0[3]}, [r1, :16]
    vld1.16         {d2[3]}, [r9, :16]
    vld1.16         {d4[3]}, [r8, :16]
    vld1.16         {d6[3]}, [r3, :16]
    vld1.16         {d1[0]}, [r1, :16]
    vld1.16         {d3[0]}, [r9, :16]
    vld1.16         {d5[0]}, [r8, :16]
    vld1.16         {d7[0]}, [r3, :16]
    vld1.16         {d1[1]}, [r1, :16]
    vld1.16         {d3[1]}, [r9, :16]
    vld1.16         {d5[1]}, [r8, :16]
    vld1.16         {d7[1]}, [r3, :16]
    vld1.16         {d1[2]}, [r1, :16]
    vld1.16         {d3[2]}, [r9, :16]
    vld1.16         {d5[2]}, [r8, :16]
    vld1.16         {d7[2]}, [r3, :16]
    vld1.16         {d1[3]}, [r1, :16]
    vld1.16         {d3[3]}, [r9, :16]
    vld1.16         {d5[3]}, [r8, :16]
    vld1.16         {d7[3]}, [r3, :16]
    vcgt.s16        q10, q10, q2
    vcgt.s16        q11, q11, q3
    vsub.i16        q0, q14, q0
    vsub.i16        q1, q14, q1
    vsub.i16        q2, q14, q2
    vsub.i16        q3, q14, q3
    vst1.16         {d0, d1, d2, d3}, [r4, :256]
    vst1.16         {d4, d5, d6, d7}, [r9, :256]
    vshl.s16        q0, q15, q0
    vshl.s16        q1, q15, q1
    vshl.s16        q2, q15, q2
    vshl.s16        q3, q15, q3
    vsub.i16        q0, q0, q15
    vsub.i16        q1, q1, q15
    vsub.i16        q2, q2, q15
    vsub.i16        q3, q3, q15
    vst1.16         {d16, d17, d18, d19}, [r8, :256]
    vst1.16         {d20, d21, d22, d23}, [r3, :256]
    vld1.16         {d8[0]}, [r1, :16]
    vld1.16         {d10[0]}, [r9, :16]
    vld1.16         {d12[0]}, [r8, :16]
    vld1.16         {d14[0]}, [r3, :16]
    vld1.16         {d8[1]}, [r1, :16]
    vld1.16         {d10[1]}, [r9, :16]
    vld1.16         {d12[1]}, [r8, :16]
    vld1.16         {d14[1]}, [r3, :16]
    vld1.16         {d8[2]}, [r1, :16]
    vld1.16         {d10[2]}, [r9, :16]
    vld1.16         {d12[2]}, [r8, :16]
    vld1.16         {d14[2]}, [r3, :16]
    vld1.16         {d8[3]}, [r1, :16]
    vld1.16         {d10[3]}, [r9, :16]
    vld1.16         {d12[3]}, [r8, :16]
    vld1.16         {d14[3]}, [r3, :16]
    vld1.16         {d9[0]}, [r1, :16]
    vld1.16         {d11[0]}, [r9, :16]
    vld1.16         {d13[0]}, [r8, :16]
    vld1.16         {d15[0]}, [r3, :16]
    vld1.16         {d9[1]}, [r1, :16]
    vld1.16         {d11[1]}, [r9, :16]
    vld1.16         {d13[1]}, [r8, :16]
    vld1.16         {d15[1]}, [r3, :16]
    vld1.16         {d9[2]}, [r1, :16]
    vld1.16         {d11[2]}, [r9, :16]
    vld1.16         {d13[2]}, [r8, :16]
    vld1.16         {d15[2]}, [r3, :16]
    vld1.16         {d9[3]}, [r1, :16]
    vld1.16         {d11[3]}, [r9, :16]
    vld1.16         {d13[3]}, [r8, :16]
    vld1.16         {d15[3]}, [r3, :16]
    vcgt.s16        q10, q10, q6
    vcgt.s16        q11, q11, q7
    vsub.i16        q4, q14, q4
    vsub.i16        q5, q14, q5
    vsub.i16        q6, q14, q6
    vsub.i16        q7, q14, q7
    vst1.16         {d8, d9, d10, d11}, [r1, :256]
    vst1.16         {d12, d13, d14, d15}, [r9, :256]
    vshl.s16        q4, q15, q4
    vshl.s16        q5, q15, q5
    vshl.s16        q6, q15, q6
    vshl.s16        q7, q15, q7
    vsub.i16        q4, q4, q15
    vsub.i16        q5, q5, q15
    vsub.i16        q6, q6, q15
    vsub.i16        q7, q7, q15
    vst1.16         {d16, d17, d18, d19}, [r8, :256]
    vst1.16         {d20, d21, d22, d23}, [r3, :256]
    ldr             r12, [r7, #0xc]   /* r12 = actbl */
    add             r1, lr, #0x400    /* r1 = dctbl->ehufsi */
    mov             r9, r12           /* r9 = actbl */
    add             r6, r4, #0x80     /* r6 = t2 */
    ldr             r11, [r0, #0x8]   /* r11 = put_buffer */
    ldr             r4, [r0, #0xc]    /* r4 = put_bits */
    ldrh            r2, [r6, #-128]   /* r2 = nbits */
    ldrh            r3, [r6]          /* r3 = temp2 & (((JLONG)1) << nbits) - 1 */
    ldr             r0, [lr, r2, lsl #2]
    put_bits        r11, r4, r0, r5
    checkbuf15      r10, r11, r4, r5, r0
    put_bits        r11, r4, r3, r2
    checkbuf15      r10, r11, r4, r5, r0
    mov             lr, r6            /* lr = t2 */
    add             r5, r9, #0x400    /* r5 = actbl->ehufsi */
    ldrsb           r6, [r5, #0xf0]   /* r6 = actbl->ehufsi[0xf0] */
    vpadd.i8        d8, d8, d10
    vpadd.i8        d12, d12, d14
    vpadd.i8        d8, d8, d12
    rrx             r8, r8            /* shift in last r1 bit while shifting out DC bit */
    rbit            r1, r1            /* r1 = index1 */
    rbit            r8, r8            /* r8 = index0 */
    ldr             r0, [r9, #0x3c0]  /* r0 = actbl->ehufco[0xf0] */
    str             r1, [sp, #0x14]   /* index1 > sp + 0x14 */
    add             lr, lr, r2, lsl #1
    ldrh            r1, [lr, #-126]
    put_bits        r11, r4, r0, r6
    emit_byte       r10, r11, r4, r3, r12
    emit_byte       r10, r11, r4, r3, r12
    add             r2, r1, r2, lsl #4
    ldr             r12, [r9, r2, lsl #2]
    put_bits        r11, r4, r12, r2
    checkbuf15      r10, r11, r4, r2, r12
    put_bits        r11, r4, r3, r1
    checkbuf15      r10, r11, r4, r2, r12
    add             r12, sp, #0x20    /* r12 = t1 */
    ldr             r8, [sp, #0x14]   /* r8 = index1 */
    adds            r12, #0xc0        /* r12 = t2 + (DCTSIZE2/2) */
    add             r2, r2, r12, lsr #1
    add             lr, lr, r2, lsl #1
    add             lr, lr, r2, lsl #1
    ldrh            r1, [lr, #-126]
    put_bits        r11, r4, r0, r6
    emit_byte       r10, r11, r4, r3, r12
    emit_byte       r10, r11, r4, r3, r12
    add             r2, r1, r2, lsl #4
    ldr             r12, [r9, r2, lsl #2]
    put_bits        r11, r4, r12, r2
    checkbuf15      r10, r11, r4, r2, r12
    put_bits        r11, r4, r3, r1
    checkbuf15      r10, r11, r4, r2, r12
    put_bits        r11, r4, r1, r0
    checkbuf15      r10, r11, r4, r0, r1
    ldr             r12, [sp, #0x18]
    str             r11, [r12, #0x8]
    vld1.64         {d8, d9, d10, d11}, [r4, :128]!
    vld1.64         {d12, d13, d14, d15}, [r4, :128]
    pop             {r4, r5, r6, r7, r8, r9, r10, r11, pc}
asm_function jsimd_pick_color
    @ RGB_BUFFER   .req r0
    @ OUTPUT_WIDTH .req r2

    push            {r3, r4, r5, lr}

    VLD3.8          {d0, d2, d4}, [r0]!
    VLD3.8          {d1, d3, d5}, [r0]!
    VLD3.8          {d6, d8, d10}, [r0]!
    VLD3.8          {d7, d9, d11}, [r0]!
    BLT             PROCESS_U_8  @ ignore fewer than 8 pixels for now
    VLD3.8          {d6, d7, d8}, [r0]!
    VLD3.8          {d6[0], d7[0], d8[0]}, [r0]!
    VLD3.8          {d6[1], d7[1], d8[1]}, [r0]!
    VLD3.8          {d6[2], d7[2], d8[2]}, [r0]!
    VLD3.8          {d6[3], d7[3], d8[3]}, [r0]!
    VLD3.8          {d0, d2, d4}, [r0]!

    pop             {r3, r4, r5, pc}