2 * ARMv8 NEON optimizations for libjpeg-turbo
4 * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
6 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
7 * Copyright (C) 2013-2014, Linaro Limited. All Rights Reserved.
8 * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org>
9 * Copyright (C) 2014-2016, D. R. Commander. All Rights Reserved.
10 * Copyright (C) 2015-2016, Matthieu Darbois. All Rights Reserved.
11 * Copyright (C) 2016, Siarhei Siamashka. All Rights Reserved.
13 * This software is provided 'as-is', without any express or implied
14 * warranty. In no event will the authors be held liable for any damages
15 * arising from the use of this software.
17 * Permission is granted to anyone to use this software for any purpose,
18 * including commercial applications, and to alter it and redistribute it
19 * freely, subject to the following restrictions:
21 * 1. The origin of this software must not be misrepresented; you must not
22 * claim that you wrote the original software. If you use this software
23 * in a product, an acknowledgment in the product documentation would be
24 * appreciated but is not required.
25 * 2. Altered source versions must be plainly marked as such, and must not be
26 * misrepresented as being the original software.
27 * 3. This notice may not be removed or altered from any source distribution.
30 #if defined(__linux__) && defined(__ELF__)
31 .section .note.GNU-stack, "", %progbits /* mark stack as non-executable */
37 #define RESPECT_STRICT_ALIGNMENT 1
40 /*****************************************************************************/
42 /* Supplementary macro for setting function attributes */
43 .macro asm_function fname
51 .type \fname, %function
57 /* Transpose elements of a single 128-bit register */
58 .macro transpose_single x0, x1, xi, xilen, literal
59 ins \xi\xilen[0], \x0\xilen[0]
60 ins \x1\xilen[0], \x0\xilen[1]
61 trn1 \x0\literal, \x0\literal, \x1\literal
62 trn2 \x1\literal, \xi\literal, \x1\literal
65 /* Transpose elements of 2 different registers */
66 .macro transpose x0, x1, xi, xilen, literal
67 mov \xi\xilen, \x0\xilen
68 trn1 \x0\literal, \x0\literal, \x1\literal
69 trn2 \x1\literal, \xi\literal, \x1\literal
72 /* Transpose a block of 4x4 coefficients in four 64-bit registers */
73 .macro transpose_4x4_32 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
74 mov \xi\xilen, \x0\xilen
75 trn1 \x0\x0len, \x0\x0len, \x2\x2len
76 trn2 \x2\x2len, \xi\x0len, \x2\x2len
77 mov \xi\xilen, \x1\xilen
78 trn1 \x1\x1len, \x1\x1len, \x3\x3len
79 trn2 \x3\x3len, \xi\x1len, \x3\x3len
82 .macro transpose_4x4_16 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
83 mov \xi\xilen, \x0\xilen
84 trn1 \x0\x0len, \x0\x0len, \x1\x1len
85 trn2 \x1\x2len, \xi\x0len, \x1\x2len
86 mov \xi\xilen, \x2\xilen
87 trn1 \x2\x2len, \x2\x2len, \x3\x3len
88 trn2 \x3\x2len, \xi\x1len, \x3\x3len
91 .macro transpose_4x4 x0, x1, x2, x3, x5
92 transpose_4x4_16 \x0, .4h, \x1, .4h, \x2, .4h, \x3, .4h, \x5, .16b
93 transpose_4x4_32 \x0, .2s, \x1, .2s, \x2, .2s, \x3, .2s, \x5, .16b
96 .macro transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3
97 trn1 \t0\().8h, \l0\().8h, \l1\().8h
98 trn1 \t1\().8h, \l2\().8h, \l3\().8h
99 trn1 \t2\().8h, \l4\().8h, \l5\().8h
100 trn1 \t3\().8h, \l6\().8h, \l7\().8h
101 trn2 \l1\().8h, \l0\().8h, \l1\().8h
102 trn2 \l3\().8h, \l2\().8h, \l3\().8h
103 trn2 \l5\().8h, \l4\().8h, \l5\().8h
104 trn2 \l7\().8h, \l6\().8h, \l7\().8h
106 trn1 \l4\().4s, \t2\().4s, \t3\().4s
107 trn2 \t3\().4s, \t2\().4s, \t3\().4s
108 trn1 \t2\().4s, \t0\().4s, \t1\().4s
109 trn2 \l2\().4s, \t0\().4s, \t1\().4s
110 trn1 \t0\().4s, \l1\().4s, \l3\().4s
111 trn2 \l3\().4s, \l1\().4s, \l3\().4s
112 trn2 \t1\().4s, \l5\().4s, \l7\().4s
113 trn1 \l5\().4s, \l5\().4s, \l7\().4s
115 trn2 \l6\().2d, \l2\().2d, \t3\().2d
116 trn1 \l0\().2d, \t2\().2d, \l4\().2d
117 trn1 \l1\().2d, \t0\().2d, \l5\().2d
118 trn2 \l7\().2d, \l3\().2d, \t1\().2d
119 trn1 \l2\().2d, \l2\().2d, \t3\().2d
120 trn2 \l4\().2d, \t2\().2d, \l4\().2d
121 trn1 \l3\().2d, \l3\().2d, \t1\().2d
122 trn2 \l5\().2d, \t0\().2d, \l5\().2d
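/* The transpose_8x8 macro above transposes an 8x8 matrix of 16-bit elements
 * held one row per register in l0..l7, using three rounds of trn1/trn2 at
 * 16-, 32- and 64-bit granularity (t0..t3 are scratch registers).  A scalar
 * C sketch of the net effect (illustration only; the helper name is made up
 * and the code is not part of the build):
 *
 *   #include <stdint.h>
 *
 *   static void transpose_8x8_ref(int16_t m[8][8])
 *   {
 *     for (int r = 0; r < 8; r++)
 *       for (int c = r + 1; c < 8; c++) {
 *         int16_t t = m[r][c];
 *         m[r][c] = m[c][r];
 *         m[c][r] = t;
 *       }
 *   }
 */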
126 #define CENTERJSAMPLE 128
128 /*****************************************************************************/
131 * Perform dequantization and inverse DCT on one block of coefficients.
134 * jsimd_idct_islow_neon (void *dct_table, JCOEFPTR coef_block,
135 * JSAMPARRAY output_buf, JDIMENSION output_col)
138 #define CONST_BITS 13
141 #define F_0_298 2446 /* FIX(0.298631336) */
142 #define F_0_390 3196 /* FIX(0.390180644) */
143 #define F_0_541 4433 /* FIX(0.541196100) */
144 #define F_0_765 6270 /* FIX(0.765366865) */
145 #define F_0_899 7373 /* FIX(0.899976223) */
146 #define F_1_175 9633 /* FIX(1.175875602) */
147 #define F_1_501 12299 /* FIX(1.501321110) */
148 #define F_1_847 15137 /* FIX(1.847759065) */
149 #define F_1_961 16069 /* FIX(1.961570560) */
150 #define F_2_053 16819 /* FIX(2.053119869) */
151 #define F_2_562 20995 /* FIX(2.562915447) */
152 #define F_3_072 25172 /* FIX(3.072711026) */
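/* These F_* values are the standard libjpeg fixed-point constants: FIX(x)
 * scales x by 2^CONST_BITS (8192) and rounds to the nearest integer.  A
 * minimal C sketch of the encoding (illustration only, not part of the
 * build):
 *
 *   #define CONST_BITS  13
 *   #define FIX(x)      ((int)((x) * (1 << CONST_BITS) + 0.5))
 *
 * For example, FIX(0.541196100) = (int)(0.541196100 * 8192 + 0.5) = 4433,
 * which matches F_0_541 above.  Products by these constants are later
 * descaled with the right shifts noted in the DESCALE() comments below.
 */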
155 Ljsimd_idct_islow_neon_consts:
168 .short 0 /* padding */
186 #define XFIX_P_0_298 v0.h[0]
187 #define XFIX_N_0_390 v0.h[1]
188 #define XFIX_P_0_541 v0.h[2]
189 #define XFIX_P_0_765 v0.h[3]
190 #define XFIX_N_0_899 v0.h[4]
191 #define XFIX_P_1_175 v0.h[5]
192 #define XFIX_P_1_501 v0.h[6]
193 #define XFIX_N_1_847 v0.h[7]
194 #define XFIX_N_1_961 v1.h[0]
195 #define XFIX_P_2_053 v1.h[1]
196 #define XFIX_N_2_562 v1.h[2]
197 #define XFIX_P_3_072 v1.h[3]
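/* The constant table above is loaded into v0.8h/v1.8h below; these XFIX_*
 * macros simply name the individual lanes so that the smull/smlal
 * instructions can reference each fixed-point constant as a by-element
 * operand. */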
199 asm_function jsimd_idct_islow_neon
213 /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
214 guarantee that the upper (unused) 32 bits of x3 are valid. This
215 instruction ensures that those bits are set to zero. */
219 adr x15, Ljsimd_idct_islow_neon_consts
221 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32
222 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32
223 ld1 {v0.8h, v1.8h}, [x15]
224 ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64
225 ld1 {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64
226 ld1 {v6.8h, v7.8h, v8.8h, v9.8h}, [COEF_BLOCK], #64
227 ld1 {v22.8h, v23.8h, v24.8h, v25.8h}, [DCT_TABLE], #64
229 cmeq v16.8h, v3.8h, #0
230 cmeq v26.8h, v4.8h, #0
231 cmeq v27.8h, v5.8h, #0
232 cmeq v28.8h, v6.8h, #0
233 cmeq v29.8h, v7.8h, #0
234 cmeq v30.8h, v8.8h, #0
235 cmeq v31.8h, v9.8h, #0
237 and v10.16b, v16.16b, v26.16b
238 and v11.16b, v27.16b, v28.16b
239 and v12.16b, v29.16b, v30.16b
240 and v13.16b, v31.16b, v10.16b
241 and v14.16b, v11.16b, v12.16b
242 mul v2.8h, v2.8h, v18.8h
243 and v15.16b, v13.16b, v14.16b
244 shl v10.8h, v2.8h, #(PASS1_BITS)
250 /* Case: all AC coefficients are zero */
260 /* For this transpose, we need to organize the data like this:
261 * 00, 01, 02, 03, 40, 41, 42, 43
262 * 10, 11, 12, 13, 50, 51, 52, 53
263 * 20, 21, 22, 23, 60, 61, 62, 63
264 * 30, 31, 32, 33, 70, 71, 72, 73
265 * 04, 05, 06, 07, 44, 45, 46, 47
266 * 14, 15, 16, 17, 54, 55, 56, 57
267 * 24, 25, 26, 27, 64, 65, 66, 67
268 * 34, 35, 36, 37, 74, 75, 76, 77
270 trn1 v28.8h, v2.8h, v3.8h
271 trn1 v29.8h, v4.8h, v5.8h
272 trn1 v30.8h, v6.8h, v7.8h
273 trn1 v31.8h, v8.8h, v9.8h
274 trn2 v16.8h, v2.8h, v3.8h
275 trn2 v17.8h, v4.8h, v5.8h
276 trn2 v18.8h, v6.8h, v7.8h
277 trn2 v19.8h, v8.8h, v9.8h
278 trn1 v2.4s, v28.4s, v29.4s
279 trn1 v6.4s, v30.4s, v31.4s
280 trn1 v3.4s, v16.4s, v17.4s
281 trn1 v7.4s, v18.4s, v19.4s
282 trn2 v4.4s, v28.4s, v29.4s
283 trn2 v8.4s, v30.4s, v31.4s
284 trn2 v5.4s, v16.4s, v17.4s
285 trn2 v9.4s, v18.4s, v19.4s
286 /* Even part: reverse the even part of the forward DCT. */
287 add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
288 add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
289 smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
290 sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
291 smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
292 sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
293 mov v21.16b, v19.16b /* tmp3 = z1 */
294 mov v20.16b, v18.16b /* tmp3 = z1 */
295 smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
296 smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
297 sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
298 smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
299 smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
300 sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
301 sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
302 add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */
303 sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */
304 add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */
305 sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */
306 add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */
307 sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */
308 add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */
309 sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */
311 /* Odd part per figure 8; the matrix is unitary and hence its
312 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
315 add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
316 add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
317 add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
318 add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
319 add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */
321 smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
322 smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
323 smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
324 smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
325 smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
326 smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
327 smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
328 smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
329 smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
331 smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
332 smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
333 smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
334 smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
335 smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
336 smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
337 smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
338 smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
339 smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
341 add v23.4s, v23.4s, v27.4s /* z3 += z5 */
342 add v22.4s, v22.4s, v26.4s /* z3 += z5 */
343 add v25.4s, v25.4s, v27.4s /* z4 += z5 */
344 add v24.4s, v24.4s, v26.4s /* z4 += z5 */
346 add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */
347 add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */
348 add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */
349 add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */
350 add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */
351 add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */
352 add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */
353 add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */
355 add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */
356 add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */
357 add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */
358 add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */
359 add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */
360 add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */
361 add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */
362 add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */
364 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
366 add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */
367 add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */
368 sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */
369 sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */
370 add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */
371 add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */
372 sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */
373 sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */
374 add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */
375 add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */
376 sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */
377 sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */
378 add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */
379 add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */
380 sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */
381 sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */
383 shrn v2.4h, v18.4s, #16 /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
384 shrn v9.4h, v20.4s, #16 /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
385 shrn v3.4h, v22.4s, #16 /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
386 shrn v8.4h, v24.4s, #16 /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
387 shrn v4.4h, v26.4s, #16 /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
388 shrn v7.4h, v28.4s, #16 /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
389 shrn v5.4h, v14.4s, #16 /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
390 shrn v6.4h, v16.4s, #16 /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
391 shrn2 v2.8h, v19.4s, #16 /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
392 shrn2 v9.8h, v21.4s, #16 /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
393 shrn2 v3.8h, v23.4s, #16 /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
394 shrn2 v8.8h, v25.4s, #16 /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
395 shrn2 v4.8h, v27.4s, #16 /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
396 shrn2 v7.8h, v29.4s, #16 /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
397 shrn2 v5.8h, v15.4s, #16 /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
398 shrn2 v6.8h, v17.4s, #16 /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
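/* The second-pass descale by CONST_BITS+PASS1_BITS+3 bits is done in two
 * steps: the plain shrn/shrn2 narrowing shifts above drop 16 bits, and the
 * rounding, saturating sqrshrn/sqrshrn2 instructions below drop the
 * remaining CONST_BITS+PASS1_BITS+3-16 bits while narrowing to 8 bits. */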
399 movi v0.16b, #(CENTERJSAMPLE)
400 /* Prepare pointers (dual-issue with NEON instructions) */
401 ldp TMP1, TMP2, [OUTPUT_BUF], 16
402 sqrshrn v28.8b, v2.8h, #(CONST_BITS+PASS1_BITS+3-16)
403 ldp TMP3, TMP4, [OUTPUT_BUF], 16
404 sqrshrn v29.8b, v3.8h, #(CONST_BITS+PASS1_BITS+3-16)
405 add TMP1, TMP1, OUTPUT_COL
406 sqrshrn v30.8b, v4.8h, #(CONST_BITS+PASS1_BITS+3-16)
407 add TMP2, TMP2, OUTPUT_COL
408 sqrshrn v31.8b, v5.8h, #(CONST_BITS+PASS1_BITS+3-16)
409 add TMP3, TMP3, OUTPUT_COL
410 sqrshrn2 v28.16b, v6.8h, #(CONST_BITS+PASS1_BITS+3-16)
411 add TMP4, TMP4, OUTPUT_COL
412 sqrshrn2 v29.16b, v7.8h, #(CONST_BITS+PASS1_BITS+3-16)
413 ldp TMP5, TMP6, [OUTPUT_BUF], 16
414 sqrshrn2 v30.16b, v8.8h, #(CONST_BITS+PASS1_BITS+3-16)
415 ldp TMP7, TMP8, [OUTPUT_BUF], 16
416 sqrshrn2 v31.16b, v9.8h, #(CONST_BITS+PASS1_BITS+3-16)
417 add TMP5, TMP5, OUTPUT_COL
418 add v16.16b, v28.16b, v0.16b
419 add TMP6, TMP6, OUTPUT_COL
420 add v18.16b, v29.16b, v0.16b
421 add TMP7, TMP7, OUTPUT_COL
422 add v20.16b, v30.16b, v0.16b
423 add TMP8, TMP8, OUTPUT_COL
424 add v22.16b, v31.16b, v0.16b
426 /* Transpose the final 8-bit samples */
427 trn1 v28.16b, v16.16b, v18.16b
428 trn1 v30.16b, v20.16b, v22.16b
429 trn2 v29.16b, v16.16b, v18.16b
430 trn2 v31.16b, v20.16b, v22.16b
432 trn1 v16.8h, v28.8h, v30.8h
433 trn2 v18.8h, v28.8h, v30.8h
434 trn1 v20.8h, v29.8h, v31.8h
435 trn2 v22.8h, v29.8h, v31.8h
437 uzp1 v28.4s, v16.4s, v18.4s
438 uzp2 v30.4s, v16.4s, v18.4s
439 uzp1 v29.4s, v20.4s, v22.4s
440 uzp2 v31.4s, v20.4s, v22.4s
442 /* Store results to the output buffer */
443 st1 {v28.d}[0], [TMP1]
444 st1 {v29.d}[0], [TMP2]
445 st1 {v28.d}[1], [TMP3]
446 st1 {v29.d}[1], [TMP4]
447 st1 {v30.d}[0], [TMP5]
448 st1 {v31.d}[0], [TMP6]
449 st1 {v30.d}[1], [TMP7]
450 st1 {v31.d}[1], [TMP8]
451 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
452 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32
457 mul v3.8h, v3.8h, v19.8h
458 mul v4.8h, v4.8h, v20.8h
459 mul v5.8h, v5.8h, v21.8h
460 add TMP4, xzr, TMP2, LSL #32
461 mul v6.8h, v6.8h, v22.8h
462 mul v7.8h, v7.8h, v23.8h
463 adds TMP3, xzr, TMP2, LSR #32
464 mul v8.8h, v8.8h, v24.8h
465 mul v9.8h, v9.8h, v25.8h
467 /* Right AC coef is zero */
469 /* Even part: reverse the even part of the forward DCT. */
470 add v18.4h, v4.4h, v8.4h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
471 add v22.4h, v2.4h, v6.4h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
472 sub v26.4h, v2.4h, v6.4h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
473 smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
474 sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
475 mov v20.16b, v18.16b /* tmp3 = z1 */
476 sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
477 smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
478 smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
479 add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */
480 sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */
481 add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */
482 sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */
484 /* Odd part per figure 8; the matrix is unitary and hence its
485 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
488 add v22.4h, v9.4h, v5.4h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
489 add v24.4h, v7.4h, v3.4h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
490 add v18.4h, v9.4h, v3.4h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
491 add v20.4h, v7.4h, v5.4h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
492 add v26.4h, v22.4h, v24.4h /* z5 = z3 + z4 */
494 smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
495 smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
496 smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
497 smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
498 smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
499 smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
500 smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
501 smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
502 smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
504 add v22.4s, v22.4s, v26.4s /* z3 += z5 */
505 add v24.4s, v24.4s, v26.4s /* z4 += z5 */
507 add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */
508 add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */
509 add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */
510 add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */
512 add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */
513 add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */
514 add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */
515 add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */
517 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
519 add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */
520 sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */
521 add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */
522 sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */
523 add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */
524 sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */
525 add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */
526 sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */
528 rshrn v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
529 rshrn v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
530 rshrn v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
531 rshrn v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
532 rshrn2 v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
533 rshrn2 v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
534 rshrn2 v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
535 rshrn2 v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
545 /* Left AC coef is zero */
547 /* Even part: reverse the even part of the forward DCT. */
548 add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
549 add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
550 smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
551 sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
552 sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
553 mov v21.16b, v19.16b /* tmp3 = z1 */
554 smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
555 sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
556 smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
557 add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */
558 sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */
559 add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */
560 sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */
562 /* Odd part per figure 8; the matrix is unitary and hence its
563 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
566 add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
567 add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
568 add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
569 add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
570 add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */
572 smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
573 smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
574 smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
575 smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
576 smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
577 smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
578 smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
579 smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
580 smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
582 add v23.4s, v23.4s, v27.4s /* z3 += z5 */
583 add v22.4s, v22.4s, v26.4s /* z3 += z5 */
584 add v25.4s, v25.4s, v27.4s /* z4 += z5 */
585 add v24.4s, v24.4s, v26.4s /* z4 += z5 */
587 add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */
588 add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */
589 add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */
590 add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */
592 add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */
593 add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */
594 add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */
595 add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */
597 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
599 add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */
600 sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */
601 add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */
602 sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */
603 add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */
604 sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */
605 add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */
606 sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */
612 rshrn v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
613 rshrn v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
614 rshrn v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
615 rshrn v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
616 rshrn2 v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
617 rshrn2 v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
618 rshrn2 v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
619 rshrn2 v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
624 /* "No" AC coef is zero */
625 /* Even part: reverse the even part of the forward DCT. */
626 add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
627 add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
628 smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
629 sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
630 smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
631 sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
632 mov v21.16b, v19.16b /* tmp3 = z1 */
633 mov v20.16b, v18.16b /* tmp3 = z1 */
634 smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
635 smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
636 sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
637 smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
638 smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
639 sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
640 sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
641 add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */
642 sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */
643 add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */
644 sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */
645 add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */
646 sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */
647 add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */
648 sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */
650 /* Odd part per figure 8; the matrix is unitary and hence its
651 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
654 add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
655 add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
656 add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
657 add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
658 add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */
660 smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
661 smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
662 smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
663 smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
664 smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
665 smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
666 smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
667 smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
668 smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
670 smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
671 smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
672 smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
673 smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
674 smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
675 smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
676 smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
677 smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
678 smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
680 add v23.4s, v23.4s, v27.4s /* z3 += z5 */
681 add v22.4s, v22.4s, v26.4s /* z3 += z5 */
682 add v25.4s, v25.4s, v27.4s /* z4 += z5 */
683 add v24.4s, v24.4s, v26.4s /* z4 += z5 */
685 add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */
686 add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */
687 add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */
688 add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */
689 add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */
690 add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */
691 add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */
692 add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */
694 add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */
695 add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */
696 add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */
697 add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */
698 add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */
699 add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */
700 add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */
701 add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */
703 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
705 add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */
706 add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */
707 sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */
708 sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */
709 add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */
710 add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */
711 sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */
712 sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */
713 add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */
714 add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */
715 sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */
716 sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */
717 add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */
718 add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */
719 sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */
720 sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */
722 rshrn v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
723 rshrn v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
724 rshrn v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
725 rshrn v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
726 rshrn v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
727 rshrn v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
728 rshrn v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
729 rshrn v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
730 rshrn2 v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
731 rshrn2 v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
732 rshrn2 v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
733 rshrn2 v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
734 rshrn2 v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
735 rshrn2 v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
736 rshrn2 v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
737 rshrn2 v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
770 /*****************************************************************************/
773 * jsimd_idct_ifast_neon
775 * This function contains a fast but less accurate integer implementation of
776 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
777 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
778 * function from jidctfst.c.
780 * Normally a 1-D AAN DCT needs 5 multiplications and 29 additions. In the
781 * ARM NEON case, however, some extra additions are required because the
782 * VQDMULH instruction can't handle constants larger than 1. So expressions
783 * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
784 * which introduces an extra addition. Overall, there are 6 extra additions
785 * per 1-D IDCT pass, for a total of 5 VQDMULH and 35 VADD/VSUB instructions.
788 #define XFIX_1_082392200 v0.h[0]
789 #define XFIX_1_414213562 v0.h[1]
790 #define XFIX_1_847759065 v0.h[2]
791 #define XFIX_2_613125930 v0.h[3]
794 Ljsimd_idct_ifast_neon_consts:
795 .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
796 .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
797 .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
798 .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
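/* Each constant above is a Q15 encoding of the amount by which the AAN
 * constant exceeds 1 (or 2 in the XFIX_2_613125930 case), approximated as a
 * ratio of small integers: e.g. 1.082392200 ~= 277/256, and the stored value
 * (277 - 256) * 128 = 2688 is 0.08203125 in Q15.  A minimal C sketch of how
 * SQDMULH uses such a constant (illustration only; saturation is omitted,
 * the helper name is made up, and the code is not part of the build):
 *
 *   #include <stdint.h>
 *
 *   static int16_t sqdmulh16(int16_t x, int16_t c)
 *   {
 *     return (int16_t)(((int32_t)x * c) >> 15);
 *   }
 *
 * so that x * 1.082392200 is computed as x + sqdmulh16(x, 2688), and
 * x * 2.613125930 is computed as 2 * x + sqdmulh16(x, 20096), matching the
 * "x * 0.082392200 + x" rewrite described in the header comment above.
 */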
800 asm_function jsimd_idct_ifast_neon
815 /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
816 guarantee that the upper (unused) 32 bits of x3 are valid. This
817 instruction ensures that those bits are set to zero. */
820 /* Load and dequantize coefficients into NEON registers
821 * with the following allocation:
824 * 0 | d16 | d17 ( v16.8h )
825 * 1 | d18 | d19 ( v17.8h )
826 * 2 | d20 | d21 ( v18.8h )
827 * 3 | d22 | d23 ( v19.8h )
828 * 4 | d24 | d25 ( v20.8h )
829 * 5 | d26 | d27 ( v21.8h )
830 * 6 | d28 | d29 ( v22.8h )
831 * 7 | d30 | d31 ( v23.8h )
833 /* Save NEON registers used in fast IDCT */
834 adr TMP5, Ljsimd_idct_ifast_neon_consts
835 ld1 {v16.8h, v17.8h}, [COEF_BLOCK], 32
836 ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32
837 ld1 {v18.8h, v19.8h}, [COEF_BLOCK], 32
838 mul v16.8h, v16.8h, v0.8h
839 ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32
840 mul v17.8h, v17.8h, v1.8h
841 ld1 {v20.8h, v21.8h}, [COEF_BLOCK], 32
842 mul v18.8h, v18.8h, v2.8h
843 ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32
844 mul v19.8h, v19.8h, v3.8h
845 ld1 {v22.8h, v23.8h}, [COEF_BLOCK], 32
846 mul v20.8h, v20.8h, v0.8h
847 ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32
848 mul v22.8h, v22.8h, v2.8h
849 mul v21.8h, v21.8h, v1.8h
850 ld1 {v0.4h}, [TMP5] /* load constants */
851 mul v23.8h, v23.8h, v3.8h
853 /* 1-D IDCT, pass 1 */
854 sub v2.8h, v18.8h, v22.8h
855 add v22.8h, v18.8h, v22.8h
856 sub v1.8h, v19.8h, v21.8h
857 add v21.8h, v19.8h, v21.8h
858 sub v5.8h, v17.8h, v23.8h
859 add v23.8h, v17.8h, v23.8h
860 sqdmulh v4.8h, v2.8h, XFIX_1_414213562
861 sqdmulh v6.8h, v1.8h, XFIX_2_613125930
862 add v3.8h, v1.8h, v1.8h
863 sub v1.8h, v5.8h, v1.8h
864 add v18.8h, v2.8h, v4.8h
865 sqdmulh v4.8h, v1.8h, XFIX_1_847759065
866 sub v2.8h, v23.8h, v21.8h
867 add v3.8h, v3.8h, v6.8h
868 sqdmulh v6.8h, v2.8h, XFIX_1_414213562
869 add v1.8h, v1.8h, v4.8h
870 sqdmulh v4.8h, v5.8h, XFIX_1_082392200
871 sub v18.8h, v18.8h, v22.8h
872 add v2.8h, v2.8h, v6.8h
873 sub v6.8h, v16.8h, v20.8h
874 add v20.8h, v16.8h, v20.8h
875 add v17.8h, v5.8h, v4.8h
876 add v5.8h, v6.8h, v18.8h
877 sub v18.8h, v6.8h, v18.8h
878 add v6.8h, v23.8h, v21.8h
879 add v16.8h, v20.8h, v22.8h
880 sub v3.8h, v6.8h, v3.8h
881 sub v20.8h, v20.8h, v22.8h
882 sub v3.8h, v3.8h, v1.8h
883 sub v1.8h, v17.8h, v1.8h
884 add v2.8h, v3.8h, v2.8h
885 sub v23.8h, v16.8h, v6.8h
886 add v1.8h, v1.8h, v2.8h
887 add v16.8h, v16.8h, v6.8h
888 add v22.8h, v5.8h, v3.8h
889 sub v17.8h, v5.8h, v3.8h
890 sub v21.8h, v18.8h, v2.8h
891 add v18.8h, v18.8h, v2.8h
892 sub v19.8h, v20.8h, v1.8h
893 add v20.8h, v20.8h, v1.8h
894 transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v28, v29, v30, v31
895 /* 1-D IDCT, pass 2 */
896 sub v2.8h, v18.8h, v22.8h
897 add v22.8h, v18.8h, v22.8h
898 sub v1.8h, v19.8h, v21.8h
899 add v21.8h, v19.8h, v21.8h
900 sub v5.8h, v17.8h, v23.8h
901 add v23.8h, v17.8h, v23.8h
902 sqdmulh v4.8h, v2.8h, XFIX_1_414213562
903 sqdmulh v6.8h, v1.8h, XFIX_2_613125930
904 add v3.8h, v1.8h, v1.8h
905 sub v1.8h, v5.8h, v1.8h
906 add v18.8h, v2.8h, v4.8h
907 sqdmulh v4.8h, v1.8h, XFIX_1_847759065
908 sub v2.8h, v23.8h, v21.8h
909 add v3.8h, v3.8h, v6.8h
910 sqdmulh v6.8h, v2.8h, XFIX_1_414213562
911 add v1.8h, v1.8h, v4.8h
912 sqdmulh v4.8h, v5.8h, XFIX_1_082392200
913 sub v18.8h, v18.8h, v22.8h
914 add v2.8h, v2.8h, v6.8h
915 sub v6.8h, v16.8h, v20.8h
916 add v20.8h, v16.8h, v20.8h
917 add v17.8h, v5.8h, v4.8h
918 add v5.8h, v6.8h, v18.8h
919 sub v18.8h, v6.8h, v18.8h
920 add v6.8h, v23.8h, v21.8h
921 add v16.8h, v20.8h, v22.8h
922 sub v3.8h, v6.8h, v3.8h
923 sub v20.8h, v20.8h, v22.8h
924 sub v3.8h, v3.8h, v1.8h
925 sub v1.8h, v17.8h, v1.8h
926 add v2.8h, v3.8h, v2.8h
927 sub v23.8h, v16.8h, v6.8h
928 add v1.8h, v1.8h, v2.8h
929 add v16.8h, v16.8h, v6.8h
930 add v22.8h, v5.8h, v3.8h
931 sub v17.8h, v5.8h, v3.8h
932 sub v21.8h, v18.8h, v2.8h
933 add v18.8h, v18.8h, v2.8h
934 sub v19.8h, v20.8h, v1.8h
935 add v20.8h, v20.8h, v1.8h
936 /* Descale to 8-bit and range limit */
938 /* Prepare pointers (dual-issue with NEON instructions) */
939 ldp TMP1, TMP2, [OUTPUT_BUF], 16
940 sqshrn v28.8b, v16.8h, #5
941 ldp TMP3, TMP4, [OUTPUT_BUF], 16
942 sqshrn v29.8b, v17.8h, #5
943 add TMP1, TMP1, OUTPUT_COL
944 sqshrn v30.8b, v18.8h, #5
945 add TMP2, TMP2, OUTPUT_COL
946 sqshrn v31.8b, v19.8h, #5
947 add TMP3, TMP3, OUTPUT_COL
948 sqshrn2 v28.16b, v20.8h, #5
949 add TMP4, TMP4, OUTPUT_COL
950 sqshrn2 v29.16b, v21.8h, #5
951 ldp TMP5, TMP6, [OUTPUT_BUF], 16
952 sqshrn2 v30.16b, v22.8h, #5
953 ldp TMP7, TMP8, [OUTPUT_BUF], 16
954 sqshrn2 v31.16b, v23.8h, #5
955 add TMP5, TMP5, OUTPUT_COL
956 add v16.16b, v28.16b, v0.16b
957 add TMP6, TMP6, OUTPUT_COL
958 add v18.16b, v29.16b, v0.16b
959 add TMP7, TMP7, OUTPUT_COL
960 add v20.16b, v30.16b, v0.16b
961 add TMP8, TMP8, OUTPUT_COL
962 add v22.16b, v31.16b, v0.16b
964 /* Transpose the final 8-bit samples */
965 trn1 v28.16b, v16.16b, v18.16b
966 trn1 v30.16b, v20.16b, v22.16b
967 trn2 v29.16b, v16.16b, v18.16b
968 trn2 v31.16b, v20.16b, v22.16b
970 trn1 v16.8h, v28.8h, v30.8h
971 trn2 v18.8h, v28.8h, v30.8h
972 trn1 v20.8h, v29.8h, v31.8h
973 trn2 v22.8h, v29.8h, v31.8h
975 uzp1 v28.4s, v16.4s, v18.4s
976 uzp2 v30.4s, v16.4s, v18.4s
977 uzp1 v29.4s, v20.4s, v22.4s
978 uzp2 v31.4s, v20.4s, v22.4s
980 /* Store results to the output buffer */
981 st1 {v28.d}[0], [TMP1]
982 st1 {v29.d}[0], [TMP2]
983 st1 {v28.d}[1], [TMP3]
984 st1 {v29.d}[1], [TMP4]
985 st1 {v30.d}[0], [TMP5]
986 st1 {v31.d}[0], [TMP6]
987 st1 {v30.d}[1], [TMP7]
988 st1 {v31.d}[1], [TMP8]
1005 /*****************************************************************************/
1008 * jsimd_idct_4x4_neon
1010 * This function contains inverse-DCT code for getting reduced-size
1011 * 4x4 pixel output from an 8x8 DCT block. It uses the same calculations
1012 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
1013 * function from jpeg-6b (jidctred.c).
1015 * NOTE: jpeg-8 has an improved implementation of the 4x4 inverse DCT, which
1016 * requires far fewer arithmetic operations and hence should be faster.
1017 * The primary purpose of this particular NEON-optimized function is
1018 * bit-exact compatibility with jpeg-6b.
1020 * TODO: slightly better instruction scheduling could be achieved by
1021 * expanding the idct_helper/transpose_4x4 macros and reordering
1022 * instructions, but readability would suffer somewhat.
1025 #define CONST_BITS 13
1027 #define FIX_0_211164243 (1730) /* FIX(0.211164243) */
1028 #define FIX_0_509795579 (4176) /* FIX(0.509795579) */
1029 #define FIX_0_601344887 (4926) /* FIX(0.601344887) */
1030 #define FIX_0_720959822 (5906) /* FIX(0.720959822) */
1031 #define FIX_0_765366865 (6270) /* FIX(0.765366865) */
1032 #define FIX_0_850430095 (6967) /* FIX(0.850430095) */
1033 #define FIX_0_899976223 (7373) /* FIX(0.899976223) */
1034 #define FIX_1_061594337 (8697) /* FIX(1.061594337) */
1035 #define FIX_1_272758580 (10426) /* FIX(1.272758580) */
1036 #define FIX_1_451774981 (11893) /* FIX(1.451774981) */
1037 #define FIX_1_847759065 (15137) /* FIX(1.847759065) */
1038 #define FIX_2_172734803 (17799) /* FIX(2.172734803) */
1039 #define FIX_2_562915447 (20995) /* FIX(2.562915447) */
1040 #define FIX_3_624509785 (29692) /* FIX(3.624509785) */
1043 Ljsimd_idct_4x4_neon_consts:
1044 .short FIX_1_847759065 /* v0.h[0] */
1045 .short -FIX_0_765366865 /* v0.h[1] */
1046 .short -FIX_0_211164243 /* v0.h[2] */
1047 .short FIX_1_451774981 /* v0.h[3] */
1048 .short -FIX_2_172734803 /* v1.h[0] */
1049 .short FIX_1_061594337 /* v1.h[1] */
1050 .short -FIX_0_509795579 /* v1.h[2] */
1051 .short -FIX_0_601344887 /* v1.h[3] */
1052 .short FIX_0_899976223 /* v2.h[0] */
1053 .short FIX_2_562915447 /* v2.h[1] */
1054 .short 1 << (CONST_BITS+1) /* v2.h[2] */
1055 .short 0 /* v2.h[3] */
1057 .macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
1058 smull v28.4s, \x4, v2.h[2]
1059 smlal v28.4s, \x8, v0.h[0]
1060 smlal v28.4s, \x14, v0.h[1]
1062 smull v26.4s, \x16, v1.h[2]
1063 smlal v26.4s, \x12, v1.h[3]
1064 smlal v26.4s, \x10, v2.h[0]
1065 smlal v26.4s, \x6, v2.h[1]
1067 smull v30.4s, \x4, v2.h[2]
1068 smlsl v30.4s, \x8, v0.h[0]
1069 smlsl v30.4s, \x14, v0.h[1]
1071 smull v24.4s, \x16, v0.h[2]
1072 smlal v24.4s, \x12, v0.h[3]
1073 smlal v24.4s, \x10, v1.h[0]
1074 smlal v24.4s, \x6, v1.h[1]
1076 add v20.4s, v28.4s, v26.4s
1077 sub v28.4s, v28.4s, v26.4s
1080 srshr v20.4s, v20.4s, #\shift
1081 srshr v28.4s, v28.4s, #\shift
1085 rshrn \y26, v20.4s, #\shift
1086 rshrn \y29, v28.4s, #\shift
1089 add v20.4s, v30.4s, v24.4s
1090 sub v30.4s, v30.4s, v24.4s
1093 srshr v20.4s, v20.4s, #\shift
1094 srshr v30.4s, v30.4s, #\shift
1098 rshrn \y27, v20.4s, #\shift
1099 rshrn \y28, v30.4s, #\shift
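/* For reference, a scalar C sketch of what idct_helper computes for one
 * column (paraphrased from the macro above; the function name is made up
 * and the code is not part of the build).  The inputs are the dequantized
 * coefficients of rows 0, 1, 2, 3, 5, 6 and 7 (row 4 is not used by the
 * reduced 4x4 IDCT); the 1 << 14 factor is row 0 shifted left by
 * CONST_BITS+1, matching the v2.h[2] constant above:
 *
 *   #include <stdint.h>
 *
 *   static void idct_helper_ref(int16_t r0, int16_t r1, int16_t r2,
 *                               int16_t r3, int16_t r5, int16_t r6,
 *                               int16_t r7, int shift, int16_t out[4])
 *   {
 *     int32_t even_p = r0 * (1 << 14) + r2 * 15137 - r6 * 6270;
 *     int32_t even_m = r0 * (1 << 14) - r2 * 15137 + r6 * 6270;
 *     int32_t odd_p  = -4176 * r7 - 4926 * r5 + 7373 * r3 + 20995 * r1;
 *     int32_t odd_m  = -1730 * r7 + 11893 * r5 - 17799 * r3 + 8697 * r1;
 *     int32_t round  = 1 << (shift - 1);
 *     out[0] = (int16_t)((even_p + odd_p + round) >> shift);
 *     out[1] = (int16_t)((even_m + odd_m + round) >> shift);
 *     out[2] = (int16_t)((even_m - odd_m + round) >> shift);
 *     out[3] = (int16_t)((even_p - odd_p + round) >> shift);
 *   }
 */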
1103 asm_function jsimd_idct_4x4_neon
1114 /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
1115 guarantee that the upper (unused) 32 bits of x3 are valid. This
1116 instruction ensures that those bits are set to zero. */
1119 /* Save all used NEON registers */
1122 /* Load constants (v3.4h is just used for padding) */
1123 adr TMP4, Ljsimd_idct_4x4_neon_consts
1124 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
1125 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
1126 ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]
1128 /* Load all COEF_BLOCK into NEON registers with the following allocation:
1130 * ---------+--------
1134 * 3 | v10.4h | v11.4h
1136 * 5 | v12.4h | v13.4h
1137 * 6 | v14.4h | v15.4h
1138 * 7 | v16.4h | v17.4h
1140 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
1141 ld1 {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32
1142 add COEF_BLOCK, COEF_BLOCK, #16
1143 ld1 {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32
1144 ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16
1146 ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
1147 mul v4.4h, v4.4h, v18.4h
1148 mul v5.4h, v5.4h, v19.4h
1149 ins v4.d[1], v5.d[0] /* 128 bit q4 */
1150 ld1 {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32
1151 mul v6.4h, v6.4h, v20.4h
1152 mul v7.4h, v7.4h, v21.4h
1153 ins v6.d[1], v7.d[0] /* 128 bit q6 */
1154 mul v8.4h, v8.4h, v22.4h
1155 mul v9.4h, v9.4h, v23.4h
1156 ins v8.d[1], v9.d[0] /* 128 bit q8 */
1157 add DCT_TABLE, DCT_TABLE, #16
1158 ld1 {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32
1159 mul v10.4h, v10.4h, v24.4h
1160 mul v11.4h, v11.4h, v25.4h
1161 ins v10.d[1], v11.d[0] /* 128 bit q10 */
1162 mul v12.4h, v12.4h, v26.4h
1163 mul v13.4h, v13.4h, v27.4h
1164 ins v12.d[1], v13.d[0] /* 128 bit q12 */
1165 ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
1166 mul v14.4h, v14.4h, v28.4h
1167 mul v15.4h, v15.4h, v29.4h
1168 ins v14.d[1], v15.d[0] /* 128 bit q14 */
1169 mul v16.4h, v16.4h, v30.4h
1170 mul v17.4h, v17.4h, v31.4h
1171 ins v16.d[1], v17.d[0] /* 128 bit q16 */
1174 idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, \
1175 v4.4h, v6.4h, v8.4h, v10.4h
1176 transpose_4x4 v4, v6, v8, v10, v3
1177 ins v10.d[1], v11.d[0]
1178 idct_helper v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, \
1179 v5.4h, v7.4h, v9.4h, v11.4h
1180 transpose_4x4 v5, v7, v9, v11, v3
1181 ins v10.d[1], v11.d[0]
1184 idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, \
1185 v26.4h, v27.4h, v28.4h, v29.4h
1186 transpose_4x4 v26, v27, v28, v29, v3
1190 ins v26.d[1], v27.d[0]
1191 ins v28.d[1], v29.d[0]
1192 add v26.8h, v26.8h, v30.8h
1193 add v28.8h, v28.8h, v30.8h
1194 sqxtun v26.8b, v26.8h
1195 sqxtun v27.8b, v28.8h
1197 /* Store results to the output buffer */
1198 ldp TMP1, TMP2, [OUTPUT_BUF], 16
1199 ldp TMP3, TMP4, [OUTPUT_BUF]
1200 add TMP1, TMP1, OUTPUT_COL
1201 add TMP2, TMP2, OUTPUT_COL
1202 add TMP3, TMP3, OUTPUT_COL
1203 add TMP4, TMP4, OUTPUT_COL
1205 #if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
1206 /* We can use far fewer instructions on little-endian systems if the
1207 * OS kernel is not configured to trap unaligned memory accesses
1209 st1 {v26.s}[0], [TMP1], 4
1210 st1 {v27.s}[0], [TMP3], 4
1211 st1 {v26.s}[1], [TMP2], 4
1212 st1 {v27.s}[1], [TMP4], 4
1214 st1 {v26.b}[0], [TMP1], 1
1215 st1 {v27.b}[0], [TMP3], 1
1216 st1 {v26.b}[1], [TMP1], 1
1217 st1 {v27.b}[1], [TMP3], 1
1218 st1 {v26.b}[2], [TMP1], 1
1219 st1 {v27.b}[2], [TMP3], 1
1220 st1 {v26.b}[3], [TMP1], 1
1221 st1 {v27.b}[3], [TMP3], 1
1223 st1 {v26.b}[4], [TMP2], 1
1224 st1 {v27.b}[4], [TMP4], 1
1225 st1 {v26.b}[5], [TMP2], 1
1226 st1 {v27.b}[5], [TMP4], 1
1227 st1 {v26.b}[6], [TMP2], 1
1228 st1 {v27.b}[6], [TMP4], 1
1229 st1 {v26.b}[7], [TMP2], 1
1230 st1 {v27.b}[7], [TMP4], 1
1233 /* vpop {v8.4h - v15.4h} ; not available */
1234 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
1235 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
1250 /*****************************************************************************/
1253 * jsimd_idct_2x2_neon
1255 * This function contains inverse-DCT code for getting reduced-size
1256 * 2x2 pixel output from an 8x8 DCT block. It uses the same calculations
1257 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
1258 * function from jpeg-6b (jidctred.c).
1260 * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse DCT, which
1261 * requires far fewer arithmetic operations and hence should be faster.
1262 * The primary purpose of this particular NEON-optimized function is
1263 * bit-exact compatibility with jpeg-6b.
1267 Ljsimd_idct_2x2_neon_consts:
1268 .short -FIX_0_720959822 /* v14.h[0] */
1269 .short FIX_0_850430095 /* v14.h[1] */
1270 .short -FIX_1_272758580 /* v14.h[2] */
1271 .short FIX_3_624509785 /* v14.h[3] */
1273 .macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
1274 sshll v15.4s, \x4, #15
1275 smull v26.4s, \x6, v14.h[3]
1276 smlal v26.4s, \x10, v14.h[2]
1277 smlal v26.4s, \x12, v14.h[1]
1278 smlal v26.4s, \x16, v14.h[0]
1280 add v20.4s, v15.4s, v26.4s
1281 sub v15.4s, v15.4s, v26.4s
1284 srshr v20.4s, v20.4s, #\shift
1285 srshr v15.4s, v15.4s, #\shift
1289 rshrn \y26, v20.4s, #\shift
1290 rshrn \y27, v15.4s, #\shift
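/* In scalar terms, a C sketch of what this helper computes for one column
 * (paraphrased from the macro above; the function name is made up and the
 * code is not part of the build).  r0, r1, r3, r5 and r7 are the dequantized
 * coefficients passed in as x4, x6, x10, x12 and x16:
 *
 *   #include <stdint.h>
 *
 *   static void idct_2x2_col_ref(int16_t r0, int16_t r1, int16_t r3,
 *                                int16_t r5, int16_t r7, int shift,
 *                                int16_t out[2])
 *   {
 *     int32_t tmp10 = (int32_t)r0 << 15;
 *     int32_t tmp0  = 29692 * r1 - 10426 * r3 + 6967 * r5 - 5906 * r7;
 *     int32_t round = 1 << (shift - 1);
 *     out[0] = (int16_t)((tmp10 + tmp0 + round) >> shift);
 *     out[1] = (int16_t)((tmp10 - tmp0 + round) >> shift);
 *   }
 */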
1294 asm_function jsimd_idct_2x2_neon
1303 /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
1304 guarantee that the upper (unused) 32 bits of x3 are valid. This
1305 instruction ensures that those bits are set to zero. */
1308 /* vpush {v8.4h - v15.4h} ; not available */
1312 /* Load constants */
1313 adr TMP2, Ljsimd_idct_2x2_neon_consts
1314 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
1315 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
1316 ld1 {v14.4h}, [TMP2]
1318 /* Load all COEF_BLOCK into NEON registers with the following allocation:
1320 * ---------+--------
1324 * 3 | v10.4h | v11.4h
1326 * 5 | v12.4h | v13.4h
1328 * 7 | v16.4h | v17.4h
1330 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
1331 add COEF_BLOCK, COEF_BLOCK, #16
1332 ld1 {v10.4h, v11.4h}, [COEF_BLOCK], 16
1333 add COEF_BLOCK, COEF_BLOCK, #16
1334 ld1 {v12.4h, v13.4h}, [COEF_BLOCK], 16
1335 add COEF_BLOCK, COEF_BLOCK, #16
1336 ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16
1338 ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
1339 mul v4.4h, v4.4h, v18.4h
1340 mul v5.4h, v5.4h, v19.4h
1341 ins v4.d[1], v5.d[0]
1342 mul v6.4h, v6.4h, v20.4h
1343 mul v7.4h, v7.4h, v21.4h
1344 ins v6.d[1], v7.d[0]
1345 add DCT_TABLE, DCT_TABLE, #16
1346 ld1 {v24.4h, v25.4h}, [DCT_TABLE], 16
1347 mul v10.4h, v10.4h, v24.4h
1348 mul v11.4h, v11.4h, v25.4h
1349 ins v10.d[1], v11.d[0]
1350 add DCT_TABLE, DCT_TABLE, #16
1351 ld1 {v26.4h, v27.4h}, [DCT_TABLE], 16
1352 mul v12.4h, v12.4h, v26.4h
1353 mul v13.4h, v13.4h, v27.4h
1354 ins v12.d[1], v13.d[0]
1355 add DCT_TABLE, DCT_TABLE, #16
1356 ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
1357 mul v16.4h, v16.4h, v30.4h
1358 mul v17.4h, v17.4h, v31.4h
1359 ins v16.d[1], v17.d[0]
1363 idct_helper v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h
1364 transpose_4x4 v4.4h, v6.4h, v8.4h, v10.4h
1365 idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
1366 transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h
1368 smull v26.4s, v6.4h, v14.h[3]
1369 smlal v26.4s, v10.4h, v14.h[2]
1370 smlal v26.4s, v12.4h, v14.h[1]
1371 smlal v26.4s, v16.4h, v14.h[0]
1372 smull v24.4s, v7.4h, v14.h[3]
1373 smlal v24.4s, v11.4h, v14.h[2]
1374 smlal v24.4s, v13.4h, v14.h[1]
1375 smlal v24.4s, v17.4h, v14.h[0]
1376 sshll v15.4s, v4.4h, #15
1377 sshll v30.4s, v5.4h, #15
1378 add v20.4s, v15.4s, v26.4s
1379 sub v15.4s, v15.4s, v26.4s
1380 rshrn v4.4h, v20.4s, #13
1381 rshrn v6.4h, v15.4s, #13
1382 add v20.4s, v30.4s, v24.4s
1383 sub v15.4s, v30.4s, v24.4s
1384 rshrn v5.4h, v20.4s, #13
1385 rshrn v7.4h, v15.4s, #13
1386 ins v4.d[1], v5.d[0]
1387 ins v6.d[1], v7.d[0]
1388 transpose v4, v6, v3, .16b, .8h
1389 transpose v6, v10, v3, .16b, .4s
1390 ins v11.d[0], v10.d[1]
1391 ins v7.d[0], v6.d[1]
1395 idct_helper v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h
1399 ins v26.d[1], v27.d[0]
1400 add v26.8h, v26.8h, v30.8h
1401 sqxtun v30.8b, v26.8h
1402 ins v26.d[0], v30.d[0]
1403 sqxtun v27.8b, v26.8h
1405 /* Store results to the output buffer */
1406 ldp TMP1, TMP2, [OUTPUT_BUF]
1407 add TMP1, TMP1, OUTPUT_COL
1408 add TMP2, TMP2, OUTPUT_COL
1410 st1 {v26.b}[0], [TMP1], 1
1411 st1 {v27.b}[4], [TMP1], 1
1412 st1 {v26.b}[1], [TMP2], 1
1413 st1 {v27.b}[5], [TMP2], 1
1415 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
1416 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
1429 /*****************************************************************************/
1432 * jsimd_ycc_extrgb_convert_neon
1433 * jsimd_ycc_extbgr_convert_neon
1434 * jsimd_ycc_extrgbx_convert_neon
1435 * jsimd_ycc_extbgrx_convert_neon
1436 * jsimd_ycc_extxbgr_convert_neon
1437 * jsimd_ycc_extxrgb_convert_neon
1439 * Colorspace conversion YCbCr -> RGB
1447 prfm pldl1keep, [U, #64]
1448 prfm pldl1keep, [V, #64]
1449 prfm pldl1keep, [Y, #64]
1451 ld1 {v4.b}[0], [U], 1
1452 ld1 {v4.b}[1], [U], 1
1453 ld1 {v4.b}[2], [U], 1
1454 ld1 {v4.b}[3], [U], 1
1455 ld1 {v5.b}[0], [V], 1
1456 ld1 {v5.b}[1], [V], 1
1457 ld1 {v5.b}[2], [V], 1
1458 ld1 {v5.b}[3], [V], 1
1459 ld1 {v0.b}[0], [Y], 1
1460 ld1 {v0.b}[1], [Y], 1
1461 ld1 {v0.b}[2], [Y], 1
1462 ld1 {v0.b}[3], [Y], 1
1464 ld1 {v4.b}[4], [U], 1
1465 ld1 {v4.b}[5], [U], 1
1466 ld1 {v5.b}[4], [V], 1
1467 ld1 {v5.b}[5], [V], 1
1468 ld1 {v0.b}[4], [Y], 1
1469 ld1 {v0.b}[5], [Y], 1
1471 ld1 {v4.b}[6], [U], 1
1472 ld1 {v5.b}[6], [V], 1
1473 ld1 {v0.b}[6], [Y], 1
1475 .error unsupported macroblock size
1479 .macro do_store bpp, size, fast_st3
1483 st3 {v10.8b, v11.8b, v12.8b}, [RGB], 24
1485 st1 {v10.b}[0], [RGB], #1
1486 st1 {v11.b}[0], [RGB], #1
1487 st1 {v12.b}[0], [RGB], #1
1489 st1 {v10.b}[1], [RGB], #1
1490 st1 {v11.b}[1], [RGB], #1
1491 st1 {v12.b}[1], [RGB], #1
1493 st1 {v10.b}[2], [RGB], #1
1494 st1 {v11.b}[2], [RGB], #1
1495 st1 {v12.b}[2], [RGB], #1
1497 st1 {v10.b}[3], [RGB], #1
1498 st1 {v11.b}[3], [RGB], #1
1499 st1 {v12.b}[3], [RGB], #1
1501 st1 {v10.b}[4], [RGB], #1
1502 st1 {v11.b}[4], [RGB], #1
1503 st1 {v12.b}[4], [RGB], #1
1505 st1 {v10.b}[5], [RGB], #1
1506 st1 {v11.b}[5], [RGB], #1
1507 st1 {v12.b}[5], [RGB], #1
1509 st1 {v10.b}[6], [RGB], #1
1510 st1 {v11.b}[6], [RGB], #1
1511 st1 {v12.b}[6], [RGB], #1
1513 st1 {v10.b}[7], [RGB], #1
1514 st1 {v11.b}[7], [RGB], #1
1515 st1 {v12.b}[7], [RGB], #1
1518 st3 {v10.b, v11.b, v12.b}[0], [RGB], 3
1519 st3 {v10.b, v11.b, v12.b}[1], [RGB], 3
1520 st3 {v10.b, v11.b, v12.b}[2], [RGB], 3
1521 st3 {v10.b, v11.b, v12.b}[3], [RGB], 3
1523 st3 {v10.b, v11.b, v12.b}[4], [RGB], 3
1524 st3 {v10.b, v11.b, v12.b}[5], [RGB], 3
1526 st3 {v10.b, v11.b, v12.b}[6], [RGB], 3
1528 .error unsupported macroblock size
1532 st4 {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32
1534 st4 {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4
1535 st4 {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4
1536 st4 {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4
1537 st4 {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4
1539 st4 {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4
1540 st4 {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4
1542 st4 {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4
1544 .error unsupported macroblock size
1548 st1 {v25.8h}, [RGB], 16
1550 st1 {v25.4h}, [RGB], 8
1552 st1 {v25.h}[4], [RGB], 2
1553 st1 {v25.h}[5], [RGB], 2
1555 st1 {v25.h}[6], [RGB], 2
1557 .error unsupported macroblock size
1560 .error unsupported bpp
1564 .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, \
1565 g_offs, gsize, b_offs, bsize, \
1569 * 2-stage pipelined YCbCr->RGB conversion
1572 .macro do_yuv_to_rgb_stage1
1573 uaddw v6.8h, v2.8h, v4.8b /* v6.8h = u - 128 */
1574 uaddw v8.8h, v2.8h, v5.8b /* v8.8h = v - 128 */
1575 smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */
1576 smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */
1577 smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */
1578 smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */
1579 smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */
1580 smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */
1581 smull v28.4s, v6.4h, v1.h[3] /* multiply by 29033 */
1582 smull2 v30.4s, v6.8h, v1.h[3] /* multiply by 29033 */
1585 .macro do_yuv_to_rgb_stage2
1586 rshrn v20.4h, v20.4s, #15
1587 rshrn2 v20.8h, v22.4s, #15
1588 rshrn v24.4h, v24.4s, #14
1589 rshrn2 v24.8h, v26.4s, #14
1590 rshrn v28.4h, v28.4s, #14
1591 rshrn2 v28.8h, v30.4s, #14
1592 uaddw v20.8h, v20.8h, v0.8b
1593 uaddw v24.8h, v24.8h, v0.8b
1594 uaddw v28.8h, v28.8h, v0.8b
1596 sqxtun v1\g_offs\defsize, v20.8h
1597 sqxtun v1\r_offs\defsize, v24.8h
1598 sqxtun v1\b_offs\defsize, v28.8h
1600 sqshlu v21.8h, v20.8h, #8
1601 sqshlu v25.8h, v24.8h, #8
1602 sqshlu v29.8h, v28.8h, #8
1603 sri v25.8h, v21.8h, #5
1604 sri v25.8h, v29.8h, #11
1608 .macro do_yuv_to_rgb_stage2_store_load_stage1 fast_st3
1609 rshrn v20.4h, v20.4s, #15
1610 rshrn v24.4h, v24.4s, #14
1611 rshrn v28.4h, v28.4s, #14
1613 rshrn2 v20.8h, v22.4s, #15
1614 rshrn2 v24.8h, v26.4s, #14
1615 rshrn2 v28.8h, v30.4s, #14
1617 uaddw v20.8h, v20.8h, v0.8b
1618 uaddw v24.8h, v24.8h, v0.8b
1619 uaddw v28.8h, v28.8h, v0.8b
1620 .if \bpp != 16 /**************** rgb24/rgb32 ******************************/
1621 sqxtun v1\g_offs\defsize, v20.8h
1623 sqxtun v1\r_offs\defsize, v24.8h
1624 prfm pldl1keep, [U, #64]
1625 prfm pldl1keep, [V, #64]
1626 prfm pldl1keep, [Y, #64]
1627 sqxtun v1\b_offs\defsize, v28.8h
1628 uaddw v6.8h, v2.8h, v4.8b /* v6.8h = u - 128 */
1629 uaddw v8.8h, v2.8h, v5.8b /* v8.8h = v - 128 */
1630 smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */
1631 smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */
1632 smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */
1633 smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */
1634 smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */
1635 smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */
1636 .else /**************************** rgb565 ********************************/
1637 sqshlu v21.8h, v20.8h, #8
1638 sqshlu v25.8h, v24.8h, #8
1639 sqshlu v29.8h, v28.8h, #8
1640 uaddw v6.8h, v2.8h, v4.8b /* v6.8h = u - 128 */
1641 uaddw v8.8h, v2.8h, v5.8b /* v8.8h = v - 128 */
1643 smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */
1644 smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */
1645 smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */
1646 smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */
1647 sri v25.8h, v21.8h, #5
1648 smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */
1649 smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */
1650 prfm pldl1keep, [U, #64]
1651 prfm pldl1keep, [V, #64]
1652 prfm pldl1keep, [Y, #64]
1653 sri v25.8h, v29.8h, #11
1655 do_store \bpp, 8, \fast_st3
1656 smull v28.4s, v6.4h, v1.h[3] /* multiply by 29033 */
1657 smull2 v30.4s, v6.8h, v1.h[3] /* multiply by 29033 */
1660 .macro do_yuv_to_rgb
1661 do_yuv_to_rgb_stage1
1662 do_yuv_to_rgb_stage2
1665 /* Apple gas crashes on adrl, work around that by using adr.
1666 * But this requires a copy of these constants for each function.
1671 Ljsimd_ycc_\colorid\()_neon_consts:
1673 Ljsimd_ycc_\colorid\()_neon_slowst3_consts:
1676 .short 22971, -11277, -23401, 29033
1677 .short -128, -128, -128, -128
1678 .short -128, -128, -128, -128
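/* For reference, a rough scalar C sketch of the fixed-point arithmetic that
 * the do_yuv_to_rgb stages below vectorize with these constants (rounding
 * matches the rshrn #14/#15 descales; clamp255 stands in for the sqxtun
 * saturation and is illustrative only):
 *
 *   int cb = u - 128, cr = v - 128;
 *   int r  = clamp255(y + ((22971 * cr + 8192) >> 14));                 // ~1.40200 Cr
 *   int g  = clamp255(y + ((-11277 * cb - 23401 * cr + 16384) >> 15));  // ~-0.34414 Cb - 0.71414 Cr
 *   int b  = clamp255(y + ((29033 * cb + 8192) >> 14));                 // ~1.77200 Cb
 */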
1681 asm_function jsimd_ycc_\colorid\()_convert_neon
1683 asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
1685 OUTPUT_WIDTH .req w0
1704 /* Load constants into v1.4h and v2.8h (v0.4h is just used for padding) */
1706 adr x15, Ljsimd_ycc_\colorid\()_neon_consts
1708 adr x15, Ljsimd_ycc_\colorid\()_neon_slowst3_consts
1711 /* Save NEON registers */
1712 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
1713 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
1714 ld1 {v0.4h, v1.4h}, [x15], 16
1717 ldr INPUT_BUF0, [INPUT_BUF]
1718 ldr INPUT_BUF1, [INPUT_BUF, #8]
1719 ldr INPUT_BUF2, [INPUT_BUF, #16]
1722 /* Initially set v10, v11.4h, v12.8b, v13.8b to 0xFF */
1726 /* Outer loop over scanlines */
1730 ldr Y, [INPUT_BUF0, INPUT_ROW, uxtw #3]
1731 ldr U, [INPUT_BUF1, INPUT_ROW, uxtw #3]
1733 ldr V, [INPUT_BUF2, INPUT_ROW, uxtw #3]
1734 add INPUT_ROW, INPUT_ROW, #1
1735 ldr RGB, [OUTPUT_BUF], #8
1737 /* Inner loop over pixels */
1741 do_yuv_to_rgb_stage1
1745 do_yuv_to_rgb_stage2_store_load_stage1 \fast_st3
1749 do_yuv_to_rgb_stage2
1750 do_store \bpp, 8, \fast_st3
1769 do_store \bpp, 4, \fast_st3
1773 do_store \bpp, 2, \fast_st3
1777 do_store \bpp, 1, \fast_st3
1779 subs NUM_ROWS, NUM_ROWS, #1
1782 /* Restore all registers and return */
1783 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
1784 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
1799 .purgem do_yuv_to_rgb
1800 .purgem do_yuv_to_rgb_stage1
1801 .purgem do_yuv_to_rgb_stage2
1802 .purgem do_yuv_to_rgb_stage2_store_load_stage1
1806 /*--------------------------------- id ----- bpp R rsize G gsize B bsize defsize fast_st3 */
1807 generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b, 1
1808 generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b, 1
1809 generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h, .8b, 1
1810 generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h, .8b, 1
1811 generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h, .8b, 1
1812 generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h, .8b, 1
1813 generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, .4h, 0, .4h, 0, .4h, .8b, 1
1815 generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b, 0
1816 generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b, 0
1822 /*****************************************************************************/
1825 * jsimd_extrgb_ycc_convert_neon
1826 * jsimd_extbgr_ycc_convert_neon
1827 * jsimd_extrgbx_ycc_convert_neon
1828 * jsimd_extbgrx_ycc_convert_neon
1829 * jsimd_extxbgr_ycc_convert_neon
1830 * jsimd_extxrgb_ycc_convert_neon
1832 * Colorspace conversion RGB -> YCbCr
1835 .macro do_store size
1837 st1 {v20.8b}, [Y], #8
1838 st1 {v21.8b}, [U], #8
1839 st1 {v22.8b}, [V], #8
1841 st1 {v20.b}[0], [Y], #1
1842 st1 {v20.b}[1], [Y], #1
1843 st1 {v20.b}[2], [Y], #1
1844 st1 {v20.b}[3], [Y], #1
1845 st1 {v21.b}[0], [U], #1
1846 st1 {v21.b}[1], [U], #1
1847 st1 {v21.b}[2], [U], #1
1848 st1 {v21.b}[3], [U], #1
1849 st1 {v22.b}[0], [V], #1
1850 st1 {v22.b}[1], [V], #1
1851 st1 {v22.b}[2], [V], #1
1852 st1 {v22.b}[3], [V], #1
1854 st1 {v20.b}[4], [Y], #1
1855 st1 {v20.b}[5], [Y], #1
1856 st1 {v21.b}[4], [U], #1
1857 st1 {v21.b}[5], [U], #1
1858 st1 {v22.b}[4], [V], #1
1859 st1 {v22.b}[5], [V], #1
1861 st1 {v20.b}[6], [Y], #1
1862 st1 {v21.b}[6], [U], #1
1863 st1 {v22.b}[6], [V], #1
1865 .error unsupported macroblock size
1869 .macro do_load bpp, size, fast_ld3
1873 ld3 {v10.8b, v11.8b, v12.8b}, [RGB], #24
1875 ld1 {v10.b}[0], [RGB], #1
1876 ld1 {v11.b}[0], [RGB], #1
1877 ld1 {v12.b}[0], [RGB], #1
1879 ld1 {v10.b}[1], [RGB], #1
1880 ld1 {v11.b}[1], [RGB], #1
1881 ld1 {v12.b}[1], [RGB], #1
1883 ld1 {v10.b}[2], [RGB], #1
1884 ld1 {v11.b}[2], [RGB], #1
1885 ld1 {v12.b}[2], [RGB], #1
1887 ld1 {v10.b}[3], [RGB], #1
1888 ld1 {v11.b}[3], [RGB], #1
1889 ld1 {v12.b}[3], [RGB], #1
1891 ld1 {v10.b}[4], [RGB], #1
1892 ld1 {v11.b}[4], [RGB], #1
1893 ld1 {v12.b}[4], [RGB], #1
1895 ld1 {v10.b}[5], [RGB], #1
1896 ld1 {v11.b}[5], [RGB], #1
1897 ld1 {v12.b}[5], [RGB], #1
1899 ld1 {v10.b}[6], [RGB], #1
1900 ld1 {v11.b}[6], [RGB], #1
1901 ld1 {v12.b}[6], [RGB], #1
1903 ld1 {v10.b}[7], [RGB], #1
1904 ld1 {v11.b}[7], [RGB], #1
1905 ld1 {v12.b}[7], [RGB], #1
1907 prfm pldl1keep, [RGB, #128]
1909 ld3 {v10.b, v11.b, v12.b}[0], [RGB], #3
1910 ld3 {v10.b, v11.b, v12.b}[1], [RGB], #3
1911 ld3 {v10.b, v11.b, v12.b}[2], [RGB], #3
1912 ld3 {v10.b, v11.b, v12.b}[3], [RGB], #3
1914 ld3 {v10.b, v11.b, v12.b}[4], [RGB], #3
1915 ld3 {v10.b, v11.b, v12.b}[5], [RGB], #3
1917 ld3 {v10.b, v11.b, v12.b}[6], [RGB], #3
1919 .error unsupported macroblock size
1923 ld4 {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], #32
1924 prfm pldl1keep, [RGB, #128]
1926 ld4 {v10.b, v11.b, v12.b, v13.b}[0], [RGB], #4
1927 ld4 {v10.b, v11.b, v12.b, v13.b}[1], [RGB], #4
1928 ld4 {v10.b, v11.b, v12.b, v13.b}[2], [RGB], #4
1929 ld4 {v10.b, v11.b, v12.b, v13.b}[3], [RGB], #4
1931 ld4 {v10.b, v11.b, v12.b, v13.b}[4], [RGB], #4
1932 ld4 {v10.b, v11.b, v12.b, v13.b}[5], [RGB], #4
1934 ld4 {v10.b, v11.b, v12.b, v13.b}[6], [RGB], #4
1936 .error unsupported macroblock size
1939 .error unsupported bpp
1943 .macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, \
1947 * 2-stage pipelined RGB->YCbCr conversion
1950 .macro do_rgb_to_yuv_stage1
1951 ushll v4.8h, v1\r_offs\().8b, #0 /* r = v4 */
1952 ushll v6.8h, v1\g_offs\().8b, #0 /* g = v6 */
1953 ushll v8.8h, v1\b_offs\().8b, #0 /* b = v8 */
1958 umull v14.4s, v4.4h, v0.h[0]
1959 umull2 v16.4s, v4.8h, v0.h[0]
1960 umlsl v18.4s, v4.4h, v0.h[3]
1961 umlsl2 v26.4s, v4.8h, v0.h[3]
1962 umlal v28.4s, v4.4h, v0.h[5]
1963 umlal2 v30.4s, v4.8h, v0.h[5]
1964 umlal v14.4s, v6.4h, v0.h[1]
1965 umlal2 v16.4s, v6.8h, v0.h[1]
1966 umlsl v18.4s, v6.4h, v0.h[4]
1967 umlsl2 v26.4s, v6.8h, v0.h[4]
1968 umlsl v28.4s, v6.4h, v0.h[6]
1969 umlsl2 v30.4s, v6.8h, v0.h[6]
1970 umlal v14.4s, v8.4h, v0.h[2]
1971 umlal2 v16.4s, v8.8h, v0.h[2]
1972 umlal v18.4s, v8.4h, v0.h[5]
1973 umlal2 v26.4s, v8.8h, v0.h[5]
1974 umlsl v28.4s, v8.4h, v0.h[7]
1975 umlsl2 v30.4s, v8.8h, v0.h[7]
1978 .macro do_rgb_to_yuv_stage2
1979 rshrn v20.4h, v14.4s, #16
1980 shrn v22.4h, v18.4s, #16
1981 shrn v24.4h, v28.4s, #16
1982 rshrn2 v20.8h, v16.4s, #16
1983 shrn2 v22.8h, v26.4s, #16
1984 shrn2 v24.8h, v30.4s, #16
1985 xtn v20.8b, v20.8h /* v20 = y */
1986 xtn v21.8b, v22.8h /* v21 = u */
1987 xtn v22.8b, v24.8h /* v22 = v */
1990 .macro do_rgb_to_yuv
1991 do_rgb_to_yuv_stage1
1992 do_rgb_to_yuv_stage2
1995 /* TODO: expand macros and interleave instructions if some in-order
1996 * ARM64 processor actually can dual-issue LOAD/STORE with ALU */
1997 .macro do_rgb_to_yuv_stage2_store_load_stage1 fast_ld3
1998 do_rgb_to_yuv_stage2
1999 do_load \bpp, 8, \fast_ld3
2000 st1 {v20.8b}, [Y], #8
2001 st1 {v21.8b}, [U], #8
2002 st1 {v22.8b}, [V], #8
2003 do_rgb_to_yuv_stage1
2008 Ljsimd_\colorid\()_ycc_neon_consts:
2010 Ljsimd_\colorid\()_ycc_neon_slowld3_consts:
2012 .short 19595, 38470, 7471, 11059
2013 .short 21709, 32768, 27439, 5329
2014 .short 32767, 128, 32767, 128
2015 .short 32767, 128, 32767, 128
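/* For reference, a rough scalar C sketch of what the do_rgb_to_yuv stages
 * below compute with these 16-bit fixed-point constants. The Y descale uses
 * a rounding shift (rshrn #16); the sketch assumes the Cb/Cr accumulators
 * are pre-loaded with the usual ((128 << 16) + 32767) bias (that setup is on
 * lines not shown here), so a plain truncating shift (shrn #16) suffices:
 *
 *   y  = (19595 * r + 38470 * g +  7471 * b + 32768) >> 16;
 *   cb = ((128 << 16) + 32767 - 11059 * r - 21709 * g + 32768 * b) >> 16;
 *   cr = ((128 << 16) + 32767 + 32768 * r - 27439 * g -  5329 * b) >> 16;
 */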
2018 asm_function jsimd_\colorid\()_ycc_convert_neon
2020 asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
2022 OUTPUT_WIDTH .req w0
2030 OUTPUT_BUF2 .req x2 /* OUTPUT_BUF */
2038 /* Load constants into v0.8h, v1.8h */
2040 adr x13, Ljsimd_\colorid\()_ycc_neon_consts
2042 adr x13, Ljsimd_\colorid\()_ycc_neon_slowld3_consts
2044 ld1 {v0.8h, v1.8h}, [x13]
2046 ldr OUTPUT_BUF0, [OUTPUT_BUF]
2047 ldr OUTPUT_BUF1, [OUTPUT_BUF, #8]
2048 ldr OUTPUT_BUF2, [OUTPUT_BUF, #16]
2051 /* Save NEON registers */
2054 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
2055 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
2057 /* Outer loop over scanlines */
2061 ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, uxtw #3]
2062 ldr U, [OUTPUT_BUF1, OUTPUT_ROW, uxtw #3]
2064 ldr V, [OUTPUT_BUF2, OUTPUT_ROW, uxtw #3]
2065 add OUTPUT_ROW, OUTPUT_ROW, #1
2066 ldr RGB, [INPUT_BUF], #8
2068 /* Inner loop over pixels */
2071 do_load \bpp, 8, \fast_ld3
2072 do_rgb_to_yuv_stage1
2076 do_rgb_to_yuv_stage2_store_load_stage1 \fast_ld3
2080 do_rgb_to_yuv_stage2
2086 do_load \bpp, 4, \fast_ld3
2089 do_load \bpp, 2, \fast_ld3
2092 do_load \bpp, 1, \fast_ld3
2104 subs NUM_ROWS, NUM_ROWS, #1
2107 /* Restore all registers and return */
2108 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
2109 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
2125 .purgem do_rgb_to_yuv
2126 .purgem do_rgb_to_yuv_stage1
2127 .purgem do_rgb_to_yuv_stage2
2128 .purgem do_rgb_to_yuv_stage2_store_load_stage1
2132 /*--------------------------------- id ----- bpp R G B Fast LD3 */
2133 generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2, 1
2134 generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0, 1
2135 generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2, 1
2136 generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0, 1
2137 generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1, 1
2138 generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3, 1
2140 generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2, 0
2141 generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0, 0
2147 /*****************************************************************************/
2150 * Load data into workspace, applying unsigned->signed conversion
2152 * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
2153 * rid of the ST1 instructions
2156 asm_function jsimd_convsamp_neon
2170 /* START_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
2171 guarantee that the upper (unused) 32 bits of x1 are valid. This
2172 instruction ensures that those bits are set to zero. */
2176 ldp TMP1, TMP2, [SAMPLE_DATA], 16
2177 ldp TMP3, TMP4, [SAMPLE_DATA], 16
2179 add TMP1, TMP1, START_COL
2180 add TMP2, TMP2, START_COL
2181 ldp TMP5, TMP6, [SAMPLE_DATA], 16
2182 add TMP3, TMP3, START_COL
2183 add TMP4, TMP4, START_COL
2184 ldp TMP7, TMP8, [SAMPLE_DATA], 16
2185 add TMP5, TMP5, START_COL
2186 add TMP6, TMP6, START_COL
2187 ld1 {v16.8b}, [TMP1]
2188 add TMP7, TMP7, START_COL
2189 add TMP8, TMP8, START_COL
2190 ld1 {v17.8b}, [TMP2]
2191 usubl v16.8h, v16.8b, v0.8b
2192 ld1 {v18.8b}, [TMP3]
2193 usubl v17.8h, v17.8b, v0.8b
2194 ld1 {v19.8b}, [TMP4]
2195 usubl v18.8h, v18.8b, v0.8b
2196 ld1 {v20.8b}, [TMP5]
2197 usubl v19.8h, v19.8b, v0.8b
2198 ld1 {v21.8b}, [TMP6]
2199 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [WORKSPACE], 64
2200 usubl v20.8h, v20.8b, v0.8b
2201 ld1 {v22.8b}, [TMP7]
2202 usubl v21.8h, v21.8b, v0.8b
2203 ld1 {v23.8b}, [TMP8]
2204 usubl v22.8h, v22.8b, v0.8b
2205 usubl v23.8h, v23.8b, v0.8b
2206 st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [WORKSPACE], 64
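/* Scalar sketch of the block above: each sample is widened to 16 bits and
 * level-shifted by CENTERJSAMPLE (the value dup'ed into v0 on a line not
 * shown here) before the forward DCT:
 *
 *   workspace[i] = (DCTELEM)sample_row[start_col + i] - 128;   // CENTERJSAMPLE == 128
 */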
2223 /*****************************************************************************/
2226 * jsimd_fdct_islow_neon
2228 * This file contains a slow-but-accurate integer implementation of the
2229 * forward DCT (Discrete Cosine Transform). The following code is based
2230 * directly on the IJG's original jfdctint.c; see jfdctint.c for
2233 * TODO: can be combined with 'jsimd_convsamp_neon' to get
2234 * rid of a bunch of LD1 instructions
2237 #define CONST_BITS 13
2238 #define PASS1_BITS 2
2240 #define DESCALE_P1 (CONST_BITS-PASS1_BITS)
2241 #define DESCALE_P2 (CONST_BITS+PASS1_BITS)
2243 #define F_0_298 2446 /* FIX(0.298631336) */
2244 #define F_0_390 3196 /* FIX(0.390180644) */
2245 #define F_0_541 4433 /* FIX(0.541196100) */
2246 #define F_0_765 6270 /* FIX(0.765366865) */
2247 #define F_0_899 7373 /* FIX(0.899976223) */
2248 #define F_1_175 9633 /* FIX(1.175875602) */
2249 #define F_1_501 12299 /* FIX(1.501321110) */
2250 #define F_1_847 15137 /* FIX(1.847759065) */
2251 #define F_1_961 16069 /* FIX(1.961570560) */
2252 #define F_2_053 16819 /* FIX(2.053119869) */
2253 #define F_2_562 20995 /* FIX(2.562915447) */
2254 #define F_3_072 25172 /* FIX(3.072711026) */
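/* The constants above follow jfdctint.c's fixed-point convention; a sketch
 * of the scalar helpers they correspond to (CONST_BITS/PASS1_BITS as defined
 * above):
 *
 *   #define FIX(x)         ((INT32)((x) * (1 << CONST_BITS) + 0.5))  // e.g. FIX(0.541196100) == 4433
 *   #define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))         // rounding right shift
 *
 * so each MULTIPLY-then-DESCALE step of jfdctint.c maps onto the
 * smull/smlal + rshrn #DESCALE_P1 (or #DESCALE_P2) pairs below.
 */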
2257 Ljsimd_fdct_islow_neon_consts:
2270 .short 0 /* padding */
2287 #define XFIX_P_0_298 v0.h[0]
2288 #define XFIX_N_0_390 v0.h[1]
2289 #define XFIX_P_0_541 v0.h[2]
2290 #define XFIX_P_0_765 v0.h[3]
2291 #define XFIX_N_0_899 v0.h[4]
2292 #define XFIX_P_1_175 v0.h[5]
2293 #define XFIX_P_1_501 v0.h[6]
2294 #define XFIX_N_1_847 v0.h[7]
2295 #define XFIX_N_1_961 v1.h[0]
2296 #define XFIX_P_2_053 v1.h[1]
2297 #define XFIX_N_2_562 v1.h[2]
2298 #define XFIX_P_3_072 v1.h[3]
2300 asm_function jsimd_fdct_islow_neon
2305 /* Load constants */
2306 adr TMP, Ljsimd_fdct_islow_neon_consts
2307 ld1 {v0.8h, v1.8h}, [TMP]
2309 /* Save NEON registers */
2312 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], 32
2313 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], 32
2315 /* Load all DATA into NEON registers with the following allocation:
2317 * ---------+--------
2318 * 0 | d16 | d17 | v16.8h
2319 * 1 | d18 | d19 | v17.8h
2320 * 2 | d20 | d21 | v18.8h
2321 * 3 | d22 | d23 | v19.8h
2322 * 4 | d24 | d25 | v20.8h
2323 * 5 | d26 | d27 | v21.8h
2324 * 6 | d28 | d29 | v22.8h
2325 * 7 | d30 | d31 | v23.8h
2328 ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
2329 ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
2333 transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
2335 add v24.8h, v16.8h, v23.8h /* tmp0 = dataptr[0] + dataptr[7]; */
2336 sub v31.8h, v16.8h, v23.8h /* tmp7 = dataptr[0] - dataptr[7]; */
2337 add v25.8h, v17.8h, v22.8h /* tmp1 = dataptr[1] + dataptr[6]; */
2338 sub v30.8h, v17.8h, v22.8h /* tmp6 = dataptr[1] - dataptr[6]; */
2339 add v26.8h, v18.8h, v21.8h /* tmp2 = dataptr[2] + dataptr[5]; */
2340 sub v29.8h, v18.8h, v21.8h /* tmp5 = dataptr[2] - dataptr[5]; */
2341 add v27.8h, v19.8h, v20.8h /* tmp3 = dataptr[3] + dataptr[4]; */
2342 sub v28.8h, v19.8h, v20.8h /* tmp4 = dataptr[3] - dataptr[4]; */
2346 add v8.8h, v24.8h, v27.8h /* tmp10 = tmp0 + tmp3; */
2347 sub v9.8h, v24.8h, v27.8h /* tmp13 = tmp0 - tmp3; */
2348 add v10.8h, v25.8h, v26.8h /* tmp11 = tmp1 + tmp2; */
2349 sub v11.8h, v25.8h, v26.8h /* tmp12 = tmp1 - tmp2; */
2351 add v16.8h, v8.8h, v10.8h /* tmp10 + tmp11 */
2352 sub v20.8h, v8.8h, v10.8h /* tmp10 - tmp11 */
2354 add v18.8h, v11.8h, v9.8h /* tmp12 + tmp13 */
2356 shl v16.8h, v16.8h, #PASS1_BITS /* dataptr[0] = (DCTELEM) LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */
2357 shl v20.8h, v20.8h, #PASS1_BITS /* dataptr[4] = (DCTELEM) LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */
2359 smull2 v24.4s, v18.8h, XFIX_P_0_541 /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
2360 smull v18.4s, v18.4h, XFIX_P_0_541 /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
2361 mov v22.16b, v18.16b
2362 mov v25.16b, v24.16b
2364 smlal v18.4s, v9.4h, XFIX_P_0_765 /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
2365 smlal2 v24.4s, v9.8h, XFIX_P_0_765 /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
2366 smlal v22.4s, v11.4h, XFIX_N_1_847 /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
2367 smlal2 v25.4s, v11.8h, XFIX_N_1_847 /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
2369 rshrn v18.4h, v18.4s, #DESCALE_P1
2370 rshrn v22.4h, v22.4s, #DESCALE_P1
2371 rshrn2 v18.8h, v24.4s, #DESCALE_P1 /* dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
2372 rshrn2 v22.8h, v25.4s, #DESCALE_P1 /* dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
2376 add v8.8h, v28.8h, v31.8h /* z1 = tmp4 + tmp7; */
2377 add v9.8h, v29.8h, v30.8h /* z2 = tmp5 + tmp6; */
2378 add v10.8h, v28.8h, v30.8h /* z3 = tmp4 + tmp6; */
2379 add v11.8h, v29.8h, v31.8h /* z4 = tmp5 + tmp7; */
2380 smull v4.4s, v10.4h, XFIX_P_1_175 /* z5 lo = z3 lo * XFIX_P_1_175 */
2381 smull2 v5.4s, v10.8h, XFIX_P_1_175
2382 smlal v4.4s, v11.4h, XFIX_P_1_175 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
2383 smlal2 v5.4s, v11.8h, XFIX_P_1_175
2385 smull2 v24.4s, v28.8h, XFIX_P_0_298
2386 smull2 v25.4s, v29.8h, XFIX_P_2_053
2387 smull2 v26.4s, v30.8h, XFIX_P_3_072
2388 smull2 v27.4s, v31.8h, XFIX_P_1_501
2389 smull v28.4s, v28.4h, XFIX_P_0_298 /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
2390 smull v29.4s, v29.4h, XFIX_P_2_053 /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
2391 smull v30.4s, v30.4h, XFIX_P_3_072 /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
2392 smull v31.4s, v31.4h, XFIX_P_1_501 /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
2394 smull2 v12.4s, v8.8h, XFIX_N_0_899
2395 smull2 v13.4s, v9.8h, XFIX_N_2_562
2396 smull2 v14.4s, v10.8h, XFIX_N_1_961
2397 smull2 v15.4s, v11.8h, XFIX_N_0_390
2398 smull v8.4s, v8.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223); */
2399 smull v9.4s, v9.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447); */
2400 smull v10.4s, v10.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560); */
2401 smull v11.4s, v11.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644); */
2403 add v10.4s, v10.4s, v4.4s /* z3 += z5 */
2404 add v14.4s, v14.4s, v5.4s
2405 add v11.4s, v11.4s, v4.4s /* z4 += z5 */
2406 add v15.4s, v15.4s, v5.4s
2408 add v28.4s, v28.4s, v8.4s /* tmp4 += z1 */
2409 add v24.4s, v24.4s, v12.4s
2410 add v29.4s, v29.4s, v9.4s /* tmp5 += z2 */
2411 add v25.4s, v25.4s, v13.4s
2412 add v30.4s, v30.4s, v10.4s /* tmp6 += z3 */
2413 add v26.4s, v26.4s, v14.4s
2414 add v31.4s, v31.4s, v11.4s /* tmp7 += z4 */
2415 add v27.4s, v27.4s, v15.4s
2417 add v28.4s, v28.4s, v10.4s /* tmp4 += z3 */
2418 add v24.4s, v24.4s, v14.4s
2419 add v29.4s, v29.4s, v11.4s /* tmp5 += z4 */
2420 add v25.4s, v25.4s, v15.4s
2421 add v30.4s, v30.4s, v9.4s /* tmp6 += z2 */
2422 add v26.4s, v26.4s, v13.4s
2423 add v31.4s, v31.4s, v8.4s /* tmp7 += z1 */
2424 add v27.4s, v27.4s, v12.4s
2426 rshrn v23.4h, v28.4s, #DESCALE_P1
2427 rshrn v21.4h, v29.4s, #DESCALE_P1
2428 rshrn v19.4h, v30.4s, #DESCALE_P1
2429 rshrn v17.4h, v31.4s, #DESCALE_P1
2430 rshrn2 v23.8h, v24.4s, #DESCALE_P1 /* dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
2431 rshrn2 v21.8h, v25.4s, #DESCALE_P1 /* dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
2432 rshrn2 v19.8h, v26.4s, #DESCALE_P1 /* dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
2433 rshrn2 v17.8h, v27.4s, #DESCALE_P1 /* dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
2436 transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
2439 add v24.8h, v16.8h, v23.8h /* tmp0 = dataptr[0] + dataptr[7]; */
2440 sub v31.8h, v16.8h, v23.8h /* tmp7 = dataptr[0] - dataptr[7]; */
2441 add v25.8h, v17.8h, v22.8h /* tmp1 = dataptr[1] + dataptr[6]; */
2442 sub v30.8h, v17.8h, v22.8h /* tmp6 = dataptr[1] - dataptr[6]; */
2443 add v26.8h, v18.8h, v21.8h /* tmp2 = dataptr[2] + dataptr[5]; */
2444 sub v29.8h, v18.8h, v21.8h /* tmp5 = dataptr[2] - dataptr[5]; */
2445 add v27.8h, v19.8h, v20.8h /* tmp3 = dataptr[3] + dataptr[4]; */
2446 sub v28.8h, v19.8h, v20.8h /* tmp4 = dataptr[3] - dataptr[4]; */
2449 add v8.8h, v24.8h, v27.8h /* tmp10 = tmp0 + tmp3; */
2450 sub v9.8h, v24.8h, v27.8h /* tmp13 = tmp0 - tmp3; */
2451 add v10.8h, v25.8h, v26.8h /* tmp11 = tmp1 + tmp2; */
2452 sub v11.8h, v25.8h, v26.8h /* tmp12 = tmp1 - tmp2; */
2454 add v16.8h, v8.8h, v10.8h /* tmp10 + tmp11 */
2455 sub v20.8h, v8.8h, v10.8h /* tmp10 - tmp11 */
2457 add v18.8h, v11.8h, v9.8h /* tmp12 + tmp13 */
2459 srshr v16.8h, v16.8h, #PASS1_BITS /* dataptr[0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS); */
2460 srshr v20.8h, v20.8h, #PASS1_BITS /* dataptr[4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS); */
2462 smull2 v24.4s, v18.8h, XFIX_P_0_541 /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
2463 smull v18.4s, v18.4h, XFIX_P_0_541 /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
2464 mov v22.16b, v18.16b
2465 mov v25.16b, v24.16b
2467 smlal v18.4s, v9.4h, XFIX_P_0_765 /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
2468 smlal2 v24.4s, v9.8h, XFIX_P_0_765 /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
2469 smlal v22.4s, v11.4h, XFIX_N_1_847 /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
2470 smlal2 v25.4s, v11.8h, XFIX_N_1_847 /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
2472 rshrn v18.4h, v18.4s, #DESCALE_P2
2473 rshrn v22.4h, v22.4s, #DESCALE_P2
2474 rshrn2 v18.8h, v24.4s, #DESCALE_P2 /* dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
2475 rshrn2 v22.8h, v25.4s, #DESCALE_P2 /* dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
2478 add v8.8h, v28.8h, v31.8h /* z1 = tmp4 + tmp7; */
2479 add v9.8h, v29.8h, v30.8h /* z2 = tmp5 + tmp6; */
2480 add v10.8h, v28.8h, v30.8h /* z3 = tmp4 + tmp6; */
2481 add v11.8h, v29.8h, v31.8h /* z4 = tmp5 + tmp7; */
2483 smull v4.4s, v10.4h, XFIX_P_1_175 /* z5 lo = z3 lo * XFIX_P_1_175 */
2484 smull2 v5.4s, v10.8h, XFIX_P_1_175
2485 smlal v4.4s, v11.4h, XFIX_P_1_175 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
2486 smlal2 v5.4s, v11.8h, XFIX_P_1_175
2488 smull2 v24.4s, v28.8h, XFIX_P_0_298
2489 smull2 v25.4s, v29.8h, XFIX_P_2_053
2490 smull2 v26.4s, v30.8h, XFIX_P_3_072
2491 smull2 v27.4s, v31.8h, XFIX_P_1_501
2492 smull v28.4s, v28.4h, XFIX_P_0_298 /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
2493 smull v29.4s, v29.4h, XFIX_P_2_053 /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
2494 smull v30.4s, v30.4h, XFIX_P_3_072 /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
2495 smull v31.4s, v31.4h, XFIX_P_1_501 /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
2497 smull2 v12.4s, v8.8h, XFIX_N_0_899
2498 smull2 v13.4s, v9.8h, XFIX_N_2_562
2499 smull2 v14.4s, v10.8h, XFIX_N_1_961
2500 smull2 v15.4s, v11.8h, XFIX_N_0_390
2501 smull v8.4s, v8.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223); */
2502 smull v9.4s, v9.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447); */
2503 smull v10.4s, v10.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560); */
2504 smull v11.4s, v11.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644); */
2506 add v10.4s, v10.4s, v4.4s
2507 add v14.4s, v14.4s, v5.4s
2508 add v11.4s, v11.4s, v4.4s
2509 add v15.4s, v15.4s, v5.4s
2511 add v28.4s, v28.4s, v8.4s /* tmp4 += z1 */
2512 add v24.4s, v24.4s, v12.4s
2513 add v29.4s, v29.4s, v9.4s /* tmp5 += z2 */
2514 add v25.4s, v25.4s, v13.4s
2515 add v30.4s, v30.4s, v10.4s /* tmp6 += z3 */
2516 add v26.4s, v26.4s, v14.4s
2517 add v31.4s, v31.4s, v11.4s /* tmp7 += z4 */
2518 add v27.4s, v27.4s, v15.4s
2520 add v28.4s, v28.4s, v10.4s /* tmp4 += z3 */
2521 add v24.4s, v24.4s, v14.4s
2522 add v29.4s, v29.4s, v11.4s /* tmp5 += z4 */
2523 add v25.4s, v25.4s, v15.4s
2524 add v30.4s, v30.4s, v9.4s /* tmp6 += z2 */
2525 add v26.4s, v26.4s, v13.4s
2526 add v31.4s, v31.4s, v8.4s /* tmp7 += z1 */
2527 add v27.4s, v27.4s, v12.4s
2529 rshrn v23.4h, v28.4s, #DESCALE_P2
2530 rshrn v21.4h, v29.4s, #DESCALE_P2
2531 rshrn v19.4h, v30.4s, #DESCALE_P2
2532 rshrn v17.4h, v31.4s, #DESCALE_P2
2533 rshrn2 v23.8h, v24.4s, #DESCALE_P2 /* dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
2534 rshrn2 v21.8h, v25.4s, #DESCALE_P2 /* dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
2535 rshrn2 v19.8h, v26.4s, #DESCALE_P2 /* dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
2536 rshrn2 v17.8h, v27.4s, #DESCALE_P2 /* dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
2539 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
2540 st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
2542 /* Restore NEON registers */
2543 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
2544 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
2565 /*****************************************************************************/
2568 * jsimd_fdct_ifast_neon
2570 * This function contains a fast, not so accurate integer implementation of
2571 * the forward DCT (Discrete Cosine Transform). It uses the same calculations
2572 * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
2573 * function from jfdctfst.c
2575 * TODO: can be combined with 'jsimd_convsamp_neon' to get
2576 * rid of a bunch of LD1 instructions
2579 #undef XFIX_0_541196100
2580 #define XFIX_0_382683433 v0.h[0]
2581 #define XFIX_0_541196100 v0.h[1]
2582 #define XFIX_0_707106781 v0.h[2]
2583 #define XFIX_1_306562965 v0.h[3]
2586 Ljsimd_fdct_ifast_neon_consts:
2587 .short (98 * 128) /* XFIX_0_382683433 */
2588 .short (139 * 128) /* XFIX_0_541196100 */
2589 .short (181 * 128) /* XFIX_0_707106781 */
2590 .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */
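/* These are the jfdctfst.c constants (round(x * 256), i.e. CONST_BITS = 8)
 * pre-multiplied by 128 so that sqdmulh can stand in for MULTIPLY; a sketch
 * of the equivalence:
 *
 *   int16_t mul_fix8(int16_t a, int c)            // c = round(x * 256)
 *   { return (int16_t)(((int32_t)a * c) >> 8); }  // == sqdmulh(a, c * 128)
 *
 * since sqdmulh(a, c * 128) computes (2 * a * c * 128) >> 16. For 1.306562965
 * only the fractional part fits this scheme, hence (334 - 256) * 128 above;
 * the code adds sqdmulh's result to the original value instead of replacing it.
 */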
2592 asm_function jsimd_fdct_ifast_neon
2597 /* Load constants */
2598 adr TMP, Ljsimd_fdct_ifast_neon_consts
2601 /* Load all DATA into NEON registers with the following allocation:
2603 * ---------+--------
2604 * 0 | d16 | d17 | v16.8h
2605 * 1 | d18 | d19 | v17.8h
2606 * 2 | d20 | d21 | v18.8h
2607 * 3 | d22 | d23 | v19.8h
2608 * 4 | d24 | d25 | v20.8h
2609 * 5 | d26 | d27 | v21.8h
2610 * 6 | d28 | d29 | v22.8h
2611 * 7 | d30 | d31 | v23.8h
2614 ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
2615 ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
2620 transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v1, v2, v3, v4
2623 add v4.8h, v19.8h, v20.8h
2624 sub v20.8h, v19.8h, v20.8h
2625 sub v28.8h, v18.8h, v21.8h
2626 add v18.8h, v18.8h, v21.8h
2627 sub v29.8h, v17.8h, v22.8h
2628 add v17.8h, v17.8h, v22.8h
2629 sub v21.8h, v16.8h, v23.8h
2630 add v16.8h, v16.8h, v23.8h
2631 sub v6.8h, v17.8h, v18.8h
2632 sub v7.8h, v16.8h, v4.8h
2633 add v5.8h, v17.8h, v18.8h
2634 add v6.8h, v6.8h, v7.8h
2635 add v4.8h, v16.8h, v4.8h
2636 sqdmulh v6.8h, v6.8h, XFIX_0_707106781
2637 add v19.8h, v20.8h, v28.8h
2638 add v16.8h, v4.8h, v5.8h
2639 sub v20.8h, v4.8h, v5.8h
2640 add v5.8h, v28.8h, v29.8h
2641 add v29.8h, v29.8h, v21.8h
2642 sqdmulh v5.8h, v5.8h, XFIX_0_707106781
2643 sub v28.8h, v19.8h, v29.8h
2644 add v18.8h, v7.8h, v6.8h
2645 sqdmulh v28.8h, v28.8h, XFIX_0_382683433
2646 sub v22.8h, v7.8h, v6.8h
2647 sqdmulh v19.8h, v19.8h, XFIX_0_541196100
2648 sqdmulh v7.8h, v29.8h, XFIX_1_306562965
2649 add v6.8h, v21.8h, v5.8h
2650 sub v5.8h, v21.8h, v5.8h
2651 add v29.8h, v29.8h, v28.8h
2652 add v19.8h, v19.8h, v28.8h
2653 add v29.8h, v29.8h, v7.8h
2654 add v21.8h, v5.8h, v19.8h
2655 sub v19.8h, v5.8h, v19.8h
2656 add v17.8h, v6.8h, v29.8h
2657 sub v23.8h, v6.8h, v29.8h
2662 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
2663 st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
2669 #undef XFIX_0_382683433
2670 #undef XFIX_0_541196100
2671 #undef XFIX_0_707106781
2672 #undef XFIX_1_306562965
2675 /*****************************************************************************/
2679 * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM *divisors,
2680 * DCTELEM *workspace);
2683 asm_function jsimd_quantize_neon
2689 RECIPROCAL .req DIVISORS
2695 add CORRECTION, DIVISORS, #(64 * 2)
2696 add SHIFT, DIVISORS, #(64 * 6)
2698 subs LOOP_COUNT, LOOP_COUNT, #1
2699 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [WORKSPACE], 64
2700 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [CORRECTION], 64
2705 ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [RECIPROCAL], 64
2706 add v20.8h, v20.8h, v4.8h /* add correction */
2707 add v21.8h, v21.8h, v5.8h
2708 add v22.8h, v22.8h, v6.8h
2709 add v23.8h, v23.8h, v7.8h
2710 umull v4.4s, v20.4h, v28.4h /* multiply by reciprocal */
2711 umull2 v16.4s, v20.8h, v28.8h
2712 umull v5.4s, v21.4h, v29.4h
2713 umull2 v17.4s, v21.8h, v29.8h
2714 umull v6.4s, v22.4h, v30.4h /* multiply by reciprocal */
2715 umull2 v18.4s, v22.8h, v30.8h
2716 umull v7.4s, v23.4h, v31.4h
2717 umull2 v19.4s, v23.8h, v31.8h
2718 ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [SHIFT], 64
2719 shrn v4.4h, v4.4s, #16
2720 shrn v5.4h, v5.4s, #16
2721 shrn v6.4h, v6.4s, #16
2722 shrn v7.4h, v7.4s, #16
2723 shrn2 v4.8h, v16.4s, #16
2724 shrn2 v5.8h, v17.4s, #16
2725 shrn2 v6.8h, v18.4s, #16
2726 shrn2 v7.8h, v19.4s, #16
2731 sshr v0.8h, v0.8h, #15 /* extract sign */
2732 sshr v1.8h, v1.8h, #15
2733 sshr v2.8h, v2.8h, #15
2734 sshr v3.8h, v3.8h, #15
2735 ushl v4.8h, v4.8h, v24.8h /* shift */
2736 ushl v5.8h, v5.8h, v25.8h
2737 ushl v6.8h, v6.8h, v26.8h
2738 ushl v7.8h, v7.8h, v27.8h
2740 eor v4.16b, v4.16b, v0.16b /* restore sign */
2741 eor v5.16b, v5.16b, v1.16b
2742 eor v6.16b, v6.16b, v2.16b
2743 eor v7.16b, v7.16b, v3.16b
2744 sub v4.8h, v4.8h, v0.8h
2745 sub v5.8h, v5.8h, v1.8h
2746 sub v6.8h, v6.8h, v2.8h
2747 sub v7.8h, v7.8h, v3.8h
2748 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [COEF_BLOCK], 64
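/* Scalar sketch of one iteration of the loop above (names illustrative; the
 * absolute value of each coefficient is formed on lines not shown here, and
 * the divisors table supplies per-coefficient reciprocal, correction and
 * shift values at offsets 0, 64*2 and 64*6):
 *
 *   int16_t  c    = workspace[i];
 *   int16_t  sign = c >> 15;                                  // 0 or -1 (sshr #15)
 *   uint16_t mag  = (uint16_t)((c ^ sign) - sign);            // |c|
 *   mag          += correction[i];                            // pre-rounding bias
 *   uint16_t q    = (uint16_t)(((uint32_t)mag * reciprocal[i]) >> 16);
 *   q             = shift[i] >= 0 ? q << shift[i] : q >> -shift[i];  // ushl semantics
 *   coef_block[i] = (int16_t)((q ^ sign) - sign);             // restore the sign
 */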
2763 /*****************************************************************************/
2766 * Downsample pixel values of a single component.
2767 * This version handles the common case of 2:1 horizontal and 1:1 vertical,
2768 * without smoothing.
2771 * jsimd_h2v1_downsample_neon (JDIMENSION image_width, int max_v_samp_factor,
2772 * JDIMENSION v_samp_factor,
2773 * JDIMENSION width_blocks, JSAMPARRAY input_data,
2774 * JSAMPARRAY output_data);
2778 Ljsimd_h2_downsample_neon_consts:
2779 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
2780 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F /* diff 0 */
2781 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
2782 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E /* diff 1 */
2783 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
2784 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D /* diff 2 */
2785 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
2786 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C /* diff 3 */
2787 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
2788 0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B /* diff 4 */
2789 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
2790 0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A /* diff 5 */
2791 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
2792 0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09 /* diff 6 */
2793 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
2794 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08 /* diff 7 */
2795 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
2796 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07 /* diff 8 */
2797 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, \
2798 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06 /* diff 9 */
2799 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, \
2800 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05 /* diff 10 */
2801 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, \
2802 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04 /* diff 11 */
2803 .byte 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, \
2804 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 /* diff 12 */
2805 .byte 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, \
2806 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02 /* diff 13 */
2807 .byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, \
2808 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 /* diff 14 */
2809 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
2810 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 /* diff 15 */
2812 asm_function jsimd_h2v1_downsample_neon
2826 mov TMPDUP, #0x10000
2827 lsl TMP2, BLOCK_WIDTH, #4
2828 sub TMP2, TMP2, IMAGE_WIDTH
2829 adr TMP3, Ljsimd_h2_downsample_neon_consts
2830 add TMP3, TMP3, TMP2, lsl #4
2832 ld1 {v18.16b}, [TMP3]
2835 ldr INPTR, [INPUT_DATA], #8
2836 ldr OUTPTR, [OUTPUT_DATA], #8
2837 subs TMP1, BLOCK_WIDTH, #1
2840 ld1 {v0.16b}, [INPTR], #16
2843 uadalp v4.8h, v0.16b
2844 shrn v6.8b, v4.8h, #1
2845 st1 {v6.8b}, [OUTPTR], #8
2847 3: /* last columns */
2848 ld1 {v0.16b}, [INPTR]
2850 subs V_SAMP, V_SAMP, #1
2852 tbl v2.16b, {v0.16b}, v18.16b
2853 uadalp v4.8h, v2.16b
2854 shrn v6.8b, v4.8h, #1
2855 st1 {v6.8b}, [OUTPTR], #8
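/* Scalar sketch of the column handling above: the tbl lookup replicates the
 * last valid input pixel (per the "diff" tables) so a partial block behaves
 * as if the row were padded, and each output sample is the average of a
 * horizontal pair with the usual alternating bias seeded by 0x10000
 * (bias 0 for even output columns, 1 for odd ones):
 *
 *   out[j] = (uint8_t)((in[2 * j] + in[2 * j + 1] + (j & 1)) >> 1);
 */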
2874 /*****************************************************************************/
2877 * Downsample pixel values of a single component.
2878 * This version handles the common case of 2:1 horizontal and 2:1 vertical,
2879 * without smoothing.
2882 * jsimd_h2v2_downsample_neon (JDIMENSION image_width, int max_v_samp_factor,
2883 * JDIMENSION v_samp_factor, JDIMENSION width_blocks,
2884 * JSAMPARRAY input_data, JSAMPARRAY output_data);
2888 asm_function jsimd_h2v2_downsample_neon
2904 lsl TMP2, BLOCK_WIDTH, #4
2905 lsl TMPDUP, TMPDUP, #17
2906 sub TMP2, TMP2, IMAGE_WIDTH
2907 adr TMP3, Ljsimd_h2_downsample_neon_consts
2908 orr TMPDUP, TMPDUP, #1
2909 add TMP3, TMP3, TMP2, lsl #4
2911 ld1 {v18.16b}, [TMP3]
2914 ldr INPTR0, [INPUT_DATA], #8
2915 ldr OUTPTR, [OUTPUT_DATA], #8
2916 ldr INPTR1, [INPUT_DATA], #8
2917 subs TMP1, BLOCK_WIDTH, #1
2920 ld1 {v0.16b}, [INPTR0], #16
2921 ld1 {v1.16b}, [INPTR1], #16
2924 uadalp v4.8h, v0.16b
2925 uadalp v4.8h, v1.16b
2926 shrn v6.8b, v4.8h, #2
2927 st1 {v6.8b}, [OUTPTR], #8
2929 3: /* last columns */
2930 ld1 {v0.16b}, [INPTR0], #16
2931 ld1 {v1.16b}, [INPTR1], #16
2933 subs V_SAMP, V_SAMP, #1
2935 tbl v2.16b, {v0.16b}, v18.16b
2936 tbl v3.16b, {v1.16b}, v18.16b
2937 uadalp v4.8h, v2.16b
2938 uadalp v4.8h, v3.16b
2939 shrn v6.8b, v4.8h, #2
2940 st1 {v6.8b}, [OUTPTR], #8
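/* Scalar sketch of the column handling above: each output sample averages a
 * 2x2 block of input pixels, with the alternating 1/2 bias encoded by the
 * 0x20001 seed built in the lsl/orr setup above:
 *
 *   out[j] = (uint8_t)((in0[2 * j] + in0[2 * j + 1] +
 *                       in1[2 * j] + in1[2 * j + 1] + 1 + (j & 1)) >> 2);
 */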
2960 /*****************************************************************************/
2964 * jsimd_huff_encode_one_block (working_state *state, JOCTET *buffer,
2965 * JCOEFPTR block, int last_dc_val,
2966 * c_derived_tbl *dctbl, c_derived_tbl *actbl)
2976 sub PUT_BITS, PUT_BITS, #0x8
2977 lsr x19, PUT_BUFFER, PUT_BITS
2979 strb w19, [BUFFER, #1]!
2982 strb wzr, [BUFFER, #1]!
2985 .macro put_bits CODE, SIZE
2986 lsl PUT_BUFFER, PUT_BUFFER, \SIZE
2987 add PUT_BITS, PUT_BITS, \SIZE
2988 orr PUT_BUFFER, PUT_BUFFER, \CODE
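/* Scalar sketch of the bit-buffer handling in put_bits above and the byte
 * flush sequence a few lines up (codes are accumulated MSB-first; when a
 * flushed byte is 0xFF, a 0x00 stuff byte follows, as JPEG requires):
 *
 *   put_buffer = (put_buffer << size) | code;  put_bits += size;
 *   ...
 *   put_bits -= 8;
 *   uint8_t byte = (uint8_t)(put_buffer >> put_bits);
 *   *++buffer = byte;
 *   if (byte == 0xFF)
 *     *++buffer = 0;
 */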
3011 .macro generate_jsimd_huff_encode_one_block fast_tbl
3015 Ljsimd_huff_encode_one_block_neon_consts:
3017 Ljsimd_huff_encode_one_block_neon_slowtbl_consts:
3019 .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
3020 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
3022 .byte 0, 1, 2, 3, 16, 17, 32, 33, \
3023 18, 19, 4, 5, 6, 7, 20, 21 /* L0 => L3 : 4 lines OK */
3024 .byte 34, 35, 48, 49, 255, 255, 50, 51, \
3025 36, 37, 22, 23, 8, 9, 10, 11 /* L0 => L3 : 4 lines OK */
3026 .byte 8, 9, 22, 23, 36, 37, 50, 51, \
3027 255, 255, 255, 255, 255, 255, 52, 53 /* L1 => L4 : 4 lines OK */
3028 .byte 54, 55, 40, 41, 26, 27, 12, 13, \
3029 14, 15, 28, 29, 42, 43, 56, 57 /* L0 => L3 : 4 lines OK */
3030 .byte 6, 7, 20, 21, 34, 35, 48, 49, \
3031 50, 51, 36, 37, 22, 23, 8, 9 /* L4 => L7 : 4 lines OK */
3032 .byte 42, 43, 28, 29, 14, 15, 30, 31, \
3033 44, 45, 58, 59, 255, 255, 255, 255 /* L1 => L4 : 4 lines OK */
3034 .byte 255, 255, 255, 255, 56, 57, 42, 43, \
3035 28, 29, 14, 15, 30, 31, 44, 45 /* L3 => L6 : 4 lines OK */
3036 .byte 26, 27, 40, 41, 42, 43, 28, 29, \
3037 14, 15, 30, 31, 44, 45, 46, 47 /* L5 => L7 : 3 lines OK */
3038 .byte 255, 255, 255, 255, 0, 1, 255, 255, \
3039 255, 255, 255, 255, 255, 255, 255, 255 /* L4 : 1 lines OK */
3040 .byte 255, 255, 255, 255, 255, 255, 255, 255, \
3041 0, 1, 16, 17, 2, 3, 255, 255 /* L5 => L6 : 2 lines OK */
3042 .byte 255, 255, 255, 255, 255, 255, 255, 255, \
3043 255, 255, 255, 255, 8, 9, 22, 23 /* L5 => L6 : 2 lines OK */
3044 .byte 4, 5, 6, 7, 255, 255, 255, 255, \
3045 255, 255, 255, 255, 255, 255, 255, 255 /* L7 : 1 line OK */
3049 asm_function jsimd_huff_encode_one_block_neon
3051 asm_function jsimd_huff_encode_one_block_neon_slowtbl
3054 sub BUFFER, BUFFER, #0x1 /* BUFFER=buffer-- */
3055 /* Save ARM registers */
3058 adr x15, Ljsimd_huff_encode_one_block_neon_consts
3060 adr x15, Ljsimd_huff_encode_one_block_neon_slowtbl_consts
3062 ldr PUT_BUFFER, [x0, #0x10]
3063 ldr PUT_BITSw, [x0, #0x18]
3064 ldrsh w12, [x2] /* load DC coeff in w12 */
3067 ld1 {v23.16b}, [x15], #16
3068 ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x15], #64
3069 ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x15], #64
3070 ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x15], #64
3071 ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #64
3072 ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [x2], #64
3073 sub w12, w12, w3 /* last_dc_val, not used afterwards */
3075 tbl v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b
3076 tbl v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b
3077 tbl v2.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v2.16b
3078 tbl v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b
3079 tbl v4.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v4.16b
3080 tbl v5.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v5.16b
3081 tbl v6.16b, {v27.16b, v28.16b, v29.16b, v30.16b}, v6.16b
3082 tbl v7.16b, {v29.16b, v30.16b, v31.16b}, v7.16b
3084 tbx v1.16b, {v28.16b}, v16.16b
3085 tbx v2.16b, {v29.16b, v30.16b}, v17.16b
3086 tbx v5.16b, {v29.16b, v30.16b}, v18.16b
3087 tbx v6.16b, {v31.16b}, v19.16b
3090 sub w12, w12, w3 /* last_dc_val, not used afterwards */
3091 ld1 {v23.16b}, [x15]
3096 ld1 {v1.h}[0], [x13]
3098 ld1 {v2.h}[0], [x14]
3104 ld1 {v1.h}[1], [x15]
3106 ld1 {v2.h}[1], [x19]
3108 ld1 {v3.h}[1], [x20]
3110 ld1 {v0.h}[2], [x12]
3112 ld1 {v1.h}[2], [x13]
3114 ld1 {v2.h}[2], [x14]
3120 ld1 {v1.h}[3], [x15]
3122 ld1 {v2.h}[3], [x19]
3124 ld1 {v3.h}[3], [x20]
3126 ld1 {v0.h}[4], [x12]
3128 ld1 {v1.h}[4], [x13]
3130 ld1 {v2.h}[4], [x14]
3136 ld1 {v1.h}[5], [x15]
3138 ld1 {v2.h}[5], [x19]
3140 ld1 {v3.h}[5], [x20]
3142 ld1 {v0.h}[6], [x12]
3144 ld1 {v1.h}[6], [x13]
3146 ld1 {v2.h}[6], [x14]
3152 ld1 {v1.h}[7], [x15]
3154 ld1 {v2.h}[7], [x19]
3156 ld1 {v3.h}[7], [x20]
3158 ld1 {v4.h}[0], [x12]
3160 ld1 {v5.h}[0], [x13]
3162 ld1 {v6.h}[0], [x14]
3168 ld1 {v5.h}[1], [x15]
3170 ld1 {v6.h}[1], [x19]
3172 ld1 {v7.h}[1], [x20]
3174 ld1 {v4.h}[2], [x12]
3176 ld1 {v5.h}[2], [x13]
3178 ld1 {v6.h}[2], [x14]
3184 ld1 {v5.h}[3], [x15]
3186 ld1 {v6.h}[3], [x19]
3188 ld1 {v7.h}[3], [x20]
3190 ld1 {v4.h}[4], [x12]
3192 ld1 {v5.h}[4], [x13]
3194 ld1 {v6.h}[4], [x14]
3200 ld1 {v5.h}[5], [x15]
3202 ld1 {v6.h}[5], [x19]
3204 ld1 {v7.h}[5], [x20]
3206 ld1 {v4.h}[6], [x12]
3208 ld1 {v5.h}[6], [x13]
3210 ld1 {v6.h}[6], [x14]
3215 ld1 {v5.h}[7], [x15]
3216 ld1 {v6.h}[7], [x19]
3217 ld1 {v7.h}[7], [x20]
3219 cmlt v24.8h, v0.8h, #0
3220 cmlt v25.8h, v1.8h, #0
3221 cmlt v26.8h, v2.8h, #0
3222 cmlt v27.8h, v3.8h, #0
3223 cmlt v28.8h, v4.8h, #0
3224 cmlt v29.8h, v5.8h, #0
3225 cmlt v30.8h, v6.8h, #0
3226 cmlt v31.8h, v7.8h, #0
3235 eor v24.16b, v24.16b, v0.16b
3236 eor v25.16b, v25.16b, v1.16b
3237 eor v26.16b, v26.16b, v2.16b
3238 eor v27.16b, v27.16b, v3.16b
3239 eor v28.16b, v28.16b, v4.16b
3240 eor v29.16b, v29.16b, v5.16b
3241 eor v30.16b, v30.16b, v6.16b
3242 eor v31.16b, v31.16b, v7.16b
3243 cmeq v16.8h, v0.8h, #0
3244 cmeq v17.8h, v1.8h, #0
3245 cmeq v18.8h, v2.8h, #0
3246 cmeq v19.8h, v3.8h, #0
3247 cmeq v20.8h, v4.8h, #0
3248 cmeq v21.8h, v5.8h, #0
3249 cmeq v22.8h, v6.8h, #0
3255 xtn2 v16.16b, v17.8h
3257 xtn2 v18.16b, v19.8h
3259 xtn2 v20.16b, v21.8h
3261 cmeq v17.8h, v7.8h, #0
3263 xtn2 v22.16b, v17.8h
3265 and v16.16b, v16.16b, v23.16b
3267 and v18.16b, v18.16b, v23.16b
3268 add x3, x4, #0x400 /* x3 = dctbl->ehufsi */
3269 and v20.16b, v20.16b, v23.16b
3270 add x15, sp, #0x90 /* x15 = t2 */
3271 and v22.16b, v22.16b, v23.16b
3272 ldr w10, [x4, x12, lsl #2]
3273 addp v16.16b, v16.16b, v18.16b
3275 addp v20.16b, v20.16b, v22.16b
3277 addp v16.16b, v16.16b, v20.16b
3279 addp v16.16b, v16.16b, v18.16b
3286 add x4, x5, #0x400 /* x4 = actbl->ehufsi */
3288 lsr x9, x9, #0x1 /* clear AC coeff */
3289 ldr w13, [x5, #0x3c0] /* x13 = actbl->ehufco[0xf0] */
3290 rbit x9, x9 /* x9 = index0 */
3291 ldrb w14, [x4, #0xf0] /* x14 = actbl->ehufsi[0xf0] */
3296 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
3297 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
3298 st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
3299 st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
3302 add x15, x15, x2, lsl #1
3304 ldrh w20, [x15, #-126]
3319 add x2, x11, x2, lsl #4
3321 ldr w12, [x5, x2, lsl #2]
3329 movi v21.8h, #0x0010
3338 ushl v24.8h, v24.8h, v0.8h
3339 ushl v25.8h, v25.8h, v1.8h
3340 ushl v26.8h, v26.8h, v2.8h
3341 ushl v27.8h, v27.8h, v3.8h
3342 ushl v28.8h, v28.8h, v4.8h
3343 ushl v29.8h, v29.8h, v5.8h
3344 ushl v30.8h, v30.8h, v6.8h
3345 ushl v31.8h, v31.8h, v7.8h
3354 ushl v24.8h, v24.8h, v0.8h
3355 ushl v25.8h, v25.8h, v1.8h
3356 ushl v26.8h, v26.8h, v2.8h
3357 ushl v27.8h, v27.8h, v3.8h
3358 ushl v28.8h, v28.8h, v4.8h
3359 ushl v29.8h, v29.8h, v5.8h
3360 ushl v30.8h, v30.8h, v6.8h
3361 ushl v31.8h, v31.8h, v7.8h
3362 add v0.8h, v21.8h, v0.8h
3363 add v1.8h, v21.8h, v1.8h
3364 add v2.8h, v21.8h, v2.8h
3365 add v3.8h, v21.8h, v3.8h
3366 add v4.8h, v21.8h, v4.8h
3367 add v5.8h, v21.8h, v5.8h
3368 add v6.8h, v21.8h, v6.8h
3369 add v7.8h, v21.8h, v7.8h
3370 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
3371 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
3372 st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
3373 st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
3376 add x15, x15, x2, lsl #1
3378 ldrh w11, [x15, #-126]
3388 add x2, x11, x2, lsl #4
3390 ldr w12, [x5, x2, lsl #2]
3405 str PUT_BUFFER, [x0, #0x10]
3406 str PUT_BITSw, [x0, #0x18]
3407 ldp x19, x20, [sp], 16
3408 add x0, BUFFER, #0x1
3414 generate_jsimd_huff_encode_one_block 1
3415 generate_jsimd_huff_encode_one_block 0