simd/jsimd_arm64_neon.S

   1 /*
   2  * ARMv8 NEON optimizations for libjpeg-turbo
   3  *
   4  * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
   5  * All Rights Reserved.
   6  * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
   7  * Copyright (C) 2013-2014, Linaro Limited.  All Rights Reserved.
   8  * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org>
   9  * Copyright (C) 2014-2016, D. R. Commander.  All Rights Reserved.
  10  * Copyright (C) 2015-2016, Matthieu Darbois.  All Rights Reserved.
  11  * Copyright (C) 2016, Siarhei Siamashka.  All Rights Reserved.
  12  *
  13  * This software is provided 'as-is', without any express or implied
  14  * warranty.  In no event will the authors be held liable for any damages
  15  * arising from the use of this software.
  16  *
  17  * Permission is granted to anyone to use this software for any purpose,
  18  * including commercial applications, and to alter it and redistribute it
  19  * freely, subject to the following restrictions:
  20  *
  21  * 1. The origin of this software must not be misrepresented; you must not
  22  *    claim that you wrote the original software. If you use this software
  23  *    in a product, an acknowledgment in the product documentation would be
  24  *    appreciated but is not required.
  25  * 2. Altered source versions must be plainly marked as such, and must not be
  26  *    misrepresented as being the original software.
  27  * 3. This notice may not be removed or altered from any source distribution.
  28  */
  29
  30 #if defined(__linux__) && defined(__ELF__)
  31 .section .note.GNU-stack, "", %progbits  /* mark stack as non-executable */
  32 #endif
  33
  34 .text
  35
  36
  37 #define RESPECT_STRICT_ALIGNMENT 1
  38
  39
  40 /*****************************************************************************/
  41
  42 /* Supplementary macro for setting function attributes.
      * Opens the definition of function `fname`: exports its symbol and emits
      * its entry label.  The function body follows the macro invocation.
      */
  43 .macro asm_function fname
  44 #ifdef __APPLE__
  45     .globl _\fname                  /* Apple targets: the exported symbol carries a leading underscore */
  46 _\fname:
  47 #else
  48     .global \fname
  49 #ifdef __ELF__
  50     .hidden \fname                  /* ELF only: give the symbol hidden visibility */
  51     .type \fname, %function         /* ELF only: record the symbol as a function */
  52 #endif
  53 \fname:
  54 #endif
  55 .endm
  56
  57 /* Transpose elements of single 128 bit registers.
      *   x0, x1  - vector register names; both are read and overwritten
      *   xi      - scratch vector register (clobbered)
      *   xilen   - element suffix used for the two lane copies (ins)
      *   literal - arrangement suffix used for the trn1/trn2 steps
      */
  58 .macro transpose_single x0, x1, xi, xilen, literal
  59     ins             \xi\xilen[0], \x0\xilen[0]             /* xi[0] = x0[0], copied before trn1 rewrites x0 */
  60     ins             \x1\xilen[0], \x0\xilen[1]             /* x1[0] = x0[1] */
  61     trn1            \x0\literal, \x0\literal, \x1\literal  /* x0 = even-numbered elements of x0 and x1, interleaved */
  62     trn2            \x1\literal, \xi\literal, \x1\literal  /* x1 = odd-numbered elements of xi and x1, interleaved */
  63 .endm
  64
  65 /* Transpose elements of 2 different registers.
      *   x0, x1  - vector register names; both are read and overwritten
      *   xi      - scratch vector register (clobbered)
      *   xilen   - arrangement suffix used to copy x0 into xi
      *   literal - arrangement suffix used for the trn1/trn2 steps
      */
  66 .macro transpose x0, x1, xi, xilen, literal
  67     mov             \xi\xilen, \x0\xilen                   /* xi = x0, because trn1 below overwrites x0 */
  68     trn1            \x0\literal, \x0\literal, \x1\literal  /* x0 = even-numbered elements of x0 and x1, interleaved */
  69     trn2            \x1\literal, \xi\literal, \x1\literal  /* x1 = odd-numbered elements of the saved x0 and x1 */
  70 .endm
  71
  72 /* Transpose a block of 4x4 coefficients in four 64-bit registers.
      * This macro is the 32-bit-element step: it combines the register pairs
      * (x0, x2) and (x1, x3) with trn1/trn2.
      *   x0..x3        - vector register names; all four are read and overwritten
      *   x0len..x3len  - arrangement suffix applied to the matching register
      *   xi, xilen     - scratch register (clobbered) and the suffix used to copy into it
      */
  73 .macro transpose_4x4_32 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
  74     mov             \xi\xilen, \x0\xilen                   /* xi = x0, because trn1 below overwrites x0 */
  75     trn1            \x0\x0len, \x0\x0len, \x2\x2len        /* x0 = even-numbered elements of x0 and x2 */
  76     trn2            \x2\x2len, \xi\x0len, \x2\x2len        /* x2 = odd-numbered elements of the saved x0 and x2 */
  77     mov             \xi\xilen, \x1\xilen                   /* xi = x1, because trn1 below overwrites x1 */
  78     trn1            \x1\x1len, \x1\x1len, \x3\x3len        /* x1 = even-numbered elements of x1 and x3 */
  79     trn2            \x3\x3len, \xi\x1len, \x3\x3len        /* x3 = odd-numbered elements of the saved x1 and x3 */
  80 .endm
  81
  82 .macro transpose_4x4_16 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
         /* 16-bit-element step of the 4x4 transpose: combines the register
          * pairs (x0, x1) and (x2, x3) with trn1/trn2.
          *   x0..x3        - vector register names; all four are read and overwritten
          *   x0len..x3len  - arrangement suffix applied to the matching register
          *   xi, xilen     - scratch register (clobbered) and the suffix used to copy into it
          * Each register is paired with its own suffix argument, and the
          * scratch copy takes the suffix of the register it was copied from.
          */
  83     mov             \xi\xilen, \x0\xilen                   /* xi = x0, because trn1 below overwrites x0 */
  84     trn1            \x0\x0len, \x0\x0len, \x1\x1len        /* x0 = even-numbered elements of x0 and x1 */
  85     trn2            \x1\x1len, \xi\x0len, \x1\x1len        /* x1 = odd-numbered elements of the saved x0 and x1 */
  86     mov             \xi\xilen, \x2\xilen                   /* xi = x2, because trn1 below overwrites x2 */
  87     trn1            \x2\x2len, \x2\x2len, \x3\x3len        /* x2 = even-numbered elements of x2 and x3 */
  88     trn2            \x3\x3len, \xi\x2len, \x3\x3len        /* x3 = odd-numbered elements of the saved x2 and x3 */
  89 .endm
  90
  91 .macro transpose_4x4 x0, x1, x2, x3, x5
         /* 4x4 transpose of 16-bit elements held in x0..x3 (.4h view), done in
          * two steps: 16-bit trn on the pairs (x0, x1) and (x2, x3), then
          * 32-bit trn on the pairs (x0, x2) and (x1, x3).
          *   x0..x3 - vector register names; read and overwritten in place
          *   x5     - scratch vector register (clobbered; copied as .16b)
          */
  92     transpose_4x4_16 \x0, .4h, \x1, .4h, \x2, .4h, \x3, .4h, \x5, .16b
  93     transpose_4x4_32 \x0, .2s, \x1, .2s, \x2, .2s, \x3, .2s, \x5, .16b
  94 .endm
  95
  96 .macro transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3
         /* In-place 8x8 transpose of 16-bit elements.
          *   l0..l7 - register numbers (the macro prefixes nothing and appends
          *            the .8h/.4s/.2d arrangement itself); rows in, columns out
          *   t0..t3 - scratch registers (clobbered)
          * Three trn1/trn2 stages at element widths of 16, 32 and 64 bits.
          * The instruction order matters: sources are consumed before the
          * registers holding them are overwritten.
          */
         /* Stage 1 (16-bit): even-numbered elements of each row pair go to the
          * scratch registers so l0/l2/l4/l6 stay intact for the trn2 that follow. */
  97     trn1            \t0\().8h, \l0\().8h, \l1\().8h
  98     trn1            \t1\().8h, \l2\().8h, \l3\().8h
  99     trn1            \t2\().8h, \l4\().8h, \l5\().8h
 100     trn1            \t3\().8h, \l6\().8h, \l7\().8h
 101     trn2            \l1\().8h, \l0\().8h, \l1\().8h
 102     trn2            \l3\().8h, \l2\().8h, \l3\().8h
 103     trn2            \l5\().8h, \l4\().8h, \l5\().8h
 104     trn2            \l7\().8h, \l6\().8h, \l7\().8h
 105     /* Stage 2 (32-bit): combine the stage-1 results pairwise */
 106     trn1            \l4\().4s, \t2\().4s, \t3\().4s
 107     trn2            \t3\().4s, \t2\().4s, \t3\().4s
 108     trn1            \t2\().4s, \t0\().4s, \t1\().4s
 109     trn2            \l2\().4s, \t0\().4s, \t1\().4s
 110     trn1            \t0\().4s, \l1\().4s, \l3\().4s
 111     trn2            \l3\().4s, \l1\().4s, \l3\().4s
 112     trn2            \t1\().4s, \l5\().4s, \l7\().4s
 113     trn1            \l5\().4s, \l5\().4s, \l7\().4s
 114     /* Stage 3 (64-bit): join the halves into the final l0..l7 */
 115     trn2            \l6\().2d, \l2\().2d, \t3\().2d
 116     trn1            \l0\().2d, \t2\().2d, \l4\().2d
 117     trn1            \l1\().2d, \t0\().2d, \l5\().2d
 118     trn2            \l7\().2d, \l3\().2d, \t1\().2d
 119     trn1            \l2\().2d, \l2\().2d, \t3\().2d
 120     trn2            \l4\().2d, \t2\().2d, \l4\().2d
 121     trn1            \l3\().2d, \l3\().2d, \t1\().2d
 122     trn2            \l5\().2d, \t0\().2d, \l5\().2d
 123 .endm
 124
 125
 126 #define CENTERJSAMPLE 128
 127
 128 /*****************************************************************************/
 129
 130 /*
 131  * Perform dequantization and inverse DCT on one block of coefficients.
 132  *
 133  * GLOBAL(void)
 134  * jsimd_idct_islow_neon (void *dct_table, JCOEFPTR coef_block,
 135  *                        JSAMPARRAY output_buf, JDIMENSION output_col)
 136  */
 137
 138 #define CONST_BITS 13
 139 #define PASS1_BITS 2
 140
 141 #define F_0_298  2446  /* FIX(0.298631336) */
 142 #define F_0_390  3196  /* FIX(0.390180644) */
 143 #define F_0_541  4433  /* FIX(0.541196100) */
 144 #define F_0_765  6270  /* FIX(0.765366865) */
 145 #define F_0_899  7373  /* FIX(0.899976223) */
 146 #define F_1_175  9633  /* FIX(1.175875602) */
 147 #define F_1_501 12299  /* FIX(1.501321110) */
 148 #define F_1_847 15137  /* FIX(1.847759065) */
 149 #define F_1_961 16069  /* FIX(1.961570560) */
 150 #define F_2_053 16819  /* FIX(2.053119869) */
 151 #define F_2_562 20995  /* FIX(2.562915447) */
 152 #define F_3_072 25172  /* FIX(3.072711026) */
 153
 154 .balign 16
 155 Ljsimd_idct_islow_neon_consts:  /* 16 halfwords (32 bytes), loaded as {v0.8h, v1.8h} by jsimd_idct_islow_neon; entry order must match the XFIX_* lane aliases */
 156   .short F_0_298    /* v0.h[0]  XFIX_P_0_298 */
 157   .short -F_0_390   /* v0.h[1]  XFIX_N_0_390 */
 158   .short F_0_541    /* v0.h[2]  XFIX_P_0_541 */
 159   .short F_0_765    /* v0.h[3]  XFIX_P_0_765 */
 160   .short - F_0_899  /* v0.h[4]  XFIX_N_0_899 */
 161   .short F_1_175    /* v0.h[5]  XFIX_P_1_175 */
 162   .short F_1_501    /* v0.h[6]  XFIX_P_1_501 */
 163   .short - F_1_847  /* v0.h[7]  XFIX_N_1_847 */
 164   .short - F_1_961  /* v1.h[0]  XFIX_N_1_961 */
 165   .short F_2_053    /* v1.h[1]  XFIX_P_2_053 */
 166   .short - F_2_562  /* v1.h[2]  XFIX_N_2_562 */
 167   .short F_3_072    /* v1.h[3]  XFIX_P_3_072 */
 168   .short 0          /* padding: v1.h[4..7], fills the table up to the 32 bytes that are loaded */
 169   .short 0
 170   .short 0
 171   .short 0
 172
 173 #undef F_0_298
 174 #undef F_0_390
 175 #undef F_0_541
 176 #undef F_0_765
 177 #undef F_0_899
 178 #undef F_1_175
 179 #undef F_1_501
 180 #undef F_1_847
 181 #undef F_1_961
 182 #undef F_2_053
 183 #undef F_2_562
 184 #undef F_3_072
 185
 186 #define XFIX_P_0_298 v0.h[0]
 187 #define XFIX_N_0_390 v0.h[1]
 188 #define XFIX_P_0_541 v0.h[2]
 189 #define XFIX_P_0_765 v0.h[3]
 190 #define XFIX_N_0_899 v0.h[4]
 191 #define XFIX_P_1_175 v0.h[5]
 192 #define XFIX_P_1_501 v0.h[6]
 193 #define XFIX_N_1_847 v0.h[7]
 194 #define XFIX_N_1_961 v1.h[0]
 195 #define XFIX_P_2_053 v1.h[1]
 196 #define XFIX_N_2_562 v1.h[2]
 197 #define XFIX_P_3_072 v1.h[3]
 198
 199 asm_function jsimd_idct_islow_neon
 200     DCT_TABLE       .req x0
 201     COEF_BLOCK      .req x1
 202     OUTPUT_BUF      .req x2
 203     OUTPUT_COL      .req x3
 204     TMP1            .req x0
 205     TMP2            .req x1
 206     TMP3            .req x9
 207     TMP4            .req x10
 208     TMP5            .req x11
 209     TMP6            .req x12
 210     TMP7            .req x13
 211     TMP8            .req x14
 212
 213     /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
 214        guarantee that the upper (unused) 32 bits of x3 are valid.  This
 215        instruction ensures that those bits are set to zero. */
 216     uxtw x3, w3
 217
 218     sub             sp, sp, #64
 219     adr             x15, Ljsimd_idct_islow_neon_consts
 220     mov             x10, sp
 221     st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32
 222     st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32
 223     ld1             {v0.8h, v1.8h}, [x15]
 224     ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64
 225     ld1             {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64
 226     ld1             {v6.8h, v7.8h, v8.8h, v9.8h}, [COEF_BLOCK], #64
 227     ld1             {v22.8h, v23.8h, v24.8h, v25.8h}, [DCT_TABLE], #64
 228
 229     cmeq            v16.8h, v3.8h, #0
 230     cmeq            v26.8h, v4.8h, #0
 231     cmeq            v27.8h, v5.8h, #0
 232     cmeq            v28.8h, v6.8h, #0
 233     cmeq            v29.8h, v7.8h, #0
 234     cmeq            v30.8h, v8.8h, #0
 235     cmeq            v31.8h, v9.8h, #0
 236
 237     and             v10.16b, v16.16b, v26.16b
 238     and             v11.16b, v27.16b, v28.16b
 239     and             v12.16b, v29.16b, v30.16b
 240     and             v13.16b, v31.16b, v10.16b
 241     and             v14.16b, v11.16b, v12.16b
 242     mul             v2.8h, v2.8h, v18.8h
 243     and             v15.16b, v13.16b, v14.16b
 244     shl             v10.8h, v2.8h, #(PASS1_BITS)
 245     sqxtn           v16.8b, v15.8h
 246     mov             TMP1, v16.d[0]
 247     mvn             TMP2, TMP1
 248
 249     cbnz            TMP2, 2f
 250     /* case all AC coeffs are zeros */
 251     dup             v2.2d, v10.d[0]
 252     dup             v6.2d, v10.d[1]
 253     mov             v3.16b, v2.16b
 254     mov             v7.16b, v6.16b
 255     mov             v4.16b, v2.16b
 256     mov             v8.16b, v6.16b
 257     mov             v5.16b, v2.16b
 258     mov             v9.16b, v6.16b
 259 1:
 260     /* for this transpose, we should organise data like this:
 261      * 00, 01, 02, 03, 40, 41, 42, 43
 262      * 10, 11, 12, 13, 50, 51, 52, 53
 263      * 20, 21, 22, 23, 60, 61, 62, 63
 264      * 30, 31, 32, 33, 70, 71, 72, 73
 265      * 04, 05, 06, 07, 44, 45, 46, 47
 266      * 14, 15, 16, 17, 54, 55, 56, 57
 267      * 24, 25, 26, 27, 64, 65, 66, 67
 268      * 34, 35, 36, 37, 74, 75, 76, 77
 269      */
 270     trn1            v28.8h, v2.8h, v3.8h
 271     trn1            v29.8h, v4.8h, v5.8h
 272     trn1            v30.8h, v6.8h, v7.8h
 273     trn1            v31.8h, v8.8h, v9.8h
 274     trn2            v16.8h, v2.8h, v3.8h
 275     trn2            v17.8h, v4.8h, v5.8h
 276     trn2            v18.8h, v6.8h, v7.8h
 277     trn2            v19.8h, v8.8h, v9.8h
 278     trn1            v2.4s, v28.4s, v29.4s
 279     trn1            v6.4s, v30.4s, v31.4s
 280     trn1            v3.4s, v16.4s, v17.4s
 281     trn1            v7.4s, v18.4s, v19.4s
 282     trn2            v4.4s, v28.4s, v29.4s
 283     trn2            v8.4s, v30.4s, v31.4s
 284     trn2            v5.4s, v16.4s, v17.4s
 285     trn2            v9.4s, v18.4s, v19.4s
 286     /* Even part: reverse the even part of the forward DCT. */
 287     add             v18.8h, v4.8h, v8.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
 288     add             v22.8h, v2.8h, v6.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
 289     smull2          v19.4s, v18.8h, XFIX_P_0_541   /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
 290     sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
 291     smull           v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
 292     sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
 293     mov             v21.16b, v19.16b               /* tmp3 = z1 */
 294     mov             v20.16b, v18.16b               /* tmp3 = z1 */
 295     smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
 296     smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
 297     sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
 298     smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
 299     smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
 300     sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
 301     sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
 302     add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
 303     sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
 304     add             v8.4s, v26.4s, v18.4s          /* tmp11l tmp11 = tmp1 + tmp2; */
 305     sub             v4.4s, v26.4s, v18.4s          /* tmp12l tmp12 = tmp1 - tmp2; */
 306     add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
 307     sub             v31.4s, v23.4s, v21.4s         /* tmp13h tmp13 = tmp0 - tmp3; */
 308     add             v29.4s, v27.4s, v19.4s         /* tmp11h tmp11 = tmp1 + tmp2; */
 309     sub             v30.4s, v27.4s, v19.4s         /* tmp12h tmp12 = tmp1 - tmp2; */
 310
 311     /* Odd part per figure 8; the matrix is unitary and hence its
 312      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
 313      */
 314
 315     add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
 316     add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
 317     add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
 318     add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
 319     add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */
 320
 321     smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
 322     smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
 323     smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
 324     smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
 325     smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
 326     smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
 327     smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
 328     smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
 329     smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
 330
 331     smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
 332     smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
 333     smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
 334     smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
 335     smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
 336     smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
 337     smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
 338     smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
 339     smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
 340
 341     add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
 342     add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
 343     add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
 344     add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
 345
 346     add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
 347     add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
 348     add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
 349     add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
 350     add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
 351     add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
 352     add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
 353     add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */
 354
 355     add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
 356     add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
 357     add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
 358     add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
 359     add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
 360     add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
 361     add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
 362     add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */
 363
 364     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
 365
 366     add             v18.4s, v2.4s, v16.4s   /* tmp10 + tmp3 */
 367     add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
 368     sub             v20.4s, v2.4s, v16.4s   /* tmp10 - tmp3 */
 369     sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
 370     add             v22.4s, v8.4s, v14.4s   /* tmp11 + tmp2 */
 371     add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
 372     sub             v24.4s, v8.4s, v14.4s   /* tmp11 - tmp2 */
 373     sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
 374     add             v26.4s, v4.4s, v12.4s   /* tmp12 + tmp1 */
 375     add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
 376     sub             v28.4s, v4.4s, v12.4s   /* tmp12 - tmp1 */
 377     sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
 378     add             v14.4s, v6.4s, v10.4s   /* tmp13 + tmp0 */
 379     add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
 380     sub             v16.4s, v6.4s, v10.4s   /* tmp13 - tmp0 */
 381     sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */
 382
 383     shrn            v2.4h, v18.4s, #16  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
 384     shrn            v9.4h, v20.4s, #16  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
 385     shrn            v3.4h, v22.4s, #16  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
 386     shrn            v8.4h, v24.4s, #16  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
 387     shrn            v4.4h, v26.4s, #16  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
 388     shrn            v7.4h, v28.4s, #16  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
 389     shrn            v5.4h, v14.4s, #16  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
 390     shrn            v6.4h, v16.4s, #16  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
 391     shrn2           v2.8h, v19.4s, #16  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
 392     shrn2           v9.8h, v21.4s, #16  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
 393     shrn2           v3.8h, v23.4s, #16  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
 394     shrn2           v8.8h, v25.4s, #16  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
 395     shrn2           v4.8h, v27.4s, #16  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
 396     shrn2           v7.8h, v29.4s, #16  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
 397     shrn2           v5.8h, v15.4s, #16  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
 398     shrn2           v6.8h, v17.4s, #16  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
 399     movi            v0.16b, #(CENTERJSAMPLE)
 400     /* Prepare pointers (dual-issue with NEON instructions) */
 401       ldp             TMP1, TMP2, [OUTPUT_BUF], 16
 402     sqrshrn         v28.8b, v2.8h, #(CONST_BITS+PASS1_BITS+3-16)
 403       ldp             TMP3, TMP4, [OUTPUT_BUF], 16
 404     sqrshrn         v29.8b, v3.8h, #(CONST_BITS+PASS1_BITS+3-16)
 405       add             TMP1, TMP1, OUTPUT_COL
 406     sqrshrn         v30.8b, v4.8h, #(CONST_BITS+PASS1_BITS+3-16)
 407       add             TMP2, TMP2, OUTPUT_COL
 408     sqrshrn         v31.8b, v5.8h, #(CONST_BITS+PASS1_BITS+3-16)
 409       add             TMP3, TMP3, OUTPUT_COL
 410     sqrshrn2        v28.16b, v6.8h, #(CONST_BITS+PASS1_BITS+3-16)
 411       add             TMP4, TMP4, OUTPUT_COL
 412     sqrshrn2        v29.16b, v7.8h, #(CONST_BITS+PASS1_BITS+3-16)
 413       ldp             TMP5, TMP6, [OUTPUT_BUF], 16
 414     sqrshrn2        v30.16b, v8.8h, #(CONST_BITS+PASS1_BITS+3-16)
 415       ldp             TMP7, TMP8, [OUTPUT_BUF], 16
 416     sqrshrn2        v31.16b, v9.8h, #(CONST_BITS+PASS1_BITS+3-16)
 417       add             TMP5, TMP5, OUTPUT_COL
 418     add             v16.16b, v28.16b, v0.16b
 419       add             TMP6, TMP6, OUTPUT_COL
 420     add             v18.16b, v29.16b, v0.16b
 421       add             TMP7, TMP7, OUTPUT_COL
 422     add             v20.16b, v30.16b, v0.16b
 423       add             TMP8, TMP8, OUTPUT_COL
 424     add             v22.16b, v31.16b, v0.16b
 425
 426     /* Transpose the final 8-bit samples */
 427     trn1            v28.16b, v16.16b, v18.16b
 428     trn1            v30.16b, v20.16b, v22.16b
 429     trn2            v29.16b, v16.16b, v18.16b
 430     trn2            v31.16b, v20.16b, v22.16b
 431
 432     trn1            v16.8h, v28.8h, v30.8h
 433     trn2            v18.8h, v28.8h, v30.8h
 434     trn1            v20.8h, v29.8h, v31.8h
 435     trn2            v22.8h, v29.8h, v31.8h
 436
 437     uzp1            v28.4s, v16.4s, v18.4s
 438     uzp2            v30.4s, v16.4s, v18.4s
 439     uzp1            v29.4s, v20.4s, v22.4s
 440     uzp2            v31.4s, v20.4s, v22.4s
 441
 442     /* Store results to the output buffer: 8 bytes per row, TMPn = output_buf[n-1] + output_col */
 443     st1             {v28.d}[0], [TMP1]  /* row 0 */
 444     st1             {v29.d}[0], [TMP2]  /* row 1 */
 445     st1             {v28.d}[1], [TMP3]  /* row 2 */
 446     st1             {v29.d}[1], [TMP4]  /* row 3 */
 447     st1             {v30.d}[0], [TMP5]  /* row 4 */
 448     st1             {v31.d}[0], [TMP6]  /* row 5 */
 449     st1             {v30.d}[1], [TMP7]  /* row 6 */
 450     st1             {v31.d}[1], [TMP8]  /* row 7 */
 451     ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32    /* restore the low 64 bits of v8-v11 saved at entry */
 452     ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32  /* restore v12-v15; sp is back at its entry value (64 bytes released) */
 453     ret                                 /* return through x30; unlike blr (a call), ret does not overwrite the link register */
 454
 455 .balign 16
 456 2:
 457     mul             v3.8h, v3.8h, v19.8h
 458     mul             v4.8h, v4.8h, v20.8h
 459     mul             v5.8h, v5.8h, v21.8h
 460     add             TMP4, xzr, TMP2, LSL #32
 461     mul             v6.8h, v6.8h, v22.8h
 462     mul             v7.8h, v7.8h, v23.8h
 463     adds            TMP3, xzr, TMP2, LSR #32
 464     mul             v8.8h, v8.8h, v24.8h
 465     mul             v9.8h, v9.8h, v25.8h
 466     b.ne            3f
 467     /* Right AC coef is zero */
 468     dup             v15.2d, v10.d[1]
 469     /* Even part: reverse the even part of the forward DCT. */
 470     add             v18.4h, v4.4h, v8.4h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
 471     add             v22.4h, v2.4h, v6.4h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
 472     sub             v26.4h, v2.4h, v6.4h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
 473     smull           v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
 474     sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
 475     mov             v20.16b, v18.16b               /* tmp3 = z1 */
 476     sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
 477     smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
 478     smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
 479     add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
 480     sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
 481     add             v8.4s, v26.4s, v18.4s          /* tmp11l tmp11 = tmp1 + tmp2; */
 482     sub             v4.4s, v26.4s, v18.4s          /* tmp12l tmp12 = tmp1 - tmp2; */
 483
 484     /* Odd part per figure 8; the matrix is unitary and hence its
 485      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
 486      */
 487
 488     add             v22.4h, v9.4h, v5.4h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
 489     add             v24.4h, v7.4h, v3.4h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
 490     add             v18.4h, v9.4h, v3.4h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
 491     add             v20.4h, v7.4h, v5.4h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
 492     add             v26.4h, v22.4h, v24.4h  /* z5 = z3 + z4 */
 493
 494     smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
 495     smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
 496     smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
 497     smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
 498     smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
 499     smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
 500     smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
 501     smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
 502     smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
 503
 504     add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
 505     add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
 506
 507     add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
 508     add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
 509     add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
 510     add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */
 511
 512     add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
 513     add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
 514     add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
 515     add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */
 516
 517     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
 518
 519     add             v18.4s, v2.4s, v16.4s  /* tmp10 + tmp3 */
 520     sub             v20.4s, v2.4s, v16.4s  /* tmp10 - tmp3 */
 521     add             v22.4s, v8.4s, v14.4s  /* tmp11 + tmp2 */
 522     sub             v24.4s, v8.4s, v14.4s  /* tmp11 - tmp2 */
 523     add             v26.4s, v4.4s, v12.4s  /* tmp12 + tmp1 */
 524     sub             v28.4s, v4.4s, v12.4s  /* tmp12 - tmp1 */
 525     add             v14.4s, v6.4s, v10.4s  /* tmp13 + tmp0 */
 526     sub             v16.4s, v6.4s, v10.4s  /* tmp13 - tmp0 */
 527
 528     rshrn           v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
 529     rshrn           v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
 530     rshrn           v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
 531     rshrn           v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
 532     rshrn2          v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
 533     rshrn2          v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
 534     rshrn2          v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
 535     rshrn2          v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
 536     mov             v6.16b, v15.16b
 537     mov             v7.16b, v15.16b
 538     mov             v8.16b, v15.16b
 539     mov             v9.16b, v15.16b
 540     b               1b
 541
 542 .balign 16
 543 3:
 544     cbnz            TMP4, 4f
 545     /* Left AC coef is zero */
 546     dup             v14.2d, v10.d[0]
 547     /* Even part: reverse the even part of the forward DCT. */
 548     add             v18.8h, v4.8h, v8.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
 549     add             v22.8h, v2.8h, v6.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
 550     smull2          v19.4s, v18.8h, XFIX_P_0_541   /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
 551     sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
 552     sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
 553     mov             v21.16b, v19.16b               /* tmp3 = z1 */
 554     smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
 555     sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
 556     smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
 557     add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
 558     sub             v31.4s, v23.4s, v21.4s         /* tmp13h tmp13 = tmp0 - tmp3; */
 559     add             v29.4s, v27.4s, v19.4s         /* tmp11h tmp11 = tmp1 + tmp2; */
 560     sub             v30.4s, v27.4s, v19.4s         /* tmp12h tmp12 = tmp1 - tmp2; */
 561
 562     /* Odd part per figure 8; the matrix is unitary and hence its
 563      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
 564      */
 565
 566     add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
 567     add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
 568     add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
 569     add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
 570     add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */
 571
 572     smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
 573     smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
 574     smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
 575     smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
 576     smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
 577     smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
 578     smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
 579     smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
 580     smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
 581
 582     add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
 583     add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
 584     add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
 585     add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
 586
 587     add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
 588     add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
 589     add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
 590     add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
 591
 592     add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
 593     add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
 594     add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
 595     add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
 596
 597     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
 598
 599     add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
 600     sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
 601     add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
 602     sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
 603     add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
 604     sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
 605     add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
 606     sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */
 607
 608     mov             v2.16b, v14.16b
 609     mov             v3.16b, v14.16b
 610     mov             v4.16b, v14.16b
 611     mov             v5.16b, v14.16b
 612     rshrn           v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
 613     rshrn           v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
 614     rshrn           v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
 615     rshrn           v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
 616     rshrn2          v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
 617     rshrn2          v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
 618     rshrn2          v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
 619     rshrn2          v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
 620     b               1b
 621
 622 .balign 16
 623 4:
 624     /* "No" AC coef is zero */
 625     /* Even part: reverse the even part of the forward DCT. */
 626     add             v18.8h, v4.8h, v8.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
 627     add             v22.8h, v2.8h, v6.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
 628     smull2          v19.4s, v18.8h, XFIX_P_0_541   /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
 629     sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
 630     smull           v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
 631     sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
 632     mov             v21.16b, v19.16b               /* tmp3 = z1 */
 633     mov             v20.16b, v18.16b               /* tmp3 = z1 */
 634     smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
 635     smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
 636     sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
 637     smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
 638     smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
 639     sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
 640     sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
 641     add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
 642     sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
 643     add             v8.4s, v26.4s, v18.4s          /* tmp11l tmp11 = tmp1 + tmp2; */
 644     sub             v4.4s, v26.4s, v18.4s          /* tmp12l tmp12 = tmp1 - tmp2; */
 645     add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
 646     sub             v31.4s, v23.4s, v21.4s         /* tmp13h tmp13 = tmp0 - tmp3; */
 647     add             v29.4s, v27.4s, v19.4s         /* tmp11h tmp11 = tmp1 + tmp2; */
 648     sub             v30.4s, v27.4s, v19.4s         /* tmp12h tmp12 = tmp1 - tmp2; */
 649
 650     /* Odd part per figure 8; the matrix is unitary and hence its
 651      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
 652      */
 653
 654     add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
 655     add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
 656     add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
 657     add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
 658     add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */
 659
 660     smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
 661     smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
 662     smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
 663     smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
 664     smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
 665     smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
 666     smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
 667     smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
 668     smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
 669
 670     smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
 671     smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
 672     smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
 673     smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
 674     smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
 675     smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
 676     smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
 677     smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
 678     smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
 679
 680     add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
 681     add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
 682     add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
 683     add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
 684
 685     add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
 686     add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
 687     add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
 688     add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
 689     add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
 690     add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
 691     add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
 692     add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */
 693
 694     add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
 695     add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
 696     add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
 697     add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
 698     add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
 699     add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
 700     add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
 701     add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */
 702
 703     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
 704
 705     add             v18.4s, v2.4s, v16.4s   /* tmp10 + tmp3 */
 706     add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
 707     sub             v20.4s, v2.4s, v16.4s   /* tmp10 - tmp3 */
 708     sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
 709     add             v22.4s, v8.4s, v14.4s   /* tmp11 + tmp2 */
 710     add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
 711     sub             v24.4s, v8.4s, v14.4s   /* tmp11 - tmp2 */
 712     sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
 713     add             v26.4s, v4.4s, v12.4s   /* tmp12 + tmp1 */
 714     add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
 715     sub             v28.4s, v4.4s, v12.4s   /* tmp12 - tmp1 */
 716     sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
 717     add             v14.4s, v6.4s, v10.4s   /* tmp13 + tmp0 */
 718     add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
 719     sub             v16.4s, v6.4s, v10.4s   /* tmp13 - tmp0 */
 720     sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */
 721
 722     rshrn           v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
 723     rshrn           v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
 724     rshrn           v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
 725     rshrn           v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
 726     rshrn           v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
 727     rshrn           v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
 728     rshrn           v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
 729     rshrn           v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
 730     rshrn2          v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
 731     rshrn2          v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
 732     rshrn2          v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
 733     rshrn2          v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
 734     rshrn2          v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
 735     rshrn2          v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
 736     rshrn2          v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
 737     rshrn2          v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
 738     b               1b
 739
 740     .unreq          DCT_TABLE
 741     .unreq          COEF_BLOCK
 742     .unreq          OUTPUT_BUF
 743     .unreq          OUTPUT_COL
 744     .unreq          TMP1
 745     .unreq          TMP2
 746     .unreq          TMP3
 747     .unreq          TMP4
 748     .unreq          TMP5
 749     .unreq          TMP6
 750     .unreq          TMP7
 751     .unreq          TMP8
 752
 753 #undef CENTERJSAMPLE
 754 #undef CONST_BITS
 755 #undef PASS1_BITS
 756 #undef XFIX_P_0_298
 757 #undef XFIX_N_0_390
 758 #undef XFIX_P_0_541
 759 #undef XFIX_P_0_765
 760 #undef XFIX_N_0_899
 761 #undef XFIX_P_1_175
 762 #undef XFIX_P_1_501
 763 #undef XFIX_N_1_847
 764 #undef XFIX_N_1_961
 765 #undef XFIX_P_2_053
 766 #undef XFIX_N_2_562
 767 #undef XFIX_P_3_072
 768
 769
 770 /*****************************************************************************/
 771
 772 /*
 773  * jsimd_idct_ifast_neon
 774  *
 775  * This function contains a fast, not so accurate integer implementation of
 776  * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
 777  * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
 778  * function from jidctfst.c
 779  *
 780  * Normally 1-D AAN DCT needs 5 multiplications and 29 additions.
 781  * But in the ARM NEON case some extra additions are required because the
 782  * SQDMULH instruction can't handle constants larger than 1. So expressions
 783  * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
 784  * which introduces an extra addition. Overall, there are 6 extra additions
 785  * per 1-D IDCT pass, totalling 5 SQDMULH and 35 ADD/SUB instructions.
 786  */
 787
 788 #define XFIX_1_082392200 v0.h[0]  /* lanes of v0, which is loaded from the table below */
 789 #define XFIX_1_414213562 v0.h[1]
 790 #define XFIX_1_847759065 v0.h[2]
 791 #define XFIX_2_613125930 v0.h[3]
 792
 793 .balign 16
 794 Ljsimd_idct_ifast_neon_consts:
 795   .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200: (277/256 - 1) scaled by 2^15; code adds x once */
 796   .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562: (362/256 - 1) scaled by 2^15; code adds x once */
 797   .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065: (473/256 - 1) scaled by 2^15; code adds x once */
 798   .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930: (669/256 - 2) scaled by 2^15; code adds x twice */
 799
 800 asm_function jsimd_idct_ifast_neon
 801
 802     DCT_TABLE       .req x0   /* in: 64 16-bit multipliers, read as 4 x 32 bytes */
 803     COEF_BLOCK      .req x1   /* in: 64 16-bit coefficients, read as 4 x 32 bytes */
 804     OUTPUT_BUF      .req x2   /* in: array of 8 output row pointers */
 805     OUTPUT_COL      .req x3   /* in: offset added to each row pointer */
 806     TMP1            .req x0   /* row 0 pointer (x0 is free once DCT_TABLE is consumed) */
 807     TMP2            .req x1   /* row 1 pointer (x1 is free once COEF_BLOCK is consumed) */
 808     TMP3            .req x9   /* row 2 pointer */
 809     TMP4            .req x10  /* row 3 pointer */
 810     TMP5            .req x11  /* constants address first, row 4 pointer later */
 811     TMP6            .req x12  /* row 5 pointer */
 812     TMP7            .req x13  /* row 6 pointer */
 813     TMP8            .req x14  /* row 7 pointer */
 814
 815     /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
 816        guarantee that the upper (unused) 32 bits of x3 are valid.  This
 817        instruction ensures that those bits are set to zero. */
 818     uxtw x3, w3
 819
 820     /* Load and dequantize coefficients into NEON registers
 821      * with the following allocation:
 822      *       0 1 2 3 | 4 5 6 7
 823      *      ---------+--------
 824      *   0 | d16     | d17     ( v16.8h )
 825      *   1 | d18     | d19     ( v17.8h )
 826      *   2 | d20     | d21     ( v18.8h )
 827      *   3 | d22     | d23     ( v19.8h )
 828      *   4 | d24     | d25     ( v20.8h )
 829      *   5 | d26     | d27     ( v21.8h )
 830      *   6 | d28     | d29     ( v22.8h )
 831      *   7 | d30     | d31     ( v23.8h )
 832      */
 833     /* Nothing is saved: only caller-saved v0-v6, v16-v23 and v28-v31 are referenced here */
 834     adr             TMP5, Ljsimd_idct_ifast_neon_consts
 835     ld1             {v16.8h, v17.8h}, [COEF_BLOCK], 32
 836     ld1             {v0.8h, v1.8h}, [DCT_TABLE], 32
 837     ld1             {v18.8h, v19.8h}, [COEF_BLOCK], 32
 838     mul             v16.8h, v16.8h, v0.8h
 839     ld1             {v2.8h, v3.8h}, [DCT_TABLE], 32
 840     mul             v17.8h, v17.8h, v1.8h
 841     ld1             {v20.8h, v21.8h}, [COEF_BLOCK], 32
 842     mul             v18.8h, v18.8h, v2.8h
 843     ld1             {v0.8h, v1.8h}, [DCT_TABLE], 32
 844     mul             v19.8h, v19.8h, v3.8h
 845     ld1             {v22.8h, v23.8h}, [COEF_BLOCK], 32
 846     mul             v20.8h, v20.8h, v0.8h
 847     ld1             {v2.8h, v3.8h}, [DCT_TABLE], 32
 848     mul             v22.8h, v22.8h, v2.8h
 849     mul             v21.8h, v21.8h, v1.8h
 850     ld1             {v0.4h}, [TMP5]        /* load constants */
 851     mul             v23.8h, v23.8h, v3.8h
 852
 853     /* 1-D IDCT, pass 1 */
 854     sub             v2.8h, v18.8h, v22.8h
 855     add             v22.8h, v18.8h, v22.8h
 856     sub             v1.8h, v19.8h, v21.8h
 857     add             v21.8h, v19.8h, v21.8h
 858     sub             v5.8h, v17.8h, v23.8h
 859     add             v23.8h, v17.8h, v23.8h
 860     sqdmulh         v4.8h, v2.8h, XFIX_1_414213562
 861     sqdmulh         v6.8h, v1.8h, XFIX_2_613125930
 862     add             v3.8h, v1.8h, v1.8h
 863     sub             v1.8h, v5.8h, v1.8h
 864     add             v18.8h, v2.8h, v4.8h
 865     sqdmulh         v4.8h, v1.8h, XFIX_1_847759065
 866     sub             v2.8h, v23.8h, v21.8h
 867     add             v3.8h, v3.8h, v6.8h
 868     sqdmulh         v6.8h, v2.8h, XFIX_1_414213562
 869     add             v1.8h, v1.8h, v4.8h
 870     sqdmulh         v4.8h, v5.8h, XFIX_1_082392200
 871     sub             v18.8h, v18.8h, v22.8h
 872     add             v2.8h, v2.8h, v6.8h
 873     sub             v6.8h, v16.8h, v20.8h
 874     add             v20.8h, v16.8h, v20.8h
 875     add             v17.8h, v5.8h, v4.8h
 876     add             v5.8h, v6.8h, v18.8h
 877     sub             v18.8h, v6.8h, v18.8h
 878     add             v6.8h, v23.8h, v21.8h
 879     add             v16.8h, v20.8h, v22.8h
 880     sub             v3.8h, v6.8h, v3.8h
 881     sub             v20.8h, v20.8h, v22.8h
 882     sub             v3.8h, v3.8h, v1.8h
 883     sub             v1.8h, v17.8h, v1.8h
 884     add             v2.8h, v3.8h, v2.8h
 885     sub             v23.8h, v16.8h, v6.8h
 886     add             v1.8h, v1.8h, v2.8h
 887     add             v16.8h, v16.8h, v6.8h
 888     add             v22.8h, v5.8h, v3.8h
 889     sub             v17.8h, v5.8h, v3.8h
 890     sub             v21.8h, v18.8h, v2.8h
 891     add             v18.8h, v18.8h, v2.8h
 892     sub             v19.8h, v20.8h, v1.8h
 893     add             v20.8h, v20.8h, v1.8h
 894     transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v28, v29, v30, v31
 895     /* 1-D IDCT, pass 2 */
 896     sub             v2.8h, v18.8h, v22.8h
 897     add             v22.8h, v18.8h, v22.8h
 898     sub             v1.8h, v19.8h, v21.8h
 899     add             v21.8h, v19.8h, v21.8h
 900     sub             v5.8h, v17.8h, v23.8h
 901     add             v23.8h, v17.8h, v23.8h
 902     sqdmulh         v4.8h, v2.8h, XFIX_1_414213562
 903     sqdmulh         v6.8h, v1.8h, XFIX_2_613125930
 904     add             v3.8h, v1.8h, v1.8h
 905     sub             v1.8h, v5.8h, v1.8h
 906     add             v18.8h, v2.8h, v4.8h
 907     sqdmulh         v4.8h, v1.8h, XFIX_1_847759065
 908     sub             v2.8h, v23.8h, v21.8h
 909     add             v3.8h, v3.8h, v6.8h
 910     sqdmulh         v6.8h, v2.8h, XFIX_1_414213562
 911     add             v1.8h, v1.8h, v4.8h
 912     sqdmulh         v4.8h, v5.8h, XFIX_1_082392200
 913     sub             v18.8h, v18.8h, v22.8h
 914     add             v2.8h, v2.8h, v6.8h
 915     sub             v6.8h, v16.8h, v20.8h
 916     add             v20.8h, v16.8h, v20.8h
 917     add             v17.8h, v5.8h, v4.8h
 918     add             v5.8h, v6.8h, v18.8h
 919     sub             v18.8h, v6.8h, v18.8h
 920     add             v6.8h, v23.8h, v21.8h
 921     add             v16.8h, v20.8h, v22.8h
 922     sub             v3.8h, v6.8h, v3.8h
 923     sub             v20.8h, v20.8h, v22.8h
 924     sub             v3.8h, v3.8h, v1.8h
 925     sub             v1.8h, v17.8h, v1.8h
 926     add             v2.8h, v3.8h, v2.8h
 927     sub             v23.8h, v16.8h, v6.8h
 928     add             v1.8h, v1.8h, v2.8h
 929     add             v16.8h, v16.8h, v6.8h
 930     add             v22.8h, v5.8h, v3.8h
 931     sub             v17.8h, v5.8h, v3.8h
 932     sub             v21.8h, v18.8h, v2.8h
 933     add             v18.8h, v18.8h, v2.8h
 934     sub             v19.8h, v20.8h, v1.8h
 935     add             v20.8h, v20.8h, v1.8h
 936     /* Descale to 8-bit and range limit */
 937     movi            v0.16b, #0x80          /* bias added below: maps signed bytes -128..127 onto 0..255 */
 938       /* Prepare pointers (dual-issue with NEON instructions) */
 939       ldp             TMP1, TMP2, [OUTPUT_BUF], 16
 940     sqshrn          v28.8b, v16.8h, #5
 941       ldp             TMP3, TMP4, [OUTPUT_BUF], 16
 942     sqshrn          v29.8b, v17.8h, #5
 943       add             TMP1, TMP1, OUTPUT_COL
 944     sqshrn          v30.8b, v18.8h, #5
 945       add             TMP2, TMP2, OUTPUT_COL
 946     sqshrn          v31.8b, v19.8h, #5
 947       add             TMP3, TMP3, OUTPUT_COL
 948     sqshrn2         v28.16b, v20.8h, #5
 949       add             TMP4, TMP4, OUTPUT_COL
 950     sqshrn2         v29.16b, v21.8h, #5
 951       ldp             TMP5, TMP6, [OUTPUT_BUF], 16
 952     sqshrn2         v30.16b, v22.8h, #5
 953       ldp             TMP7, TMP8, [OUTPUT_BUF], 16
 954     sqshrn2         v31.16b, v23.8h, #5
 955       add             TMP5, TMP5, OUTPUT_COL
 956     add             v16.16b, v28.16b, v0.16b
 957       add             TMP6, TMP6, OUTPUT_COL
 958     add             v18.16b, v29.16b, v0.16b
 959       add             TMP7, TMP7, OUTPUT_COL
 960     add             v20.16b, v30.16b, v0.16b
 961       add             TMP8, TMP8, OUTPUT_COL
 962     add             v22.16b, v31.16b, v0.16b
 963
 964     /* Transpose the final 8-bit samples */
 965     trn1            v28.16b, v16.16b, v18.16b
 966     trn1            v30.16b, v20.16b, v22.16b
 967     trn2            v29.16b, v16.16b, v18.16b
 968     trn2            v31.16b, v20.16b, v22.16b
 969
 970     trn1            v16.8h, v28.8h, v30.8h
 971     trn2            v18.8h, v28.8h, v30.8h
 972     trn1            v20.8h, v29.8h, v31.8h
 973     trn2            v22.8h, v29.8h, v31.8h
 974
 975     uzp1            v28.4s, v16.4s, v18.4s
 976     uzp2            v30.4s, v16.4s, v18.4s
 977     uzp1            v29.4s, v20.4s, v22.4s
 978     uzp2            v31.4s, v20.4s, v22.4s
 979
 980     /* Store results to the output buffer */
 981     st1             {v28.d}[0], [TMP1]
 982     st1             {v29.d}[0], [TMP2]
 983     st1             {v28.d}[1], [TMP3]
 984     st1             {v29.d}[1], [TMP4]
 985     st1             {v30.d}[0], [TMP5]
 986     st1             {v31.d}[0], [TMP6]
 987     st1             {v30.d}[1], [TMP7]
 988     st1             {v31.d}[1], [TMP8]
 989     ret                                    /* return to the caller through x30, leaving x30 intact */
 990
 991     .unreq          DCT_TABLE
 992     .unreq          COEF_BLOCK
 993     .unreq          OUTPUT_BUF
 994     .unreq          OUTPUT_COL
 995     .unreq          TMP1
 996     .unreq          TMP2
 997     .unreq          TMP3
 998     .unreq          TMP4
 999     .unreq          TMP5
1000     .unreq          TMP6
1001     .unreq          TMP7
1002     .unreq          TMP8
1003
1004
1005 /*****************************************************************************/
1006
1007 /*
1008  * jsimd_idct_4x4_neon
1009  *
1010  * This function contains inverse-DCT code for getting reduced-size
1011  * 4x4 pixels output from an 8x8 DCT block. It uses the same  calculations
1012  * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
1013  * function from jpeg-6b (jidctred.c).
1014  *
1015  * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which
1016  *       requires far fewer arithmetic operations and hence should be faster.
1017  *       The primary purpose of this particular NEON optimized function is
1018  *       bit exact compatibility with jpeg-6b.
1019  *
1020  * TODO: slightly better instruction scheduling can be achieved by expanding
1021  *       idct_helper/transpose_4x4 macros and reordering instructions,
1022  *       but readability will suffer somewhat.
1023  */
1024
1025 #define CONST_BITS  13  /* fraction bits: FIX(x) below is x * 2^13, rounded */
1026
1027 #define FIX_0_211164243 (1730)   /* FIX(0.211164243) */
1028 #define FIX_0_509795579 (4176)   /* FIX(0.509795579) */
1029 #define FIX_0_601344887 (4926)   /* FIX(0.601344887) */
1030 #define FIX_0_720959822 (5906)   /* FIX(0.720959822) */
1031 #define FIX_0_765366865 (6270)   /* FIX(0.765366865) */
1032 #define FIX_0_850430095 (6967)   /* FIX(0.850430095) */
1033 #define FIX_0_899976223 (7373)   /* FIX(0.899976223) */
1034 #define FIX_1_061594337 (8697)   /* FIX(1.061594337) */
1035 #define FIX_1_272758580 (10426)  /* FIX(1.272758580) */
1036 #define FIX_1_451774981 (11893)  /* FIX(1.451774981) */
1037 #define FIX_1_847759065 (15137)  /* FIX(1.847759065) */
1038 #define FIX_2_172734803 (17799)  /* FIX(2.172734803) */
1039 #define FIX_2_562915447 (20995)  /* FIX(2.562915447) */
1040 #define FIX_3_624509785 (29692)  /* FIX(3.624509785) */
1041
1042 .balign 16
1043 Ljsimd_idct_4x4_neon_consts:
1044   .short FIX_1_847759065      /* v0.h[0] */
1045   .short -FIX_0_765366865     /* v0.h[1] */
1046   .short -FIX_0_211164243     /* v0.h[2] */
1047   .short FIX_1_451774981      /* v0.h[3] */
1048   .short -FIX_2_172734803     /* v1.h[0] */
1049   .short FIX_1_061594337      /* v1.h[1] */
1050   .short -FIX_0_509795579     /* v1.h[2] */
1051   .short -FIX_0_601344887     /* v1.h[3] */
1052   .short FIX_0_899976223      /* v2.h[0] */
1053   .short FIX_2_562915447      /* v2.h[1] */
1054   .short 1 << (CONST_BITS+1)  /* v2.h[2] */
1055   .short 0                    /* v2.h[3] */
1056
1057 .macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29  /* one 1-D pass; v0-v2 must hold Ljsimd_idct_4x4_neon_consts; clobbers v20, v24, v26, v28, v30 */
1058     smull           v28.4s, \x4, v2.h[2]   /* v28 = x4 * (1 << (CONST_BITS+1)) */
1059     smlal           v28.4s, \x8, v0.h[0]   /*     + x8 * FIX_1_847759065 */
1060     smlal           v28.4s, \x14, v0.h[1]  /*     - x14 * FIX_0_765366865 */
1061
1062     smull           v26.4s, \x16, v1.h[2]  /* v26 = - x16 * FIX_0_509795579 */
1063     smlal           v26.4s, \x12, v1.h[3]  /*     - x12 * FIX_0_601344887 */
1064     smlal           v26.4s, \x10, v2.h[0]  /*     + x10 * FIX_0_899976223 */
1065     smlal           v26.4s, \x6, v2.h[1]   /*     + x6 * FIX_2_562915447 */
1066
1067     smull           v30.4s, \x4, v2.h[2]   /* v30 = x4 * (1 << (CONST_BITS+1)) */
1068     smlsl           v30.4s, \x8, v0.h[0]   /*     - x8 * FIX_1_847759065 */
1069     smlsl           v30.4s, \x14, v0.h[1]  /*     + x14 * FIX_0_765366865 */
1070
1071     smull           v24.4s, \x16, v0.h[2]  /* v24 = - x16 * FIX_0_211164243 */
1072     smlal           v24.4s, \x12, v0.h[3]  /*     + x12 * FIX_1_451774981 */
1073     smlal           v24.4s, \x10, v1.h[0]  /*     - x10 * FIX_2_172734803 */
1074     smlal           v24.4s, \x6, v1.h[1]   /*     + x6 * FIX_1_061594337 */
1075
1076     add             v20.4s, v28.4s, v26.4s  /* becomes y26 */
1077     sub             v28.4s, v28.4s, v26.4s  /* becomes y29 */
1078
1079   .if \shift > 16  /* rshrn to 16-bit lanes cannot encode a shift above 16: round-shift first, then narrow */
1080     srshr           v20.4s, v20.4s, #\shift
1081     srshr           v28.4s, v28.4s, #\shift
1082     xtn             \y26, v20.4s  /* first output write; every input was read above, so outputs may alias inputs */
1083     xtn             \y29, v28.4s
1084   .else
1085     rshrn           \y26, v20.4s, #\shift  /* first output write; every input was read above, so outputs may alias inputs */
1086     rshrn           \y29, v28.4s, #\shift
1087   .endif
1088
1089     add             v20.4s, v30.4s, v24.4s  /* becomes y27 */
1090     sub             v30.4s, v30.4s, v24.4s  /* becomes y28 */
1091
1092   .if \shift > 16  /* same split as above for shifts rshrn cannot encode */
1093     srshr           v20.4s, v20.4s, #\shift
1094     srshr           v30.4s, v30.4s, #\shift
1095     xtn             \y27, v20.4s
1096     xtn             \y28, v30.4s
1097   .else
1098     rshrn           \y27, v20.4s, #\shift
1099     rshrn           \y28, v30.4s, #\shift
1100   .endif
1101 .endm
1102
1103 asm_function jsimd_idct_4x4_neon
     /*
      * Reduced-size inverse DCT: dequantize an 8x8 block of 16-bit
      * coefficients and write a 4x4 block of 8-bit samples.
      *
      * In:  x0 = DCT_TABLE   16-bit dequantization multipliers, 8 per row
      *      x1 = COEF_BLOCK  16-bit coefficients, 8 per row
      *      x2 = OUTPUT_BUF  array of row pointers (entries 0..3 are read)
      *      x3 = OUTPUT_COL  column offset added to each row pointer
      * Out: four bytes stored at each of OUTPUT_BUF[0..3] + OUTPUT_COL
      *
      * x9 and x15 are used as scratch and the argument registers are
      * overwritten.  d8-d15 are spilled to a 64-byte stack frame on entry
      * and reloaded before returning.
      */
1104
1105     DCT_TABLE       .req x0
1106     COEF_BLOCK      .req x1
1107     OUTPUT_BUF      .req x2
1108     OUTPUT_COL      .req x3
1109     TMP1            .req x0  /* TMP1-TMP3 alias the argument registers;  */
1110     TMP2            .req x1  /* they only become row pointers after the  */
1111     TMP3            .req x2  /* table and coefficients have been read    */
1112     TMP4            .req x15
1113
1114     /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
1115        guarantee that the upper (unused) 32 bits of x3 are valid.  This
1116        instruction ensures that those bits are set to zero. */
1117     uxtw x3, w3
1118
1119     /* Save d8-d15 (the low halves of v8-v15) in a 64-byte stack frame */
1120     sub             sp, sp, 64
1121     mov             x9, sp
1122     /* Load constants (v3.4h is just used for padding) */
1123     adr             TMP4, Ljsimd_idct_4x4_neon_consts
1124     st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
1125     st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
1126     ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]
1127
1128     /* Load all COEF_BLOCK into NEON registers with the following allocation:
1129      *       0 1 2 3 | 4 5 6 7
1130      *      ---------+--------
1131      *   0 | v4.4h   | v5.4h
1132      *   1 | v6.4h   | v7.4h
1133      *   2 | v8.4h   | v9.4h
1134      *   3 | v10.4h  | v11.4h
1135      *   4 | -       | -
1136      *   5 | v12.4h  | v13.4h
1137      *   6 | v14.4h  | v15.4h
1138      *   7 | v16.4h  | v17.4h
1139      */
1140     ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
1141     ld1             {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32
1142     add             COEF_BLOCK, COEF_BLOCK, #16  /* skip row 4 (not loaded) */
1143     ld1             {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32
1144     ld1             {v16.4h, v17.4h}, [COEF_BLOCK], 16
1145     /* Dequantize: multiply every loaded row by the matching DCT_TABLE row.
             The table loads are interleaved with the multiplies. */
1146     ld1             {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
1147     mul             v4.4h, v4.4h, v18.4h
1148     mul             v5.4h, v5.4h, v19.4h
1149     ins             v4.d[1], v5.d[0]              /* 128 bit q4 */
1150     ld1             {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32
1151     mul             v6.4h, v6.4h, v20.4h
1152     mul             v7.4h, v7.4h, v21.4h
1153     ins             v6.d[1], v7.d[0]              /* 128 bit q6 */
1154     mul             v8.4h, v8.4h, v22.4h
1155     mul             v9.4h, v9.4h, v23.4h
1156     ins             v8.d[1], v9.d[0]              /* 128 bit q8 */
1157     add             DCT_TABLE, DCT_TABLE, #16     /* skip table row 4 */
1158     ld1             {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32
1159     mul             v10.4h, v10.4h, v24.4h
1160     mul             v11.4h, v11.4h, v25.4h
1161     ins             v10.d[1], v11.d[0]            /* 128 bit q10 */
1162     mul             v12.4h, v12.4h, v26.4h
1163     mul             v13.4h, v13.4h, v27.4h
1164     ins             v12.d[1], v13.d[0]            /* 128 bit q12 */
1165     ld1             {v30.4h, v31.4h}, [DCT_TABLE], 16
1166     mul             v14.4h, v14.4h, v28.4h
1167     mul             v15.4h, v15.4h, v29.4h
1168     ins             v14.d[1], v15.d[0]            /* 128 bit q14 */
1169     mul             v16.4h, v16.4h, v30.4h
1170     mul             v17.4h, v17.4h, v31.4h
1171     ins             v16.d[1], v17.d[0]            /* 128 bit q16 */
1172
1173     /* Pass 1: idct_helper with shift 12 on columns 0-3 (v4..v16) and then
             on columns 4-7 (v5..v17); each 4x4 result is transposed in place */
1174     idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, \
1175                     v4.4h, v6.4h, v8.4h, v10.4h
1176     transpose_4x4   v4, v6, v8, v10, v3
1177     ins             v10.d[1], v11.d[0]
1178     idct_helper     v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, \
1179                     v5.4h, v7.4h, v9.4h, v11.4h
1180     transpose_4x4   v5, v7, v9, v11, v3
1181     ins             v10.d[1], v11.d[0]
1182
1183     /* Pass 2: idct_helper with shift 19 on the pass 1 results; the output
             goes to v26-v29 and is transposed again */
1184     idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, \
1185                     v26.4h, v27.4h, v28.4h, v29.4h
1186     transpose_4x4   v26, v27, v28, v29, v3
1187
1188     /* Range limit: add 0x80 to every 16-bit result, then narrow with
             unsigned saturation to bytes.  Afterwards v26.8b holds output rows
             0 and 1 and v27.8b holds rows 2 and 3 (four bytes each). */
1189     movi            v30.8h, #0x80
1190     ins             v26.d[1], v27.d[0]
1191     ins             v28.d[1], v29.d[0]
1192     add             v26.8h, v26.8h, v30.8h
1193     add             v28.8h, v28.8h, v30.8h
1194     sqxtun          v26.8b, v26.8h
1195     sqxtun          v27.8b, v28.8h
1196
1197     /* Store results to the output buffer.  The second ldp overwrites
             OUTPUT_BUF itself (TMP3 is x2), which is not needed afterwards. */
1198     ldp             TMP1, TMP2, [OUTPUT_BUF], 16
1199     ldp             TMP3, TMP4, [OUTPUT_BUF]
1200     add             TMP1, TMP1, OUTPUT_COL
1201     add             TMP2, TMP2, OUTPUT_COL
1202     add             TMP3, TMP3, OUTPUT_COL
1203     add             TMP4, TMP4, OUTPUT_COL
1204
1205 #if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
1206     /* We can use far fewer instructions on little endian systems if the
1207      * OS kernel is not configured to trap unaligned memory accesses
1208      */
1209     st1             {v26.s}[0], [TMP1], 4
1210     st1             {v27.s}[0], [TMP3], 4
1211     st1             {v26.s}[1], [TMP2], 4
1212     st1             {v27.s}[1], [TMP4], 4
1213 #else
         /* Byte-wise stores (the path selected while RESPECT_STRICT_ALIGNMENT
            is defined as 1): rows 0 and 2 first, then rows 1 and 3 */
1214     st1             {v26.b}[0], [TMP1], 1
1215     st1             {v27.b}[0], [TMP3], 1
1216     st1             {v26.b}[1], [TMP1], 1
1217     st1             {v27.b}[1], [TMP3], 1
1218     st1             {v26.b}[2], [TMP1], 1
1219     st1             {v27.b}[2], [TMP3], 1
1220     st1             {v26.b}[3], [TMP1], 1
1221     st1             {v27.b}[3], [TMP3], 1
1222
1223     st1             {v26.b}[4], [TMP2], 1
1224     st1             {v27.b}[4], [TMP4], 1
1225     st1             {v26.b}[5], [TMP2], 1
1226     st1             {v27.b}[5], [TMP4], 1
1227     st1             {v26.b}[6], [TMP2], 1
1228     st1             {v27.b}[6], [TMP4], 1
1229     st1             {v26.b}[7], [TMP2], 1
1230     st1             {v27.b}[7], [TMP4], 1
1231 #endif
1232
1233     /* vpop            {v8.4h - v15.4h}    ;not available */
1234     ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
1235     ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
         /* Return with RET (implicit x30).  A BLR here would also overwrite
            x30 and is hinted as a subroutine call, not a subroutine return. */
1236     ret
1237
1238     .unreq          DCT_TABLE
1239     .unreq          COEF_BLOCK
1240     .unreq          OUTPUT_BUF
1241     .unreq          OUTPUT_COL
1242     .unreq          TMP1
1243     .unreq          TMP2
1244     .unreq          TMP3
1245     .unreq          TMP4
1246
1247 .purgem idct_helper
1248
1249
1250 /*****************************************************************************/
1251
1252 /*
1253  * jsimd_idct_2x2_neon
1254  *
1255  * This function contains inverse-DCT code for getting reduced-size
1256  * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations
1257  * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
1258  * function from jpeg-6b (jidctred.c).
1259  *
1260  * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which
1261  *       requires far fewer arithmetic operations and hence should be faster.
1262  *       The primary purpose of this particular NEON optimized function is
1263  *       bit exact compatibility with jpeg-6b.
1264  */
1265
1266 .balign 8
     /*
      * Four 16-bit multipliers for the 2x2 inverse DCT.  jsimd_idct_2x2_neon
      * loads them into v14.4h, and idct_helper multiplies its four odd-index
      * inputs by lanes 3, 2, 1 and 0 respectively.  The FIX_* values are
      * defined elsewhere in this file.
      */
1267 Ljsimd_idct_2x2_neon_consts:
1268   .short -FIX_0_720959822  /* v14.h[0]: applied to input 7 */
1269   .short FIX_0_850430095   /* v14.h[1]: applied to input 5 */
1270   .short -FIX_1_272758580  /* v14.h[2]: applied to input 3 */
1271   .short FIX_3_624509785   /* v14.h[3]: applied to input 1 */
1272
1273 .macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
     /*
      * One 1-D pass of the 2x2 inverse DCT on four lanes at once.
      *
      *   even = x4 << 15
      *   odd  = x6 * v14.h[3] + x10 * v14.h[2] + x12 * v14.h[1] + x16 * v14.h[0]
      *   y26  = (even + odd) >> shift     (rounded, narrowed to 16 bits)
      *   y27  = (even - odd) >> shift     (rounded, narrowed to 16 bits)
      *
      * Expects the constants in v14.4h.  Overwrites v15, v20 and v26.
      */
1274     sshll           v15.4s, \x4, #15
1275     smull           v26.4s, \x6, v14.h[3]
1276     smlal           v26.4s, \x10, v14.h[2]
1277     smlal           v26.4s, \x12, v14.h[1]
1278     smlal           v26.4s, \x16, v14.h[0]
1279
1280     add             v20.4s, v15.4s, v26.4s
1281     sub             v15.4s, v15.4s, v26.4s
1282
         /* rshrn can only shift by 1..16 when narrowing to 16-bit lanes, so
            larger shifts are done as a 32-bit rounding shift plus a narrow */
1283   .if \shift > 16
1284     srshr           v20.4s, v20.4s, #\shift
1285     srshr           v15.4s, v15.4s, #\shift
1286     xtn             \y26, v20.4s
1287     xtn             \y27, v15.4s
1288   .else
1289     rshrn           \y26, v20.4s, #\shift
1290     rshrn           \y27, v15.4s, #\shift
1291   .endif
1292 .endm
1293
1294 asm_function jsimd_idct_2x2_neon
     /*
      * Reduced-size inverse DCT: dequantize rows 0, 1, 3, 5 and 7 of an 8x8
      * block of 16-bit coefficients and write a 2x2 block of 8-bit samples.
      *
      * In:  x0 = DCT_TABLE   16-bit dequantization multipliers, 8 per row
      *      x1 = COEF_BLOCK  16-bit coefficients, 8 per row
      *      x2 = OUTPUT_BUF  array of row pointers (entries 0 and 1 are read)
      *      x3 = OUTPUT_COL  column offset added to each row pointer
      * Out: two bytes stored at each of OUTPUT_BUF[0..1] + OUTPUT_COL
      *
      * x9 and x15 are used as scratch and x0, x1 and x3 are overwritten.
      * d8-d15 are spilled to a 64-byte stack frame on entry and reloaded
      * before returning.
      */
1295
1296     DCT_TABLE       .req x0
1297     COEF_BLOCK      .req x1
1298     OUTPUT_BUF      .req x2
1299     OUTPUT_COL      .req x3
1300     TMP1            .req x0  /* aliases DCT_TABLE; used after the table is read */
1301     TMP2            .req x15
1302
1303     /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
1304        guarantee that the upper (unused) 32 bits of x3 are valid.  This
1305        instruction ensures that those bits are set to zero. */
1306     uxtw x3, w3
1307
1308     /* vpush           {v8.4h - v15.4h}            ; not available */
1309     sub             sp, sp, 64
1310     mov             x9, sp
1311
1312     /* Load constants (v14 is saved to the stack before it is overwritten) */
1313     adr             TMP2, Ljsimd_idct_2x2_neon_consts
1314     st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
1315     st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
1316     ld1             {v14.4h}, [TMP2]
1317
1318     /* Load all COEF_BLOCK into NEON registers with the following allocation:
1319      *       0 1 2 3 | 4 5 6 7
1320      *      ---------+--------
1321      *   0 | v4.4h   | v5.4h
1322      *   1 | v6.4h   | v7.4h
1323      *   2 | -       | -
1324      *   3 | v10.4h  | v11.4h
1325      *   4 | -       | -
1326      *   5 | v12.4h  | v13.4h
1327      *   6 | -       | -
1328      *   7 | v16.4h  | v17.4h
1329      */
1330     ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
1331     add             COEF_BLOCK, COEF_BLOCK, #16   /* skip row 2 */
1332     ld1             {v10.4h, v11.4h}, [COEF_BLOCK], 16
1333     add             COEF_BLOCK, COEF_BLOCK, #16   /* skip row 4 */
1334     ld1             {v12.4h, v13.4h}, [COEF_BLOCK], 16
1335     add             COEF_BLOCK, COEF_BLOCK, #16   /* skip row 6 */
1336     ld1             {v16.4h, v17.4h}, [COEF_BLOCK], 16
1337     /* Dequantize: the table is walked with the same row skips */
1338     ld1             {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
1339     mul             v4.4h, v4.4h, v18.4h
1340     mul             v5.4h, v5.4h, v19.4h
1341     ins             v4.d[1], v5.d[0]
1342     mul             v6.4h, v6.4h, v20.4h
1343     mul             v7.4h, v7.4h, v21.4h
1344     ins             v6.d[1], v7.d[0]
1345     add             DCT_TABLE, DCT_TABLE, #16
1346     ld1             {v24.4h, v25.4h}, [DCT_TABLE], 16
1347     mul             v10.4h, v10.4h, v24.4h
1348     mul             v11.4h, v11.4h, v25.4h
1349     ins             v10.d[1], v11.d[0]
1350     add             DCT_TABLE, DCT_TABLE, #16
1351     ld1             {v26.4h, v27.4h}, [DCT_TABLE], 16
1352     mul             v12.4h, v12.4h, v26.4h
1353     mul             v13.4h, v13.4h, v27.4h
1354     ins             v12.d[1], v13.d[0]
1355     add             DCT_TABLE, DCT_TABLE, #16
1356     ld1             {v30.4h, v31.4h}, [DCT_TABLE], 16
1357     mul             v16.4h, v16.4h, v30.4h
1358     mul             v17.4h, v17.4h, v31.4h
1359     ins             v16.d[1], v17.d[0]
1360
1361     /* Pass 1 (the #if 0 branch is the disabled macro-based form) */
1362 #if 0
1363     idct_helper     v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h
1364     transpose_4x4   v4.4h, v6.4h, v8.4h, v10.4h
1365     idct_helper     v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
1366     transpose_4x4   v5.4h, v7.4h, v9.4h, v11.4h
1367 #else
         /* Weighted sums of rows 1, 3, 5 and 7: v26 for columns 0-3 and
            v24 for columns 4-7 */
1368     smull           v26.4s, v6.4h, v14.h[3]
1369     smlal           v26.4s, v10.4h, v14.h[2]
1370     smlal           v26.4s, v12.4h, v14.h[1]
1371     smlal           v26.4s, v16.4h, v14.h[0]
1372     smull           v24.4s, v7.4h, v14.h[3]
1373     smlal           v24.4s, v11.4h, v14.h[2]
1374     smlal           v24.4s, v13.4h, v14.h[1]
1375     smlal           v24.4s, v17.4h, v14.h[0]
         /* Row 0 scaled by 2^15, then sum and difference rounded down by 13 */
1376     sshll           v15.4s, v4.4h, #15
1377     sshll           v30.4s, v5.4h, #15
1378     add             v20.4s, v15.4s, v26.4s
1379     sub             v15.4s, v15.4s, v26.4s
1380     rshrn           v4.4h, v20.4s, #13
1381     rshrn           v6.4h, v15.4s, #13
1382     add             v20.4s, v30.4s, v24.4s
1383     sub             v15.4s, v30.4s, v24.4s
1384     rshrn           v5.4h, v20.4s, #13
1385     rshrn           v7.4h, v15.4s, #13
1386     ins             v4.d[1], v5.d[0]
1387     ins             v6.d[1], v7.d[0]
1388     transpose       v4, v6, v3, .16b, .8h
1389     transpose       v6, v10, v3, .16b, .4s
1390     ins             v11.d[0], v10.d[1]
1391     ins             v7.d[0], v6.d[1]
1392 #endif
1393
1394     /* Pass 2: sums land in v26.4h, differences in v27.4h */
1395     idct_helper     v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h
1396
1397     /* Range limit: add 0x80 and narrow with unsigned saturation.  The first
             narrow leaves the clamped sums in v26.b[0..3]; the second narrow
             leaves the clamped differences in v27.b[4..7]. */
1398     movi            v30.8h, #0x80
1399     ins             v26.d[1], v27.d[0]
1400     add             v26.8h, v26.8h, v30.8h
1401     sqxtun          v30.8b, v26.8h
1402     ins             v26.d[0], v30.d[0]
1403     sqxtun          v27.8b, v26.8h
1404
1405     /* Store results to the output buffer */
1406     ldp             TMP1, TMP2, [OUTPUT_BUF]
1407     add             TMP1, TMP1, OUTPUT_COL
1408     add             TMP2, TMP2, OUTPUT_COL
1409
1410     st1             {v26.b}[0], [TMP1], 1
1411     st1             {v27.b}[4], [TMP1], 1
1412     st1             {v26.b}[1], [TMP2], 1
1413     st1             {v27.b}[5], [TMP2], 1
1414
1415     ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
1416     ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
         /* Return with RET (implicit x30).  A BLR here would also overwrite
            x30 and is hinted as a subroutine call, not a subroutine return. */
1417     ret
1418
1419     .unreq          DCT_TABLE
1420     .unreq          COEF_BLOCK
1421     .unreq          OUTPUT_BUF
1422     .unreq          OUTPUT_COL
1423     .unreq          TMP1
1424     .unreq          TMP2
1425
1426 .purgem idct_helper
1427
1428
1429 /*****************************************************************************/
1430
1431 /*
1432  * jsimd_ycc_extrgb_convert_neon
1433  * jsimd_ycc_extbgr_convert_neon
1434  * jsimd_ycc_extrgbx_convert_neon
1435  * jsimd_ycc_extbgrx_convert_neon
1436  * jsimd_ycc_extxbgr_convert_neon
1437  * jsimd_ycc_extxrgb_convert_neon
1438  *
1439  * Colorspace conversion YCbCr -> RGB
1440  */
1441
1442 .macro do_load size
     /*
      * Load "size" (8, 4, 2 or 1) bytes from each of the U, V and Y pointers
      * into v4, v5 and v0, post-incrementing the pointers.  The 4-, 2- and
      * 1-byte forms fill byte lanes 0-3, 4-5 and 6 respectively, so they can
      * be combined to gather a tail of up to seven pixels into the same
      * registers.  Any other size is an assembly-time error.
      * U, V and Y are register aliases set up by the code that expands this
      * macro.
      */
1443   .if \size == 8
1444     ld1             {v4.8b}, [U], 8
1445     ld1             {v5.8b}, [V], 8
1446     ld1             {v0.8b}, [Y], 8
         /* Prefetch 64 bytes past the updated pointers */
1447     prfm            pldl1keep, [U, #64]
1448     prfm            pldl1keep, [V, #64]
1449     prfm            pldl1keep, [Y, #64]
1450   .elseif \size == 4
1451     ld1             {v4.b}[0], [U], 1
1452     ld1             {v4.b}[1], [U], 1
1453     ld1             {v4.b}[2], [U], 1
1454     ld1             {v4.b}[3], [U], 1
1455     ld1             {v5.b}[0], [V], 1
1456     ld1             {v5.b}[1], [V], 1
1457     ld1             {v5.b}[2], [V], 1
1458     ld1             {v5.b}[3], [V], 1
1459     ld1             {v0.b}[0], [Y], 1
1460     ld1             {v0.b}[1], [Y], 1
1461     ld1             {v0.b}[2], [Y], 1
1462     ld1             {v0.b}[3], [Y], 1
1463   .elseif \size == 2
1464     ld1             {v4.b}[4], [U], 1
1465     ld1             {v4.b}[5], [U], 1
1466     ld1             {v5.b}[4], [V], 1
1467     ld1             {v5.b}[5], [V], 1
1468     ld1             {v0.b}[4], [Y], 1
1469     ld1             {v0.b}[5], [Y], 1
1470   .elseif \size == 1
1471     ld1             {v4.b}[6], [U], 1
1472     ld1             {v5.b}[6], [V], 1
1473     ld1             {v0.b}[6], [Y], 1
1474   .else
1475     .error unsupported macroblock size
1476   .endif
1477 .endm
1478
1479 .macro do_store bpp, size, fast_st3
     /*
      * Store "size" (8, 4, 2 or 1) converted pixels through the RGB pointer,
      * post-incrementing it.
      *
      *   bpp == 24: bytes interleaved from v10, v11, v12
      *   bpp == 32: bytes interleaved from v10, v11, v12, v13
      *   bpp == 16: 16-bit pixels taken from v25
      *
      * The 4-, 2- and 1-pixel forms read lanes 0-3, 4-5 and 6, matching the
      * lanes filled by the partial loads.  fast_st3 only affects the 24 bpp,
      * 8-pixel case: when it is not 1 the single st3 is replaced by 24 byte
      * stores that write the same bytes in the same order (presumably for
      * cores where st3 is slow - confirm against the dispatch code).
      * An unsupported bpp or size is an assembly-time error.
      */
1480   .if \bpp == 24
1481     .if \size == 8
1482       .if \fast_st3 == 1
1483         st3         {v10.8b, v11.8b, v12.8b}, [RGB], 24
1484       .else
1485         st1         {v10.b}[0], [RGB], #1
1486         st1         {v11.b}[0], [RGB], #1
1487         st1         {v12.b}[0], [RGB], #1
1488
1489         st1         {v10.b}[1], [RGB], #1
1490         st1         {v11.b}[1], [RGB], #1
1491         st1         {v12.b}[1], [RGB], #1
1492
1493         st1         {v10.b}[2], [RGB], #1
1494         st1         {v11.b}[2], [RGB], #1
1495         st1         {v12.b}[2], [RGB], #1
1496
1497         st1         {v10.b}[3], [RGB], #1
1498         st1         {v11.b}[3], [RGB], #1
1499         st1         {v12.b}[3], [RGB], #1
1500
1501         st1         {v10.b}[4], [RGB], #1
1502         st1         {v11.b}[4], [RGB], #1
1503         st1         {v12.b}[4], [RGB], #1
1504
1505         st1         {v10.b}[5], [RGB], #1
1506         st1         {v11.b}[5], [RGB], #1
1507         st1         {v12.b}[5], [RGB], #1
1508
1509         st1         {v10.b}[6], [RGB], #1
1510         st1         {v11.b}[6], [RGB], #1
1511         st1         {v12.b}[6], [RGB], #1
1512
1513         st1         {v10.b}[7], [RGB], #1
1514         st1         {v11.b}[7], [RGB], #1
1515         st1         {v12.b}[7], [RGB], #1
1516       .endif
1517     .elseif \size == 4
1518       st3           {v10.b, v11.b, v12.b}[0], [RGB], 3
1519       st3           {v10.b, v11.b, v12.b}[1], [RGB], 3
1520       st3           {v10.b, v11.b, v12.b}[2], [RGB], 3
1521       st3           {v10.b, v11.b, v12.b}[3], [RGB], 3
1522     .elseif \size == 2
1523       st3           {v10.b, v11.b, v12.b}[4], [RGB], 3
1524       st3           {v10.b, v11.b, v12.b}[5], [RGB], 3
1525     .elseif \size == 1
1526       st3           {v10.b, v11.b, v12.b}[6], [RGB], 3
1527     .else
1528      .error unsupported macroblock size
1529     .endif
1530   .elseif \bpp == 32
1531     .if \size == 8
1532       st4           {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32
1533     .elseif \size == 4
1534       st4           {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4
1535       st4           {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4
1536       st4           {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4
1537       st4           {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4
1538     .elseif \size == 2
1539       st4           {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4
1540       st4           {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4
1541     .elseif \size == 1
1542       st4           {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4
1543     .else
1544       .error unsupported macroblock size
1545     .endif
1546   .elseif \bpp==16
1547     .if \size == 8
1548       st1           {v25.8h}, [RGB], 16
1549     .elseif \size == 4
1550       st1           {v25.4h}, [RGB], 8
1551     .elseif \size == 2
1552       st1           {v25.h}[4], [RGB], 2
1553       st1           {v25.h}[5], [RGB], 2
1554     .elseif \size == 1
1555       st1           {v25.h}[6], [RGB], 2
1556     .else
1557       .error unsupported macroblock size
1558     .endif
1559   .else
1560     .error unsupported bpp
1561   .endif
1562 .endm
1563
1564 .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, \
1565                                            g_offs, gsize, b_offs, bsize, \
1566                                            defsize, fast_st3
1567
1568 /*
1569  * 2-stage pipelined YCbCr->RGB conversion
1570  */
     /*
      * Each expansion emits a private constant table and one function,
      * jsimd_ycc_<colorid>_convert_neon (with a _slowst3 suffix on both when
      * fast_st3 is not 1).
      *
      *   bpp                   output pixel size in bits: 24, 32 or 16
      *   r_offs/g_offs/b_offs  digit N selecting v1N as the register that
      *                         receives that channel (24/32 bpp only)
      *   defsize               arrangement suffix appended to v1N
      *   fast_st3              passed through to do_store
      *
      * Generated function arguments:
      *   w0 = OUTPUT_WIDTH  pixels per row
      *   x1 = INPUT_BUF     pointer to three plane pointers (Y, Cb, Cr rows)
      *   w2 = INPUT_ROW     index of the first input row
      *   x3 = OUTPUT_BUF    array of output row pointers
      *   w4 = NUM_ROWS      number of rows to convert
      *
      * x5-x7, x9-x11 and x15 are used as scratch.  d8-d15 are spilled to a
      * 64-byte stack frame on entry and reloaded before returning.
      */
1571
     /* Stage 1: widen Cb/Cr, remove the 128 bias and form the 32-bit products.
        v20/v22 = green term, v24/v26 = red term, v28/v30 = blue term. */
1572 .macro do_yuv_to_rgb_stage1
1573     uaddw           v6.8h, v2.8h, v4.8b     /* v6 = u - 128 */
1574     uaddw           v8.8h, v2.8h, v5.8b     /* v8 = v - 128 */
1575     smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
1576     smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
1577     smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
1578     smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
1579     smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
1580     smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
1581     smull           v28.4s, v6.4h, v1.h[3]  /* multiply by 29033 */
1582     smull2          v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
1583 .endm
1584
     /* Stage 2: round the products down to 16 bits, add Y (v0), then either
        clamp to bytes in the channel registers (24/32 bpp) or pack the three
        channels into 5-6-5 bit fields in v25 (16 bpp). */
1585 .macro do_yuv_to_rgb_stage2
1586     rshrn           v20.4h, v20.4s, #15
1587     rshrn2          v20.8h, v22.4s, #15
1588     rshrn           v24.4h, v24.4s, #14
1589     rshrn2          v24.8h, v26.4s, #14
1590     rshrn           v28.4h, v28.4s, #14
1591     rshrn2          v28.8h, v30.4s, #14
1592     uaddw           v20.8h, v20.8h, v0.8b
1593     uaddw           v24.8h, v24.8h, v0.8b
1594     uaddw           v28.8h, v28.8h, v0.8b
1595   .if \bpp != 16
1596     sqxtun          v1\g_offs\defsize, v20.8h
1597     sqxtun          v1\r_offs\defsize, v24.8h
1598     sqxtun          v1\b_offs\defsize, v28.8h
1599   .else
1600     sqshlu          v21.8h, v20.8h, #8
1601     sqshlu          v25.8h, v24.8h, #8
1602     sqshlu          v29.8h, v28.8h, #8
1603     sri             v25.8h, v21.8h, #5
1604     sri             v25.8h, v29.8h, #11
1605   .endif
1606 .endm
1607
     /* Steady-state loop body: stage 2 for the current 8 pixels interleaved
        with the loads and stage 1 for the next 8, with the store in between.
        The instruction order is deliberate; keep it when editing. */
1608 .macro do_yuv_to_rgb_stage2_store_load_stage1 fast_st3
1609     rshrn           v20.4h, v20.4s, #15
1610     rshrn           v24.4h, v24.4s, #14
1611     rshrn           v28.4h, v28.4s, #14
1612     ld1             {v4.8b}, [U], 8
1613     rshrn2          v20.8h, v22.4s, #15
1614     rshrn2          v24.8h, v26.4s, #14
1615     rshrn2          v28.8h, v30.4s, #14
1616     ld1             {v5.8b}, [V], 8
1617     uaddw           v20.8h, v20.8h, v0.8b
1618     uaddw           v24.8h, v24.8h, v0.8b
1619     uaddw           v28.8h, v28.8h, v0.8b
1620   .if \bpp != 16  /**************** rgb24/rgb32 ******************************/
1621     sqxtun          v1\g_offs\defsize, v20.8h
1622     ld1             {v0.8b}, [Y], 8
1623     sqxtun          v1\r_offs\defsize, v24.8h
1624     prfm            pldl1keep, [U, #64]
1625     prfm            pldl1keep, [V, #64]
1626     prfm            pldl1keep, [Y, #64]
1627     sqxtun          v1\b_offs\defsize, v28.8h
1628     uaddw           v6.8h, v2.8h, v4.8b     /* v6 = u - 128 */
1629     uaddw           v8.8h, v2.8h, v5.8b     /* v8 = v - 128 */
1630     smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
1631     smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
1632     smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
1633     smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
1634     smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
1635     smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
1636   .else  /**************************** rgb565 ********************************/
1637     sqshlu          v21.8h, v20.8h, #8
1638     sqshlu          v25.8h, v24.8h, #8
1639     sqshlu          v29.8h, v28.8h, #8
1640     uaddw           v6.8h, v2.8h, v4.8b     /* v6 = u - 128 */
1641     uaddw           v8.8h, v2.8h, v5.8b     /* v8 = v - 128 */
1642     ld1             {v0.8b}, [Y], 8
1643     smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
1644     smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
1645     smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
1646     smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
1647     sri             v25.8h, v21.8h, #5
1648     smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
1649     smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
1650     prfm            pldl1keep, [U, #64]
1651     prfm            pldl1keep, [V, #64]
1652     prfm            pldl1keep, [Y, #64]
1653     sri             v25.8h, v29.8h, #11
1654   .endif
1655     do_store        \bpp, 8, \fast_st3
1656     smull           v28.4s, v6.4h, v1.h[3]  /* multiply by 29033 */
1657     smull2          v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
1658 .endm
1659
     /* Non-pipelined conversion of one register set (used for the row tail) */
1660 .macro do_yuv_to_rgb
1661     do_yuv_to_rgb_stage1
1662     do_yuv_to_rgb_stage2
1663 .endm
1664
1665 /* Apple gas crashes on adrl, work around that by using adr.
1666  * But this requires a copy of these constants for each function.
1667  */
1668
1669 .balign 16
1670 .if \fast_st3 == 1
1671 Ljsimd_ycc_\colorid\()_neon_consts:
1672 .else
1673 Ljsimd_ycc_\colorid\()_neon_slowst3_consts:
1674 .endif
1675   .short 0,      0,     0,      0         /* v0.4h: padding only */
1676   .short 22971, -11277, -23401, 29033     /* v1.4h: multipliers */
1677   .short -128,  -128,   -128,   -128      /* v2.8h: bias removed from */
1678   .short -128,  -128,   -128,   -128      /*        Cb and Cr */
1679
1680 .if \fast_st3 == 1
1681 asm_function jsimd_ycc_\colorid\()_convert_neon
1682 .else
1683 asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
1684 .endif
1685     OUTPUT_WIDTH    .req w0
1686     INPUT_BUF       .req x1
1687     INPUT_ROW       .req w2
1688     OUTPUT_BUF      .req x3
1689     NUM_ROWS        .req w4
1690
1691     INPUT_BUF0      .req x5
1692     INPUT_BUF1      .req x6
1693     INPUT_BUF2      .req x1  /* reuses INPUT_BUF once the planes are read */
1694
1695     RGB             .req x7
1696     Y               .req x9
1697     U               .req x10
1698     V               .req x11
1699     N               .req w15
1700
1701     sub             sp, sp, 64
1702     mov             x9, sp   /* x9 is a store cursor here; it becomes Y later */
1703
1704     /* Load constants to v1.4h and v2.8h (v0.4h is just used for padding) */
1705     .if \fast_st3 == 1
1706       adr           x15, Ljsimd_ycc_\colorid\()_neon_consts
1707     .else
1708       adr           x15, Ljsimd_ycc_\colorid\()_neon_slowst3_consts
1709     .endif
1710
1711     /* Save NEON registers */
1712     st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
1713     st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
1714     ld1             {v0.4h, v1.4h}, [x15], 16
1715     ld1             {v2.8h}, [x15]
1716
         /* Fetch the three plane pointers; INPUT_BUF is dead after this */
1717     ldr             INPUT_BUF0, [INPUT_BUF]
1718     ldr             INPUT_BUF1, [INPUT_BUF, #8]
1719     ldr             INPUT_BUF2, [INPUT_BUF, #16]
1720     .unreq          INPUT_BUF
1721
1722     /* Preset v10 and v13 to 0xFF: for 32 bpp output the one register of
             v10-v13 that receives no colour channel supplies the filler byte */
1723     movi            v10.16b, #255
1724     movi            v13.16b, #255
1725
1726     /* Outer loop over scanlines (skipped entirely when NUM_ROWS < 1) */
1727     cmp             NUM_ROWS, #1
1728     b.lt            9f
1729 0:
1730     ldr             Y, [INPUT_BUF0, INPUT_ROW, uxtw #3]
1731     ldr             U, [INPUT_BUF1, INPUT_ROW, uxtw #3]
1732     mov             N, OUTPUT_WIDTH
1733     ldr             V, [INPUT_BUF2, INPUT_ROW, uxtw #3]
1734     add             INPUT_ROW, INPUT_ROW, #1
1735     ldr             RGB, [OUTPUT_BUF], #8
1736
1737     /* Inner loop over pixels: N is kept 8 ahead, so N < 0 means fewer than
             8 pixels remain and only its low three bits are meaningful */
1738     subs            N, N, #8
1739     b.lt            3f
1740     do_load         8
1741     do_yuv_to_rgb_stage1
1742     subs            N, N, #8
1743     b.lt            2f
1744 1:
1745     do_yuv_to_rgb_stage2_store_load_stage1 \fast_st3
1746     subs            N, N, #8
1747     b.ge            1b
1748 2:
         /* Drain the pipeline: finish and store the last full group of 8 */
1749     do_yuv_to_rgb_stage2
1750     do_store        \bpp, 8, \fast_st3
1751     tst             N, #7
1752     b.eq            8f
1753 3:
         /* Tail of 1..7 pixels: gather 4, 2 and 1 pixels by the bits of N,
            convert once, then store the same pieces */
1754     tst             N, #4
1755     b.eq            3f
1756     do_load         4
1757 3:
1758     tst             N, #2
1759     b.eq            4f
1760     do_load         2
1761 4:
1762     tst             N, #1
1763     b.eq            5f
1764     do_load         1
1765 5:
1766     do_yuv_to_rgb
1767     tst             N, #4
1768     b.eq            6f
1769     do_store        \bpp, 4, \fast_st3
1770 6:
1771     tst             N, #2
1772     b.eq            7f
1773     do_store        \bpp, 2, \fast_st3
1774 7:
1775     tst             N, #1
1776     b.eq            8f
1777     do_store        \bpp, 1, \fast_st3
1778 8:
1779     subs            NUM_ROWS, NUM_ROWS, #1
1780     b.gt            0b
1781 9:
1782     /* Restore all registers and return */
1783     ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
1784     ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
         /* RET (implicit x30) is hinted as a subroutine return; BR is not */
1785     ret
1786     .unreq          OUTPUT_WIDTH
1787     .unreq          INPUT_ROW
1788     .unreq          OUTPUT_BUF
1789     .unreq          NUM_ROWS
1790     .unreq          INPUT_BUF0
1791     .unreq          INPUT_BUF1
1792     .unreq          INPUT_BUF2
1793     .unreq          RGB
1794     .unreq          Y
1795     .unreq          U
1796     .unreq          V
1797     .unreq          N
1798
1799 .purgem do_yuv_to_rgb
1800 .purgem do_yuv_to_rgb_stage1
1801 .purgem do_yuv_to_rgb_stage2
1802 .purgem do_yuv_to_rgb_stage2_store_load_stage1
1803
1804 .endm
1805
1806 /*--------------------------------- id ----- bpp R  rsize G  gsize B  bsize defsize fast_st3*/
     /* R/G/B give the digit N of the v1N register that receives the channel.
        The rsize/gsize/bsize arguments are accepted but not referenced by the
        generator body, and the rgb565 row does not use the R/G/B digits. */
1807 generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,  1, .4h,  2, .4h,  .8b,    1
1808 generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b,    1
1809 generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h,  1, .4h,  2, .4h,  .8b,    1
1810 generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h,  1, .4h,  0, .4h,  .8b,    1
1811 generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h,  2, .4h,  1, .4h,  .8b,    1
1812 generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h,  2, .4h,  3, .4h,  .8b,    1
1813 generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, .4h,  0, .4h,  0, .4h,  .8b,    1
1814
     /* fast_st3 == 0: second copies of the two 24 bpp converters, emitted
        under names ending in _convert_neon_slowst3 */
1815 generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,  1, .4h,  2, .4h,  .8b,    0
1816 generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b,    0
1817
     /* Drop the helper macros so the names can be defined again later */
1818 .purgem do_load
1819 .purgem do_store
1820
1821
1822 /*****************************************************************************/
1823
1824 /*
1825  * jsimd_extrgb_ycc_convert_neon
1826  * jsimd_extbgr_ycc_convert_neon
1827  * jsimd_extrgbx_ycc_convert_neon
1828  * jsimd_extbgrx_ycc_convert_neon
1829  * jsimd_extxbgr_ycc_convert_neon
1830  * jsimd_extxrgb_ycc_convert_neon
1831  *
1832  * Colorspace conversion RGB -> YCbCr
1833  */
1834
1835 .macro do_store size
     /*
      * Store "size" (8, 4, 2 or 1) converted samples: bytes of v20 through
      * the Y pointer, v21 through U and v22 through V, post-incrementing each
      * pointer.  The 4-, 2- and 1-sample forms write byte lanes 0-3, 4-5 and
      * 6 respectively.  Any other size is an assembly-time error.
      * Y, U and V are register aliases set up by the code that expands this
      * macro.
      */
1836   .if \size == 8
1837     st1             {v20.8b}, [Y], #8
1838     st1             {v21.8b}, [U], #8
1839     st1             {v22.8b}, [V], #8
1840   .elseif \size == 4
1841     st1             {v20.b}[0], [Y], #1
1842     st1             {v20.b}[1], [Y], #1
1843     st1             {v20.b}[2], [Y], #1
1844     st1             {v20.b}[3], [Y], #1
1845     st1             {v21.b}[0], [U], #1
1846     st1             {v21.b}[1], [U], #1
1847     st1             {v21.b}[2], [U], #1
1848     st1             {v21.b}[3], [U], #1
1849     st1             {v22.b}[0], [V], #1
1850     st1             {v22.b}[1], [V], #1
1851     st1             {v22.b}[2], [V], #1
1852     st1             {v22.b}[3], [V], #1
1853   .elseif \size == 2
1854     st1             {v20.b}[4], [Y], #1
1855     st1             {v20.b}[5], [Y], #1
1856     st1             {v21.b}[4], [U], #1
1857     st1             {v21.b}[5], [U], #1
1858     st1             {v22.b}[4], [V], #1
1859     st1             {v22.b}[5], [V], #1
1860   .elseif \size == 1
1861     st1             {v20.b}[6], [Y], #1
1862     st1             {v21.b}[6], [U], #1
1863     st1             {v22.b}[6], [V], #1
1864   .else
1865     .error unsupported macroblock size
1866   .endif
1867 .endm
1868
1869 .macro do_load bpp, size, fast_ld3
     /*
      * Load "size" (8, 4, 2 or 1) packed pixels through the RGB pointer and
      * de-interleave their bytes into v10, v11, v12 (24 bpp) or v10, v11,
      * v12, v13 (32 bpp), post-incrementing the pointer.  The 4-, 2- and
      * 1-pixel forms fill byte lanes 0-3, 4-5 and 6 respectively.
      *
      * fast_ld3 only affects the 24 bpp, 8-pixel case: when it is not 1 the
      * single ld3 is replaced by 24 byte loads that fill the same lanes.
      * The 8-pixel forms also prefetch 128 bytes past the updated pointer.
      * An unsupported bpp or size is an assembly-time error.
      */
1870   .if \bpp == 24
1871     .if \size == 8
1872       .if \fast_ld3 == 1
1873         ld3         {v10.8b, v11.8b, v12.8b}, [RGB], #24
1874       .else
1875         ld1         {v10.b}[0], [RGB], #1
1876         ld1         {v11.b}[0], [RGB], #1
1877         ld1         {v12.b}[0], [RGB], #1
1878
1879         ld1         {v10.b}[1], [RGB], #1
1880         ld1         {v11.b}[1], [RGB], #1
1881         ld1         {v12.b}[1], [RGB], #1
1882
1883         ld1         {v10.b}[2], [RGB], #1
1884         ld1         {v11.b}[2], [RGB], #1
1885         ld1         {v12.b}[2], [RGB], #1
1886
1887         ld1         {v10.b}[3], [RGB], #1
1888         ld1         {v11.b}[3], [RGB], #1
1889         ld1         {v12.b}[3], [RGB], #1
1890
1891         ld1         {v10.b}[4], [RGB], #1
1892         ld1         {v11.b}[4], [RGB], #1
1893         ld1         {v12.b}[4], [RGB], #1
1894
1895         ld1         {v10.b}[5], [RGB], #1
1896         ld1         {v11.b}[5], [RGB], #1
1897         ld1         {v12.b}[5], [RGB], #1
1898
1899         ld1         {v10.b}[6], [RGB], #1
1900         ld1         {v11.b}[6], [RGB], #1
1901         ld1         {v12.b}[6], [RGB], #1
1902
1903         ld1         {v10.b}[7], [RGB], #1
1904         ld1         {v11.b}[7], [RGB], #1
1905         ld1         {v12.b}[7], [RGB], #1
1906       .endif
1907       prfm          pldl1keep, [RGB, #128]
1908     .elseif \size == 4
1909       ld3           {v10.b, v11.b, v12.b}[0], [RGB], #3
1910       ld3           {v10.b, v11.b, v12.b}[1], [RGB], #3
1911       ld3           {v10.b, v11.b, v12.b}[2], [RGB], #3
1912       ld3           {v10.b, v11.b, v12.b}[3], [RGB], #3
1913     .elseif \size == 2
1914       ld3           {v10.b, v11.b, v12.b}[4], [RGB], #3
1915       ld3           {v10.b, v11.b, v12.b}[5], [RGB], #3
1916     .elseif \size == 1
1917       ld3           {v10.b, v11.b, v12.b}[6], [RGB], #3
1918     .else
1919       .error unsupported macroblock size
1920     .endif
1921   .elseif \bpp == 32
1922     .if \size == 8
1923       ld4           {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], #32
1924       prfm          pldl1keep, [RGB, #128]
1925     .elseif \size == 4
1926       ld4           {v10.b, v11.b, v12.b, v13.b}[0], [RGB], #4
1927       ld4           {v10.b, v11.b, v12.b, v13.b}[1], [RGB], #4
1928       ld4           {v10.b, v11.b, v12.b, v13.b}[2], [RGB], #4
1929       ld4           {v10.b, v11.b, v12.b, v13.b}[3], [RGB], #4
1930     .elseif \size == 2
1931       ld4           {v10.b, v11.b, v12.b, v13.b}[4], [RGB], #4
1932       ld4           {v10.b, v11.b, v12.b, v13.b}[5], [RGB], #4
1933     .elseif \size == 1
1934       ld4           {v10.b, v11.b, v12.b, v13.b}[6], [RGB], #4
1935     .else
1936       .error unsupported macroblock size
1937     .endif
1938   .else
1939     .error unsupported bpp
1940   .endif
1941 .endm
1942
/*
 * generate_jsimd_rgb_ycc_convert_neon
 *
 * Emits one RGB -> YCbCr colorspace conversion function together with its
 * 16-byte aligned constant table.
 *
 * Macro arguments:
 *   colorid   - pixel format tag used to build the symbol names
 *   bpp       - bits per input pixel (24 or 32), forwarded to do_load
 *   r_offs,
 *   g_offs,
 *   b_offs    - position of the R, G and B channel inside a pixel; do_load
 *               de-interleaves channel k into register v1k (v10..v13)
 *   fast_ld3  - 1 emits jsimd_<colorid>_ycc_convert_neon, any other value
 *               emits jsimd_<colorid>_ycc_convert_neon_slowld3; the flag is
 *               also forwarded to do_load
 *
 * Register arguments of the generated function:
 *   w0 = OUTPUT_WIDTH   number of pixels per row
 *   x1 = INPUT_BUF      array of input row pointers, consumed sequentially
 *   x2 = OUTPUT_BUF     array of three plane pointers; each plane is an
 *                       array of row pointers (Y, Cb, Cr in that order)
 *   w3 = OUTPUT_ROW     index of the first output row
 *   w4 = NUM_ROWS       number of rows to convert (nothing is done if < 1)
 *
 * The low 64 bits of v8-v15 are saved on the stack and restored before
 * returning.  Scratch registers written: x2, x5-x7, x9-x13, w3, w4, v0, v1,
 * v4, v6, v8, v14, v16, v18, v20-v22, v24, v26, v28, v30, plus whatever
 * do_load / do_store touch.
 */
.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, \
                                           b_offs, fast_ld3

/*
 * 2-stage pipelined RGB->YCbCr conversion
 *
 * Stage 1 widens the pixels and forms 32-bit weighted sums; stage 2 narrows
 * those sums back to 8 bits.  Keeping the stages separate lets the main loop
 * place the load of the next 8 pixels and the store of the current 8 pixels
 * between them.
 */

/*
 * Stage 1: widen R/G/B to 16 bits (v4/v6/v8) and accumulate, with h[n]
 * standing for v0.h[n] and "bias" for the per-lane value held in v1:
 *   v14 (lo) / v16 (hi) = Y  sum =        h[0]*R + h[1]*G + h[2]*B
 *   v18 (lo) / v26 (hi) = Cb sum = bias - h[3]*R - h[4]*G + h[5]*B
 *   v28 (lo) / v30 (hi) = Cr sum = bias + h[5]*R - h[6]*G - h[7]*B
 * All four 32-bit words of v1 are identical, so the rev64 instructions below
 * simply copy the bias into each chroma accumulator.
 */
.macro do_rgb_to_yuv_stage1
    ushll           v4.8h, v1\r_offs\().8b, #0  /* r = v4 */
    ushll           v6.8h, v1\g_offs\().8b, #0  /* g = v6 */
    ushll           v8.8h, v1\b_offs\().8b, #0  /* b = v8 */
    rev64           v18.4s, v1.4s
    rev64           v26.4s, v1.4s
    rev64           v28.4s, v1.4s
    rev64           v30.4s, v1.4s
    umull           v14.4s, v4.4h, v0.h[0]
    umull2          v16.4s, v4.8h, v0.h[0]
    umlsl           v18.4s, v4.4h, v0.h[3]
    umlsl2          v26.4s, v4.8h, v0.h[3]
    umlal           v28.4s, v4.4h, v0.h[5]
    umlal2          v30.4s, v4.8h, v0.h[5]
    umlal           v14.4s, v6.4h, v0.h[1]
    umlal2          v16.4s, v6.8h, v0.h[1]
    umlsl           v18.4s, v6.4h, v0.h[4]
    umlsl2          v26.4s, v6.8h, v0.h[4]
    umlsl           v28.4s, v6.4h, v0.h[6]
    umlsl2          v30.4s, v6.8h, v0.h[6]
    umlal           v14.4s, v8.4h, v0.h[2]
    umlal2          v16.4s, v8.8h, v0.h[2]
    umlal           v18.4s, v8.4h, v0.h[5]
    umlal2          v26.4s, v8.8h, v0.h[5]
    umlsl           v28.4s, v8.4h, v0.h[7]
    umlsl2          v30.4s, v8.8h, v0.h[7]
.endm

/*
 * Stage 2: drop the 16 fractional bits and narrow to bytes.  The luma sum is
 * narrowed with rounding (rshrn); the chroma sums are truncated (shrn)
 * because their rounding term is already part of the bias added in stage 1.
 * v22 is consumed as the 16-bit Cb vector before it is overwritten with the
 * 8-bit Cr result, so the order of the xtn instructions matters.
 */
.macro do_rgb_to_yuv_stage2
    rshrn           v20.4h, v14.4s, #16
    shrn            v22.4h, v18.4s, #16
    shrn            v24.4h, v28.4s, #16
    rshrn2          v20.8h, v16.4s, #16
    shrn2           v22.8h, v26.4s, #16
    shrn2           v24.8h, v30.4s, #16
    xtn             v20.8b, v20.8h       /* v20 = y */
    xtn             v21.8b, v22.8h       /* v21 = u */
    xtn             v22.8b, v24.8h       /* v22 = v */
.endm

/* Unpipelined conversion of one group of pixels (used for the row tail) */
.macro do_rgb_to_yuv
    do_rgb_to_yuv_stage1
    do_rgb_to_yuv_stage2
.endm

/* Steady-state loop body: finish the current 8 pixels, fetch the next 8,
 * write out the finished Y/Cb/Cr bytes and start on the freshly loaded
 * pixels.
 *
 * TODO: expand macros and interleave instructions if some in-order
 *       ARM64 processor actually can dual-issue LOAD/STORE with ALU */
.macro do_rgb_to_yuv_stage2_store_load_stage1 fast_ld3
    do_rgb_to_yuv_stage2
    do_load         \bpp, 8, \fast_ld3
    st1             {v20.8b}, [Y], #8
    st1             {v21.8b}, [U], #8
    st1             {v22.8b}, [V], #8
    do_rgb_to_yuv_stage1
.endm

/*
 * Conversion constants.  The first two rows are loaded into v0 and are
 * coefficients scaled by 65536:
 *   h[0..3] ~ 0.299, 0.587, 0.114, 0.16874
 *   h[4..7] ~ 0.33126, 0.5, 0.41869, 0.08131
 * The last two rows are loaded into v1.  Read as little-endian 32-bit lanes
 * each pair is (128 << 16) + 32767, i.e. the +128 chroma offset plus a
 * rounding term just below one half.
 */
.balign 16
.if \fast_ld3 == 1
Ljsimd_\colorid\()_ycc_neon_consts:
.else
Ljsimd_\colorid\()_ycc_neon_slowld3_consts:
.endif
  .short 19595, 38470, 7471, 11059
  .short 21709, 32768, 27439, 5329
  .short 32767, 128, 32767, 128
  .short 32767, 128, 32767, 128

.if \fast_ld3 == 1
asm_function jsimd_\colorid\()_ycc_convert_neon
.else
asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
.endif
    OUTPUT_WIDTH    .req w0
    INPUT_BUF       .req x1
    OUTPUT_BUF      .req x2
    OUTPUT_ROW      .req w3
    NUM_ROWS        .req w4

    OUTPUT_BUF0     .req x5
    OUTPUT_BUF1     .req x6
    OUTPUT_BUF2     .req x2  /* OUTPUT_BUF */

    RGB             .req x7
    Y               .req x9
    U               .req x10
    V               .req x11
    N               .req w12

    /* Load constants into v0 (coefficients) and v1 (chroma bias) */
    .if \fast_ld3 == 1
      adr           x13, Ljsimd_\colorid\()_ycc_neon_consts
    .else
      adr           x13, Ljsimd_\colorid\()_ycc_neon_slowld3_consts
    .endif
    ld1             {v0.8h, v1.8h}, [x13]

    /* Fetch the three plane pointers.  OUTPUT_BUF2 aliases OUTPUT_BUF (x2),
       so it has to be loaded last. */
    ldr             OUTPUT_BUF0, [OUTPUT_BUF]
    ldr             OUTPUT_BUF1, [OUTPUT_BUF, #8]
    ldr             OUTPUT_BUF2, [OUTPUT_BUF, #16]
    .unreq          OUTPUT_BUF

    /* Save NEON registers (low 64 bits of v8-v15).  x9 is free to use as a
       cursor here because Y is not loaded until the row loop starts. */
    sub             sp, sp, #64
    mov             x9, sp
    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32

    /* Outer loop over scanlines */
    cmp             NUM_ROWS, #1
    b.lt            9f
0:
    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, uxtw #3]
    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, uxtw #3]
    mov             N, OUTPUT_WIDTH
    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, uxtw #3]
    add             OUTPUT_ROW, OUTPUT_ROW, #1
    ldr             RGB, [INPUT_BUF], #8

    /* Inner loop over pixels.  N is kept 8 (then 16) below the number of
       pixels still to be processed; subtracting multiples of 8 leaves its
       low three bits equal to (width & 7) for the tail handling below. */
    subs            N, N, #8
    b.lt            3f
    do_load         \bpp, 8, \fast_ld3
    do_rgb_to_yuv_stage1
    subs            N, N, #8
    b.lt            2f
1:
    do_rgb_to_yuv_stage2_store_load_stage1 \fast_ld3
    subs            N, N, #8
    b.ge            1b
2:
    /* Drain the pipeline: finish and store the last full group of 8 */
    do_rgb_to_yuv_stage2
    do_store        8
    tst             N, #7
    b.eq            8f
3:
    /* Tail: gather the remaining 4 + 2 + 1 pixels, convert them in one go,
       then store them in the same 4 / 2 / 1 pieces */
    tbz             N, #2, 3f
    do_load         \bpp, 4, \fast_ld3
3:
    tbz             N, #1, 4f
    do_load         \bpp, 2, \fast_ld3
4:
    tbz             N, #0, 5f
    do_load         \bpp, 1, \fast_ld3
5:
    do_rgb_to_yuv
    tbz             N, #2, 6f
    do_store        4
6:
    tbz             N, #1, 7f
    do_store        2
7:
    tbz             N, #0, 8f
    do_store        1
8:
    subs            NUM_ROWS, NUM_ROWS, #1
    b.gt            0b
9:
    /* Restore all registers and return.  The post-incremented loads also
       release the 64 bytes of stack reserved above. */
    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    /* Use RET rather than BR x30 so the branch is treated as a subroutine
       return (return prediction, branch target identification). */
    ret

    .unreq          OUTPUT_WIDTH
    .unreq          OUTPUT_ROW
    .unreq          INPUT_BUF
    .unreq          NUM_ROWS
    .unreq          OUTPUT_BUF0
    .unreq          OUTPUT_BUF1
    .unreq          OUTPUT_BUF2
    .unreq          RGB
    .unreq          Y
    .unreq          U
    .unreq          V
    .unreq          N

.purgem do_rgb_to_yuv
.purgem do_rgb_to_yuv_stage1
.purgem do_rgb_to_yuv_stage2
.purgem do_rgb_to_yuv_stage2_store_load_stage1

.endm

/*--------------------------------- id ----- bpp R  G  B  Fast LD3 */
generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2, 1
generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0, 1
generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2, 1
generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0, 1
generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1, 1
generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3, 1

generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2, 0
generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0, 0

.purgem do_load
.purgem do_store
2145
2146
2147 /*****************************************************************************/
2148
/*
 * Load data into workspace, applying unsigned->signed conversion
 *
 * In:    x0 = SAMPLE_DATA  array of 8 row pointers
 *        w1 = START_COL    byte offset added to every row pointer
 *        x2 = WORKSPACE    destination for 64 16-bit values (8 rows of 8)
 * Out:   WORKSPACE[row][col] = SAMPLE_DATA[row][START_COL + col] - 128,
 *        computed as a widening subtract of the unsigned 8-bit samples
 * Clobb: x0-x4, x9-x15, v0, v16-v23
 *
 * Loads, subtractions and stores are interleaved so that consecutive
 * instructions rarely depend on each other.
 *
 * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
 *       rid of VST1.16 instructions
 */

asm_function jsimd_convsamp_neon
    SAMPLE_DATA     .req x0
    START_COL       .req x1
    WORKSPACE       .req x2
    TMP1            .req x9
    TMP2            .req x10
    TMP3            .req x11
    TMP4            .req x12
    TMP5            .req x13
    TMP6            .req x14
    TMP7            .req x15
    TMP8            .req x4
    TMPDUP          .req w3

    /* START_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
       guarantee that the upper (unused) 32 bits of x1 are valid.  This
       instruction ensures that those bits are set to zero. */
    uxtw            x1, w1

    mov             TMPDUP, #128
    ldp             TMP1, TMP2, [SAMPLE_DATA], 16   /* row pointers 0, 1 */
    ldp             TMP3, TMP4, [SAMPLE_DATA], 16   /* row pointers 2, 3 */
    dup             v0.8b, TMPDUP                   /* v0 = 8 x 128 */
    add             TMP1, TMP1, START_COL
    add             TMP2, TMP2, START_COL
    ldp             TMP5, TMP6, [SAMPLE_DATA], 16   /* row pointers 4, 5 */
    add             TMP3, TMP3, START_COL
    add             TMP4, TMP4, START_COL
    ldp             TMP7, TMP8, [SAMPLE_DATA], 16   /* row pointers 6, 7 */
    add             TMP5, TMP5, START_COL
    add             TMP6, TMP6, START_COL
    ld1             {v16.8b}, [TMP1]
    add             TMP7, TMP7, START_COL
    add             TMP8, TMP8, START_COL
    ld1             {v17.8b}, [TMP2]
    usubl           v16.8h, v16.8b, v0.8b           /* row 0: sample - 128 */
    ld1             {v18.8b}, [TMP3]
    usubl           v17.8h, v17.8b, v0.8b
    ld1             {v19.8b}, [TMP4]
    usubl           v18.8h, v18.8b, v0.8b
    ld1             {v20.8b}, [TMP5]
    usubl           v19.8h, v19.8b, v0.8b
    ld1             {v21.8b}, [TMP6]
    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [WORKSPACE], 64
    usubl           v20.8h, v20.8b, v0.8b
    ld1             {v22.8b}, [TMP7]
    usubl           v21.8h, v21.8b, v0.8b
    ld1             {v23.8b}, [TMP8]
    usubl           v22.8h, v22.8b, v0.8b
    usubl           v23.8h, v23.8b, v0.8b
    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [WORKSPACE], 64

    /* Use RET rather than BR x30 so the branch is treated as a subroutine
       return (return prediction, branch target identification). */
    ret

    .unreq          SAMPLE_DATA
    .unreq          START_COL
    .unreq          WORKSPACE
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4
    .unreq          TMP5
    .unreq          TMP6
    .unreq          TMP7
    .unreq          TMP8
    .unreq          TMPDUP
2222
2223 /*****************************************************************************/
2224
/*
 * jsimd_fdct_islow_neon
 *
 * This function contains a slow-but-accurate integer implementation of the
 * forward DCT (Discrete Cosine Transform). The following code is based
 * directly on the IJG's original jfdctint.c; see the jfdctint.c for
 * more details.
 *
 * In:    x0 = DATA  block of 64 16-bit coefficients (8 rows of 8),
 *                   transformed in place
 * Clobb: x0, x9, x10, v0-v5, v16-v31; the low 64 bits of v8-v15 are saved
 *        on the stack and restored, since they are used as temporaries
 *
 * The same 1-D transform is applied twice, each time preceded by an 8x8
 * transpose.  The first pass keeps PASS1_BITS extra fractional bits
 * (descale by CONST_BITS-PASS1_BITS); the second pass removes them again
 * (descale by CONST_BITS+PASS1_BITS).
 *
 * TODO: can be combined with 'jsimd_convsamp_neon' to get
 *       rid of a bunch of VLD1.16 instructions
 */

#define CONST_BITS 13
#define PASS1_BITS 2

#define DESCALE_P1 (CONST_BITS-PASS1_BITS)
#define DESCALE_P2 (CONST_BITS+PASS1_BITS)

#define F_0_298  2446  /* FIX(0.298631336) */
#define F_0_390  3196  /* FIX(0.390180644) */
#define F_0_541  4433  /* FIX(0.541196100) */
#define F_0_765  6270  /* FIX(0.765366865) */
#define F_0_899  7373  /* FIX(0.899976223) */
#define F_1_175  9633  /* FIX(1.175875602) */
#define F_1_501 12299  /* FIX(1.501321110) */
#define F_1_847 15137  /* FIX(1.847759065) */
#define F_1_961 16069  /* FIX(1.961570560) */
#define F_2_053 16819  /* FIX(2.053119869) */
#define F_2_562 20995  /* FIX(2.562915447) */
#define F_3_072 25172  /* FIX(3.072711026) */

/* Constant table, loaded into v0 (first 8 entries) and v1 (last 8).  The
 * entry order must match the XFIX_* lane definitions below.  Constants that
 * are only ever subtracted are stored negated so that every product can be
 * formed with smull / smlal. */
.balign 16
Ljsimd_fdct_islow_neon_consts:
  .short F_0_298
  .short -F_0_390
  .short F_0_541
  .short F_0_765
  .short - F_0_899
  .short F_1_175
  .short F_1_501
  .short - F_1_847
  .short - F_1_961
  .short F_2_053
  .short - F_2_562
  .short F_3_072
  .short 0          /* padding */
  .short 0
  .short 0
  .short 0

#undef F_0_298
#undef F_0_390
#undef F_0_541
#undef F_0_765
#undef F_0_899
#undef F_1_175
#undef F_1_501
#undef F_1_847
#undef F_1_961
#undef F_2_053
#undef F_2_562
#undef F_3_072
#define XFIX_P_0_298 v0.h[0]
#define XFIX_N_0_390 v0.h[1]
#define XFIX_P_0_541 v0.h[2]
#define XFIX_P_0_765 v0.h[3]
#define XFIX_N_0_899 v0.h[4]
#define XFIX_P_1_175 v0.h[5]
#define XFIX_P_1_501 v0.h[6]
#define XFIX_N_1_847 v0.h[7]
#define XFIX_N_1_961 v1.h[0]
#define XFIX_P_2_053 v1.h[1]
#define XFIX_N_2_562 v1.h[2]
#define XFIX_P_3_072 v1.h[3]

asm_function jsimd_fdct_islow_neon

    DATA            .req x0
    TMP             .req x9

    /* Load constants */
    adr             TMP, Ljsimd_fdct_islow_neon_consts
    ld1             {v0.8h, v1.8h}, [TMP]

    /* Save NEON registers (low 64 bits of v8-v15) */
    sub             sp, sp, #64
    mov             x10, sp
    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], 32
    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], 32

    /* Load all DATA into NEON registers with the following allocation:
     *         0 1 2 3 4 5 6 7
     *      ------------------
     *   0 | v16.8h
     *   1 | v17.8h
     *   2 | v18.8h
     *   3 | v19.8h
     *   4 | v20.8h
     *   5 | v21.8h
     *   6 | v22.8h
     *   7 | v23.8h
     */

    ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
    ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
    sub             DATA, DATA, #64  /* rewind to the block start for the final store */

    /* ---- Pass 1 ---- */

    /* Transpose */
    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
    /* 1-D FDCT */
    add             v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
    sub             v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
    add             v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
    sub             v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
    add             v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
    sub             v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
    add             v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
    sub             v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */

    /* even part */

    add             v8.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
    sub             v9.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
    add             v10.8h, v25.8h, v26.8h  /* tmp11 = tmp1 + tmp2; */
    sub             v11.8h, v25.8h, v26.8h  /* tmp12 = tmp1 - tmp2; */

    add             v16.8h, v8.8h, v10.8h  /* tmp10 + tmp11 */
    sub             v20.8h, v8.8h, v10.8h  /* tmp10 - tmp11 */

    add             v18.8h, v11.8h, v9.8h  /* tmp12 + tmp13 */

    shl             v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM) LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */
    shl             v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM) LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */

    smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
    smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
    mov             v22.16b, v18.16b              /* second copy of z1 lo for dataptr[6] */
    mov             v25.16b, v24.16b              /* second copy of z1 hi for dataptr[6] */

    smlal           v18.4s, v9.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
    smlal2          v24.4s, v9.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
    smlal           v22.4s, v11.4h, XFIX_N_1_847  /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
    smlal2          v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */

    rshrn           v18.4h, v18.4s, #DESCALE_P1
    rshrn           v22.4h, v22.4s, #DESCALE_P1
    rshrn2          v18.8h, v24.4s, #DESCALE_P1  /* dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
    rshrn2          v22.8h, v25.4s, #DESCALE_P1  /* dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */

    /* Odd part */

    add             v8.8h, v28.8h, v31.8h        /* z1 = tmp4 + tmp7; */
    add             v9.8h, v29.8h, v30.8h        /* z2 = tmp5 + tmp6; */
    add             v10.8h, v28.8h, v30.8h       /* z3 = tmp4 + tmp6; */
    add             v11.8h, v29.8h, v31.8h       /* z4 = tmp5 + tmp7; */
    smull           v4.4s, v10.4h, XFIX_P_1_175  /* z5 lo = z3 lo * XFIX_P_1_175 */
    smull2          v5.4s, v10.8h, XFIX_P_1_175
    smlal           v4.4s, v11.4h, XFIX_P_1_175  /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
    smlal2          v5.4s, v11.8h, XFIX_P_1_175

    /* The high halves are multiplied first because the low-half products
       overwrite their own 16-bit source registers. */
    smull2          v24.4s, v28.8h, XFIX_P_0_298
    smull2          v25.4s, v29.8h, XFIX_P_2_053
    smull2          v26.4s, v30.8h, XFIX_P_3_072
    smull2          v27.4s, v31.8h, XFIX_P_1_501
    smull           v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
    smull           v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
    smull           v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
    smull           v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */

    smull2          v12.4s, v8.8h, XFIX_N_0_899
    smull2          v13.4s, v9.8h, XFIX_N_2_562
    smull2          v14.4s, v10.8h, XFIX_N_1_961
    smull2          v15.4s, v11.8h, XFIX_N_0_390
    smull           v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, - FIX_0_899976223); */
    smull           v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, - FIX_2_562915447); */
    smull           v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560); */
    smull           v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644); */

    add             v10.4s, v10.4s, v4.4s  /* z3 += z5 */
    add             v14.4s, v14.4s, v5.4s
    add             v11.4s, v11.4s, v4.4s  /* z4 += z5 */
    add             v15.4s, v15.4s, v5.4s

    add             v28.4s, v28.4s, v8.4s   /* tmp4 += z1 */
    add             v24.4s, v24.4s, v12.4s
    add             v29.4s, v29.4s, v9.4s   /* tmp5 += z2 */
    add             v25.4s, v25.4s, v13.4s
    add             v30.4s, v30.4s, v10.4s  /* tmp6 += z3 */
    add             v26.4s, v26.4s, v14.4s
    add             v31.4s, v31.4s, v11.4s  /* tmp7 += z4 */
    add             v27.4s, v27.4s, v15.4s

    add             v28.4s, v28.4s, v10.4s  /* tmp4 += z3 */
    add             v24.4s, v24.4s, v14.4s
    add             v29.4s, v29.4s, v11.4s  /* tmp5 += z4 */
    add             v25.4s, v25.4s, v15.4s
    add             v30.4s, v30.4s, v9.4s   /* tmp6 += z2 */
    add             v26.4s, v26.4s, v13.4s
    add             v31.4s, v31.4s, v8.4s   /* tmp7 += z1 */
    add             v27.4s, v27.4s, v12.4s

    rshrn           v23.4h, v28.4s, #DESCALE_P1
    rshrn           v21.4h, v29.4s, #DESCALE_P1
    rshrn           v19.4h, v30.4s, #DESCALE_P1
    rshrn           v17.4h, v31.4s, #DESCALE_P1
    rshrn2          v23.8h, v24.4s, #DESCALE_P1  /* dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
    rshrn2          v21.8h, v25.4s, #DESCALE_P1  /* dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
    rshrn2          v19.8h, v26.4s, #DESCALE_P1  /* dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
    rshrn2          v17.8h, v27.4s, #DESCALE_P1  /* dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */

    /* ---- Pass 2 ---- */

    /* Transpose */
    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4

    /* 1-D FDCT */
    add             v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
    sub             v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
    add             v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
    sub             v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
    add             v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
    sub             v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
    add             v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
    sub             v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */

    /* even part */
    add             v8.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
    sub             v9.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
    add             v10.8h, v25.8h, v26.8h  /* tmp11 = tmp1 + tmp2; */
    sub             v11.8h, v25.8h, v26.8h  /* tmp12 = tmp1 - tmp2; */

    add             v16.8h, v8.8h, v10.8h  /* tmp10 + tmp11 */
    sub             v20.8h, v8.8h, v10.8h  /* tmp10 - tmp11 */

    add             v18.8h, v11.8h, v9.8h  /* tmp12 + tmp13 */

    srshr           v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS); */
    srshr           v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS); */

    smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
    smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
    mov             v22.16b, v18.16b              /* second copy of z1 lo for dataptr[6] */
    mov             v25.16b, v24.16b              /* second copy of z1 hi for dataptr[6] */

    smlal           v18.4s, v9.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
    smlal2          v24.4s, v9.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
    smlal           v22.4s, v11.4h, XFIX_N_1_847  /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
    smlal2          v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */

    rshrn           v18.4h, v18.4s, #DESCALE_P2
    rshrn           v22.4h, v22.4s, #DESCALE_P2
    rshrn2          v18.8h, v24.4s, #DESCALE_P2  /* dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS+PASS1_BITS); */
    rshrn2          v22.8h, v25.4s, #DESCALE_P2  /* dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS+PASS1_BITS); */

    /* Odd part */
    add             v8.8h, v28.8h, v31.8h   /* z1 = tmp4 + tmp7; */
    add             v9.8h, v29.8h, v30.8h   /* z2 = tmp5 + tmp6; */
    add             v10.8h, v28.8h, v30.8h  /* z3 = tmp4 + tmp6; */
    add             v11.8h, v29.8h, v31.8h  /* z4 = tmp5 + tmp7; */

    smull           v4.4s, v10.4h, XFIX_P_1_175  /* z5 lo = z3 lo * XFIX_P_1_175 */
    smull2          v5.4s, v10.8h, XFIX_P_1_175
    smlal           v4.4s, v11.4h, XFIX_P_1_175  /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
    smlal2          v5.4s, v11.8h, XFIX_P_1_175

    /* High halves first: the low-half products overwrite their sources */
    smull2          v24.4s, v28.8h, XFIX_P_0_298
    smull2          v25.4s, v29.8h, XFIX_P_2_053
    smull2          v26.4s, v30.8h, XFIX_P_3_072
    smull2          v27.4s, v31.8h, XFIX_P_1_501
    smull           v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
    smull           v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
    smull           v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
    smull           v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */

    smull2          v12.4s, v8.8h, XFIX_N_0_899
    smull2          v13.4s, v9.8h, XFIX_N_2_562
    smull2          v14.4s, v10.8h, XFIX_N_1_961
    smull2          v15.4s, v11.8h, XFIX_N_0_390
    smull           v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, - FIX_0_899976223); */
    smull           v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, - FIX_2_562915447); */
    smull           v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560); */
    smull           v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644); */

    add             v10.4s, v10.4s, v4.4s  /* z3 += z5 */
    add             v14.4s, v14.4s, v5.4s
    add             v11.4s, v11.4s, v4.4s  /* z4 += z5 */
    add             v15.4s, v15.4s, v5.4s

    add             v28.4s, v28.4s, v8.4s   /* tmp4 += z1 */
    add             v24.4s, v24.4s, v12.4s
    add             v29.4s, v29.4s, v9.4s   /* tmp5 += z2 */
    add             v25.4s, v25.4s, v13.4s
    add             v30.4s, v30.4s, v10.4s  /* tmp6 += z3 */
    add             v26.4s, v26.4s, v14.4s
    add             v31.4s, v31.4s, v11.4s  /* tmp7 += z4 */
    add             v27.4s, v27.4s, v15.4s

    add             v28.4s, v28.4s, v10.4s  /* tmp4 += z3 */
    add             v24.4s, v24.4s, v14.4s
    add             v29.4s, v29.4s, v11.4s  /* tmp5 += z4 */
    add             v25.4s, v25.4s, v15.4s
    add             v30.4s, v30.4s, v9.4s   /* tmp6 += z2 */
    add             v26.4s, v26.4s, v13.4s
    add             v31.4s, v31.4s, v8.4s   /* tmp7 += z1 */
    add             v27.4s, v27.4s, v12.4s

    rshrn           v23.4h, v28.4s, #DESCALE_P2
    rshrn           v21.4h, v29.4s, #DESCALE_P2
    rshrn           v19.4h, v30.4s, #DESCALE_P2
    rshrn           v17.4h, v31.4s, #DESCALE_P2
    rshrn2          v23.8h, v24.4s, #DESCALE_P2  /* dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS+PASS1_BITS); */
    rshrn2          v21.8h, v25.4s, #DESCALE_P2  /* dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS+PASS1_BITS); */
    rshrn2          v19.8h, v26.4s, #DESCALE_P2  /* dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS+PASS1_BITS); */
    rshrn2          v17.8h, v27.4s, #DESCALE_P2  /* dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS+PASS1_BITS); */

    /* store results */
    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]

    /* Restore NEON registers; the post-incremented loads also release the
       64 bytes of stack reserved above. */
    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32

    /* Use RET rather than BR x30 so the branch is treated as a subroutine
       return (return prediction, branch target identification). */
    ret

    .unreq          DATA
    .unreq          TMP

#undef XFIX_P_0_298
#undef XFIX_N_0_390
#undef XFIX_P_0_541
#undef XFIX_P_0_765
#undef XFIX_N_0_899
#undef XFIX_P_1_175
#undef XFIX_P_1_501
#undef XFIX_N_1_847
#undef XFIX_N_1_961
#undef XFIX_P_2_053
#undef XFIX_N_2_562
#undef XFIX_P_3_072
2563
2564
2565 /*****************************************************************************/
2566
/*
 * jsimd_fdct_ifast_neon
 *
 * This function contains a fast, not so accurate integer implementation of
 * the forward DCT (Discrete Cosine Transform). It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
 * function from jfdctfst.c
 *
 * In:    x0 = DATA  block of 64 16-bit coefficients (8 rows of 8),
 *                   transformed in place
 * Clobb: x0, x9, flags, v0-v7, v16-v23, v28, v29
 *        (v8-v15 are not touched, so nothing is saved on the stack)
 *
 * The loop body (8x8 transpose followed by a 1-D transform of all eight
 * vectors) runs exactly twice.
 *
 * TODO: can be combined with 'jsimd_convsamp_neon' to get
 *       rid of a bunch of VLD1.16 instructions
 */

#undef XFIX_0_541196100
#define XFIX_0_382683433 v0.h[0]
#define XFIX_0_541196100 v0.h[1]
#define XFIX_0_707106781 v0.h[2]
#define XFIX_1_306562965 v0.h[3]

/* Multipliers for sqdmulh, which returns the high half of the doubled
 * product, i.e. (a * b) >> 15.  Each entry is therefore the constant scaled
 * by 2^15 (n * 128 = n / 256 * 2^15).  1.306562965 does not fit in that
 * format, so only its fractional part (334/256 - 1) is stored and the code
 * adds the multiplicand itself back in afterwards. */
.balign 16
Ljsimd_fdct_ifast_neon_consts:
  .short (98 * 128)               /* XFIX_0_382683433 */
  .short (139 * 128)              /* XFIX_0_541196100 */
  .short (181 * 128)              /* XFIX_0_707106781 */
  .short (334 * 128 - 256 * 128)  /* XFIX_1_306562965 */

asm_function jsimd_fdct_ifast_neon

    DATA            .req x0
    TMP             .req x9

    /* Load constants */
    adr             TMP, Ljsimd_fdct_ifast_neon_consts
    ld1             {v0.4h}, [TMP]

    /* Load all DATA into NEON registers with the following allocation:
     *         0 1 2 3 4 5 6 7
     *      ------------------
     *   0 | v16.8h
     *   1 | v17.8h
     *   2 | v18.8h
     *   3 | v19.8h
     *   4 | v20.8h
     *   5 | v21.8h
     *   6 | v22.8h
     *   7 | v23.8h
     */

    ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
    ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
    mov             TMP, #2          /* TMP is reused as the pass counter */
    sub             DATA, DATA, #64  /* rewind to the block start for the final store */
1:
    /* Transpose */
    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v1, v2, v3, v4
    /* Decrement the pass counter early; none of the vector instructions
       below modify the flags, so the result is still valid at the b.ne. */
    subs            TMP, TMP, #1
    /* 1-D FDCT */
    add             v4.8h, v19.8h, v20.8h
    sub             v20.8h, v19.8h, v20.8h
    sub             v28.8h, v18.8h, v21.8h
    add             v18.8h, v18.8h, v21.8h
    sub             v29.8h, v17.8h, v22.8h
    add             v17.8h, v17.8h, v22.8h
    sub             v21.8h, v16.8h, v23.8h
    add             v16.8h, v16.8h, v23.8h
    sub             v6.8h, v17.8h, v18.8h
    sub             v7.8h, v16.8h, v4.8h
    add             v5.8h, v17.8h, v18.8h
    add             v6.8h, v6.8h, v7.8h
    add             v4.8h, v16.8h, v4.8h
    sqdmulh         v6.8h, v6.8h, XFIX_0_707106781
    add             v19.8h, v20.8h, v28.8h
    add             v16.8h, v4.8h, v5.8h
    sub             v20.8h, v4.8h, v5.8h
    add             v5.8h, v28.8h, v29.8h
    add             v29.8h, v29.8h, v21.8h
    sqdmulh         v5.8h, v5.8h, XFIX_0_707106781
    sub             v28.8h, v19.8h, v29.8h
    add             v18.8h, v7.8h, v6.8h
    sqdmulh         v28.8h, v28.8h, XFIX_0_382683433
    sub             v22.8h, v7.8h, v6.8h
    sqdmulh         v19.8h, v19.8h, XFIX_0_541196100
    sqdmulh         v7.8h, v29.8h, XFIX_1_306562965  /* fractional part only */
    add             v6.8h, v21.8h, v5.8h
    sub             v5.8h, v21.8h, v5.8h
    add             v29.8h, v29.8h, v28.8h
    add             v19.8h, v19.8h, v28.8h
    add             v29.8h, v29.8h, v7.8h            /* completes the 1.3066 product */
    add             v21.8h, v5.8h, v19.8h
    sub             v19.8h, v5.8h, v19.8h
    add             v17.8h, v6.8h, v29.8h
    sub             v23.8h, v6.8h, v29.8h

    b.ne            1b

    /* store results */
    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]

    /* Use RET rather than BR x30 so the branch is treated as a subroutine
       return (return prediction, branch target identification). */
    ret

    .unreq          DATA
    .unreq          TMP
#undef XFIX_0_382683433
#undef XFIX_0_541196100
#undef XFIX_0_707106781
#undef XFIX_1_306562965
2673
2674
2675 /*****************************************************************************/
2676
2677 /*
2678  * GLOBAL(void)
2679  * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM *divisors,
2680  *                      DCTELEM *workspace);
2681  *
      * Quantizes the 64 16-bit values in workspace and stores the 64 results to
      * coef_block.  For every element w the code computes
      *
      *     q = (((|w| + correction) * reciprocal) >> 16) >> shift
      *
      * using an unsigned 16x16->32 multiply, and then re-applies the sign of w
      * to q.  The three per-element tables are read from the divisors buffer:
      * reciprocal at byte offset 0, correction at byte offset 64 * 2 and shift
      * at byte offset 64 * 6, each as 64 halfwords.
      *
      * In:       x0 = coef_block, x1 = divisors, x2 = workspace
      * Clobbers: x0-x2 (advanced past the data), x9-x11, v0-v7, v16-v31, flags
      * Stack:    not used
2682  */
2683 asm_function jsimd_quantize_neon
2684
2685     COEF_BLOCK      .req x0
2686     DIVISORS        .req x1
2687     WORKSPACE       .req x2
2688
2689     RECIPROCAL      .req DIVISORS
2690     CORRECTION      .req x9
2691     SHIFT           .req x10
2692     LOOP_COUNT      .req x11
2693
2694     mov             LOOP_COUNT, #2  /* 2 passes of 32 elements each */
2695     add             CORRECTION, DIVISORS, #(64 * 2)
2696     add             SHIFT, DIVISORS, #(64 * 6)
2697 1:
2698     subs            LOOP_COUNT, LOOP_COUNT, #1  /* flags consumed by b.ne below */
2699     ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [WORKSPACE], 64
2700     ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [CORRECTION], 64
2701     abs             v20.8h, v0.8h
2702     abs             v21.8h, v1.8h
2703     abs             v22.8h, v2.8h
2704     abs             v23.8h, v3.8h
2705     ld1             {v28.8h, v29.8h, v30.8h, v31.8h}, [RECIPROCAL], 64
2706     add             v20.8h, v20.8h, v4.8h  /* add correction */
2707     add             v21.8h, v21.8h, v5.8h
2708     add             v22.8h, v22.8h, v6.8h
2709     add             v23.8h, v23.8h, v7.8h
2710     umull           v4.4s, v20.4h, v28.4h  /* multiply by reciprocal */
2711     umull2          v16.4s, v20.8h, v28.8h
2712     umull           v5.4s, v21.4h, v29.4h
2713     umull2          v17.4s, v21.8h, v29.8h
2714     umull           v6.4s, v22.4h, v30.4h  /* multiply by reciprocal */
2715     umull2          v18.4s, v22.8h, v30.8h
2716     umull           v7.4s, v23.4h, v31.4h
2717     umull2          v19.4s, v23.8h, v31.8h
2718     ld1             {v24.8h, v25.8h, v26.8h, v27.8h}, [SHIFT], 64
2719     shrn            v4.4h, v4.4s, #16  /* keep the high half of each product */
2720     shrn            v5.4h, v5.4s, #16
2721     shrn            v6.4h, v6.4s, #16
2722     shrn            v7.4h, v7.4s, #16
2723     shrn2           v4.8h, v16.4s, #16
2724     shrn2           v5.8h, v17.4s, #16
2725     shrn2           v6.8h, v18.4s, #16
2726     shrn2           v7.8h, v19.4s, #16
2727     neg             v24.8h, v24.8h  /* negated count makes ushl shift right */
2728     neg             v25.8h, v25.8h
2729     neg             v26.8h, v26.8h
2730     neg             v27.8h, v27.8h
2731     sshr            v0.8h, v0.8h, #15  /* extract sign */
2732     sshr            v1.8h, v1.8h, #15
2733     sshr            v2.8h, v2.8h, #15
2734     sshr            v3.8h, v3.8h, #15
2735     ushl            v4.8h, v4.8h, v24.8h  /* shift */
2736     ushl            v5.8h, v5.8h, v25.8h
2737     ushl            v6.8h, v6.8h, v26.8h
2738     ushl            v7.8h, v7.8h, v27.8h
2739
2740     eor             v4.16b, v4.16b, v0.16b  /* restore sign */
2741     eor             v5.16b, v5.16b, v1.16b
2742     eor             v6.16b, v6.16b, v2.16b
2743     eor             v7.16b, v7.16b, v3.16b
2744     sub             v4.8h, v4.8h, v0.8h  /* (q ^ s) - s, s = 0 or -1 */
2745     sub             v5.8h, v5.8h, v1.8h
2746     sub             v6.8h, v6.8h, v2.8h
2747     sub             v7.8h, v7.8h, v3.8h
2748     st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [COEF_BLOCK], 64
2749
2750     b.ne            1b
2751
2752     ret  /* return through x30; unlike br x30 this is hinted as a return */
2753
2754     .unreq          COEF_BLOCK
2755     .unreq          DIVISORS
2756     .unreq          WORKSPACE
2757     .unreq          RECIPROCAL
2758     .unreq          CORRECTION
2759     .unreq          SHIFT
2760     .unreq          LOOP_COUNT
2761
2762
2763 /*****************************************************************************/
2764
2765 /*
2766  * Downsample pixel values of a single component.
2767  * This version handles the common case of 2:1 horizontal and 1:1 vertical,
2768  * without smoothing.
2769  *
2770  * GLOBAL(void)
2771  * jsimd_h2v1_downsample_neon (JDIMENSION image_width, int max_v_samp_factor,
2772  *                             JDIMENSION v_samp_factor,
2773  *                             JDIMENSION width_blocks, JSAMPARRAY input_data,
2774  *                             JSAMPARRAY output_data);
2775  */
2776
2777 .balign 16
2778 Ljsimd_h2_downsample_neon_consts:
     /*
      * Sixteen 16-byte TBL index vectors, one per row.  Row "diff N" is the
      * identity permutation for byte positions 0 .. 15-N and repeats index
      * 15-N in the last N positions, so a TBL with that row copies the last
      * kept byte of a 16-byte chunk over its N trailing bytes.
      * Both h2 downsample functions pick the row at byte offset
      * (width_blocks * 16 - image_width) * 16 from this label and apply it to
      * the final chunk of every row ("expand right").
      */
2779   .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
2780         0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F  /* diff 0 */
2781   .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
2782         0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E  /* diff 1 */
2783   .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
2784         0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D  /* diff 2 */
2785   .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
2786         0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C  /* diff 3 */
2787   .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
2788         0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B  /* diff 4 */
2789   .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
2790         0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A  /* diff 5 */
2791   .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
2792         0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09  /* diff 6 */
2793   .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
2794         0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08  /* diff 7 */
2795   .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
2796         0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07  /* diff 8 */
2797   .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, \
2798         0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06  /* diff 9 */
2799   .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, \
2800         0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05  /* diff 10 */
2801   .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, \
2802         0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04  /* diff 11 */
2803   .byte 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, \
2804         0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03  /* diff 12 */
2805   .byte 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, \
2806         0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02  /* diff 13 */
2807   .byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, \
2808         0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01  /* diff 14 */
2809   .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
2810         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* diff 15 */
2811
2812 asm_function jsimd_h2v1_downsample_neon
     /*
      * In:       w0 = image_width, w1 = max_v_samp_factor (never read),
      *           w2 = v_samp_factor (row count), w3 = width_blocks,
      *           x4 = input_data (array of row pointers),
      *           x5 = output_data (array of row pointers)
      * Each row is consumed in width_blocks chunks of 16 input bytes; every
      * chunk yields 8 output bytes, each (a + b + bias) >> 1 for one pair of
      * horizontally adjacent input bytes, with bias alternating 0, 1, 0, 1 ...
      * The last chunk of a row is first run through TBL with the row of
      * Ljsimd_h2_downsample_neon_consts selected by
      * width_blocks * 16 - image_width, which replicates the last kept byte.
      * Both loops test their counter after the body, so v_samp_factor and
      * width_blocks must be non-zero.
      * Clobbers: x0, x2-x5, x9-x13, w15, v0, v2, v4, v6, v16, v18, flags
      */
2813     IMAGE_WIDTH     .req x0
2814     MAX_V_SAMP      .req x1
2815     V_SAMP          .req x2
2816     BLOCK_WIDTH     .req x3
2817     INPUT_DATA      .req x4
2818     OUTPUT_DATA     .req x5
2819     OUTPTR          .req x9
2820     INPTR           .req x10
2821     TMP1            .req x11
2822     TMP2            .req x12
2823     TMP3            .req x13
2824     TMPDUP          .req w15
2825
         /* image_width, v_samp_factor and width_blocks are 32-bit (JDIMENSION)
            arguments, so the procedure call standard does not guarantee the
            upper 32 bits of x0, x2 and x3.  Zero them before the registers
            are used as 64-bit values below. */
         uxtw            x0, w0
         uxtw            x2, w2
         uxtw            x3, w3

2826     mov             TMPDUP, #0x10000  /* as halfword pairs: bias 0, 1 */
2827     lsl             TMP2, BLOCK_WIDTH, #4
2828     sub             TMP2, TMP2, IMAGE_WIDTH  /* padding bytes in last chunk */
2829     adr             TMP3, Ljsimd_h2_downsample_neon_consts
2830     add             TMP3, TMP3, TMP2, lsl #4  /* 16 bytes per table row */
2831     dup             v16.4s, TMPDUP
2832     ld1             {v18.16b}, [TMP3]
2833
2834 1:  /* row loop */
2835     ldr             INPTR, [INPUT_DATA], #8
2836     ldr             OUTPTR, [OUTPUT_DATA], #8
2837     subs            TMP1, BLOCK_WIDTH, #1  /* chunks before the last one */
2838     b.eq            3f
2839 2:  /* columns */
2840     ld1             {v0.16b}, [INPTR], #16
2841     mov             v4.16b, v16.16b  /* start from the bias */
2842     subs            TMP1, TMP1, #1
2843     uadalp          v4.8h, v0.16b  /* add each adjacent byte pair */
2844     shrn            v6.8b, v4.8h, #1
2845     st1             {v6.8b}, [OUTPTR], #8
2846     b.ne            2b
2847 3:  /* last columns */
2848     ld1             {v0.16b}, [INPTR]
2849     mov             v4.16b, v16.16b
2850     subs            V_SAMP, V_SAMP, #1  /* flags consumed by b.ne 1b */
2851     /* expand right */
2852     tbl             v2.16b, {v0.16b}, v18.16b
2853     uadalp          v4.8h, v2.16b
2854     shrn            v6.8b, v4.8h, #1
2855     st1             {v6.8b}, [OUTPTR], #8
2856     b.ne            1b
2857
2858     ret  /* return through x30; unlike br x30 this is hinted as a return */
2859
2860     .unreq          IMAGE_WIDTH
2861     .unreq          MAX_V_SAMP
2862     .unreq          V_SAMP
2863     .unreq          BLOCK_WIDTH
2864     .unreq          INPUT_DATA
2865     .unreq          OUTPUT_DATA
2866     .unreq          OUTPTR
2867     .unreq          INPTR
2868     .unreq          TMP1
2869     .unreq          TMP2
2870     .unreq          TMP3
2871     .unreq          TMPDUP
2872
2873
2874 /*****************************************************************************/
2875
2876 /*
2877  * Downsample pixel values of a single component.
2878  * This version handles the common case of 2:1 horizontal and 2:1 vertical,
2879  * without smoothing.
2880  *
2881  * GLOBAL(void)
2882  * jsimd_h2v2_downsample_neon (JDIMENSION image_width, int max_v_samp_factor,
2883  *                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
2884  *                             JSAMPARRAY input_data, JSAMPARRAY output_data);
2885  */
2886
2887 .balign 16
2888 asm_function jsimd_h2v2_downsample_neon
     /*
      * In:       w0 = image_width, w1 = max_v_samp_factor (never read),
      *           w2 = v_samp_factor (output row count), w3 = width_blocks,
      *           x4 = input_data (array of row pointers, two consumed per
      *           output row), x5 = output_data (array of row pointers)
      * Each output row is built from width_blocks chunks of 16 bytes taken
      * from two input rows; every chunk yields 8 output bytes, each
      * (a + b + c + d + bias) >> 2 for one 2x2 group of input bytes, with bias
      * alternating 1, 2, 1, 2 ...  The last chunk of both input rows is first
      * run through TBL with the row of Ljsimd_h2_downsample_neon_consts
      * selected by width_blocks * 16 - image_width, which replicates the last
      * kept byte.
      * Both loops test their counter after the body, so v_samp_factor and
      * width_blocks must be non-zero.
      * Clobbers: x0, x2-x5, x9-x14, w15, v0-v4, v6, v16, v18, flags
      */
2889     IMAGE_WIDTH     .req x0
2890     MAX_V_SAMP      .req x1
2891     V_SAMP          .req x2
2892     BLOCK_WIDTH     .req x3
2893     INPUT_DATA      .req x4
2894     OUTPUT_DATA     .req x5
2895     OUTPTR          .req x9
2896     INPTR0          .req x10
2897     INPTR1          .req x14
2898     TMP1            .req x11
2899     TMP2            .req x12
2900     TMP3            .req x13
2901     TMPDUP          .req w15
2902
         /* image_width, v_samp_factor and width_blocks are 32-bit (JDIMENSION)
            arguments, so the procedure call standard does not guarantee the
            upper 32 bits of x0, x2 and x3.  Zero them before the registers
            are used as 64-bit values below. */
         uxtw            x0, w0
         uxtw            x2, w2
         uxtw            x3, w3

2903     mov             TMPDUP, #1
2904     lsl             TMP2, BLOCK_WIDTH, #4
2905     lsl             TMPDUP, TMPDUP, #17
2906     sub             TMP2, TMP2, IMAGE_WIDTH  /* padding bytes in last chunk */
2907     adr             TMP3, Ljsimd_h2_downsample_neon_consts
2908     orr             TMPDUP, TMPDUP, #1  /* 0x00020001: bias 1, 2 per pair */
2909     add             TMP3, TMP3, TMP2, lsl #4  /* 16 bytes per table row */
2910     dup             v16.4s, TMPDUP
2911     ld1             {v18.16b}, [TMP3]
2912
2913 1:  /* row loop */
2914     ldr             INPTR0, [INPUT_DATA], #8
2915     ldr             OUTPTR, [OUTPUT_DATA], #8
2916     ldr             INPTR1, [INPUT_DATA], #8
2917     subs            TMP1, BLOCK_WIDTH, #1  /* chunks before the last one */
2918     b.eq            3f
2919 2:  /* columns */
2920     ld1             {v0.16b}, [INPTR0], #16
2921     ld1             {v1.16b}, [INPTR1], #16
2922     mov             v4.16b, v16.16b  /* start from the bias */
2923     subs            TMP1, TMP1, #1
2924     uadalp          v4.8h, v0.16b  /* add adjacent pairs of row 0 */
2925     uadalp          v4.8h, v1.16b  /* add adjacent pairs of row 1 */
2926     shrn            v6.8b, v4.8h, #2
2927     st1             {v6.8b}, [OUTPTR], #8
2928     b.ne            2b
2929 3:  /* last columns */
2930     ld1             {v0.16b}, [INPTR0], #16
2931     ld1             {v1.16b}, [INPTR1], #16
2932     mov             v4.16b, v16.16b
2933     subs            V_SAMP, V_SAMP, #1  /* flags consumed by b.ne 1b */
2934     /* expand right */
2935     tbl             v2.16b, {v0.16b}, v18.16b
2936     tbl             v3.16b, {v1.16b}, v18.16b
2937     uadalp          v4.8h, v2.16b
2938     uadalp          v4.8h, v3.16b
2939     shrn            v6.8b, v4.8h, #2
2940     st1             {v6.8b}, [OUTPTR], #8
2941     b.ne            1b
2942
2943     ret  /* return through x30; unlike br x30 this is hinted as a return */
2944
2945     .unreq          IMAGE_WIDTH
2946     .unreq          MAX_V_SAMP
2947     .unreq          V_SAMP
2948     .unreq          BLOCK_WIDTH
2949     .unreq          INPUT_DATA
2950     .unreq          OUTPUT_DATA
2951     .unreq          OUTPTR
2952     .unreq          INPTR0
2953     .unreq          INPTR1
2954     .unreq          TMP1
2955     .unreq          TMP2
2956     .unreq          TMP3
2957     .unreq          TMPDUP
2958
2959
2960 /*****************************************************************************/
2961
2962 /*
2963  * GLOBAL(JOCTET*)
2964  * jsimd_huff_encode_one_block (working_state *state, JOCTET *buffer,
2965  *                              JCOEFPTR block, int last_dc_val,
2966  *                              c_derived_tbl *dctbl, c_derived_tbl *actbl)
2967  *
2968  */
2969
2970     BUFFER          .req x1
2971     PUT_BUFFER      .req x6
2972     PUT_BITS        .req x7
2973     PUT_BITSw       .req w7
2974
     /*
      * emit_byte: take the next 8 pending bits out of the bit accumulator.
      *   PUT_BITS -= 8; byte = (PUT_BUFFER >> PUT_BITS) & 0xff;
      *   the byte is stored with pre-increment of BUFFER, and a 0xff byte is
      *   followed by an extra 0x00 byte.
      * Clobbers x19 and the condition flags (left as set by the 0xff compare).
      */
2975 .macro emit_byte
2976     sub             PUT_BITS, PUT_BITS, #0x8
2977     lsr             x19, PUT_BUFFER, PUT_BITS
2978     and             w19, w19, #0xff
2979     cmp             w19, #0xff              /* strb below leaves flags alone */
2980     strb            w19, [BUFFER, #1]!
2981     b.ne            14f
2982     strb            wzr, [BUFFER, #1]!      /* stuff 0x00 after 0xff */
2983 14:
2984 .endm
     /*
      * put_bits CODE, SIZE: append the value in register CODE as SIZE bits at
      * the low end of PUT_BUFFER and add SIZE to PUT_BITS.  CODE is OR-ed in
      * unmasked.
      */
2985 .macro put_bits CODE, SIZE
2986     lsl             PUT_BUFFER, PUT_BUFFER, \SIZE
2987     add             PUT_BITS, PUT_BITS, \SIZE
2988     orr             PUT_BUFFER, PUT_BUFFER, \CODE
2989 .endm
     /* checkbuf31: when PUT_BITS >= 32 (signed compare), emit four bytes. */
2990 .macro checkbuf31
2991     cmp             PUT_BITS, #0x20
2992     b.lt            31f
2993     .rept 4
2994     emit_byte
2995     .endr
2997 31:
2998 .endm
     /* checkbuf47: when PUT_BITS >= 48 (signed compare), emit six bytes. */
2999 .macro checkbuf47
3000     cmp             PUT_BITS, #0x30
3001     b.lt            47f
3002     .rept 6
3003     emit_byte
3004     .endr
3008 47:
3009 .endm
3010
3011 .macro generate_jsimd_huff_encode_one_block fast_tbl
     /*
      * Each expansion emits one constant table and one encoder function:
      *   fast_tbl == 1: jsimd_huff_encode_one_block_neon, which reorders the
      *                  block with TBL/TBX using the index vectors stored
      *                  after the bit-weight constant;
      *   otherwise:     jsimd_huff_encode_one_block_neon_slowtbl, which
      *                  gathers the coefficients with single-lane loads.
      *
      * Arguments, following the C prototype that precedes the helper macros:
      *   x0 = state, x1 = buffer, x2 = block, w3 = last_dc_val,
      *   x4 = dctbl, x5 = actbl
      * Returns x0 = address one past the last byte written (BUFFER + 1).
      *
      * Stack frame (272 bytes): [sp] = saved x19/x20, [sp + 0x10] = 64
      * halfwords "t1", [sp + 0x90] = 64 halfwords "t2".
      */
3012
3013 .balign 16
3014 .if \fast_tbl == 1
3015 Ljsimd_huff_encode_one_block_neon_consts:
3016 .else
3017 Ljsimd_huff_encode_one_block_neon_slowtbl_consts:
3018 .endif
3019     .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
3020           0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
3021 .if \fast_tbl == 1
3022     .byte    0,   1,   2,   3,  16,  17,  32,  33, \
3023             18,  19,   4,   5,   6,   7,  20,  21  /* L0 => L3 : 4 lines OK */
3024     .byte   34,  35,  48,  49, 255, 255,  50,  51, \
3025             36,  37,  22,  23,   8,   9,  10,  11  /* L0 => L3 : 4 lines OK */
3026     .byte    8,   9,  22,  23,  36,  37,  50,  51, \
3027            255, 255, 255, 255, 255, 255,  52,  53  /* L1 => L4 : 4 lines OK */
3028     .byte   54,  55,  40,  41,  26,  27,  12,  13, \
3029             14,  15,  28,  29,  42,  43,  56,  57  /* L0 => L3 : 4 lines OK */
3030     .byte    6,   7,  20,  21,  34,  35,  48,  49, \
3031             50,  51,  36,  37,  22,  23,   8,   9  /* L4 => L7 : 4 lines OK */
3032     .byte   42,  43,  28,  29,  14,  15,  30,  31, \
3033             44,  45,  58,  59, 255, 255, 255, 255  /* L1 => L4 : 4 lines OK */
3034     .byte  255, 255, 255, 255,  56,  57,  42,  43, \
3035             28,  29,  14,  15,  30,  31,  44,  45  /* L3 => L6 : 4 lines OK */
3036     .byte   26,  27,  40,  41,  42,  43,  28,  29, \
3037             14,  15,  30,  31,  44,  45,  46,  47  /* L5 => L7 : 3 lines OK */
3038     .byte  255, 255, 255, 255,   0,   1, 255, 255, \
3039            255, 255, 255, 255, 255, 255, 255, 255  /* L4 : 1 lines OK */
3040     .byte  255, 255, 255, 255, 255, 255, 255, 255, \
3041              0,   1,  16,  17,   2,   3, 255, 255  /* L5 => L6 : 2 lines OK */
3042     .byte  255, 255, 255, 255, 255, 255, 255, 255, \
3043            255, 255, 255, 255,   8,   9,  22,  23  /* L5 => L6 : 2 lines OK */
3044     .byte    4,   5,   6,   7, 255, 255, 255, 255, \
3045            255, 255, 255, 255, 255, 255, 255, 255  /* L7 : 1 line OK */
3046 .endif
3047
3048 .if \fast_tbl == 1
3049 asm_function jsimd_huff_encode_one_block_neon
3050 .else
3051 asm_function jsimd_huff_encode_one_block_neon_slowtbl
3052 .endif
3053     sub             sp, sp, 272             /* 16 (x19/x20) + 2 * 128 (t1, t2) */
3054     sub             BUFFER, BUFFER, #0x1    /* BUFFER=buffer-- */
3055     /* Save ARM registers */
3056     stp             x19, x20, [sp]
3057 .if \fast_tbl == 1
3058     adr             x15, Ljsimd_huff_encode_one_block_neon_consts
3059 .else
3060     adr             x15, Ljsimd_huff_encode_one_block_neon_slowtbl_consts
3061 .endif
         /* NOTE(review): 0x10 / 0x18 are presumably the offsets of the
            put_buffer / put_bits fields inside *state -- confirm against the
            C definition of working_state. */
3062     ldr             PUT_BUFFER, [x0, #0x10]
3063     ldr             PUT_BITSw, [x0, #0x18]
3064     ldrsh           w12, [x2]               /* load DC coeff in w12 */
3065     /* prepare data */
3066 .if \fast_tbl == 1
3067     ld1             {v23.16b}, [x15], #16
3068     ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x15], #64
3069     ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x15], #64
3070     ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x15], #64
3071     ld1             {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #64
3072     ld1             {v28.16b, v29.16b, v30.16b, v31.16b}, [x2], #64
3073     sub             w12, w12, w3      /* last_dc_val, not used afterwards */
3074     /* ZigZag 8x8 */
3075     tbl             v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b
3076     tbl             v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b
3077     tbl             v2.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v2.16b
3078     tbl             v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b
3079     tbl             v4.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v4.16b
3080     tbl             v5.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v5.16b
3081     tbl             v6.16b, {v27.16b, v28.16b, v29.16b, v30.16b}, v6.16b
3082     tbl             v7.16b, {v29.16b, v30.16b, v31.16b}, v7.16b
3083     ins             v0.h[0], w12
3084     tbx             v1.16b, {v28.16b}, v16.16b
3085     tbx             v2.16b, {v29.16b, v30.16b}, v17.16b
3086     tbx             v5.16b, {v29.16b, v30.16b}, v18.16b
3087     tbx             v6.16b, {v31.16b}, v19.16b
3088 .else
3089       add             x13, x2, #0x22
3090       sub             w12, w12, w3    /* last_dc_val, not used afterwards */
3091     ld1             {v23.16b}, [x15]
3092       add             x14, x2, #0x18
3093       add             x3, x2, #0x36
3094     ins             v0.h[0], w12
3095       add             x9, x2, #0x2
3096     ld1             {v1.h}[0], [x13]
3097       add             x15, x2, #0x30
3098     ld1             {v2.h}[0], [x14]
3099       add             x19, x2, #0x26
3100     ld1             {v3.h}[0], [x3]
3101       add             x20, x2, #0x28
3102     ld1             {v0.h}[1], [x9]
3103       add             x12, x2, #0x10
3104     ld1             {v1.h}[1], [x15]
3105       add             x13, x2, #0x40
3106     ld1             {v2.h}[1], [x19]
3107       add             x14, x2, #0x34
3108     ld1             {v3.h}[1], [x20]
3109       add             x3, x2, #0x1a
3110     ld1             {v0.h}[2], [x12]
3111       add             x9, x2, #0x20
3112     ld1             {v1.h}[2], [x13]
3113       add             x15, x2, #0x32
3114     ld1             {v2.h}[2], [x14]
3115       add             x19, x2, #0x42
3116     ld1             {v3.h}[2], [x3]
3117       add             x20, x2, #0xc
3118     ld1             {v0.h}[3], [x9]
3119       add             x12, x2, #0x12
3120     ld1             {v1.h}[3], [x15]
3121       add             x13, x2, #0x24
3122     ld1             {v2.h}[3], [x19]
3123       add             x14, x2, #0x50
3124     ld1             {v3.h}[3], [x20]
3125       add             x3, x2, #0xe
3126     ld1             {v0.h}[4], [x12]
3127       add             x9, x2, #0x4
3128     ld1             {v1.h}[4], [x13]
3129       add             x15, x2, #0x16
3130     ld1             {v2.h}[4], [x14]
3131       add             x19, x2, #0x60
3132     ld1             {v3.h}[4], [x3]
3133       add             x20, x2, #0x1c
3134     ld1             {v0.h}[5], [x9]
3135       add             x12, x2, #0x6
3136     ld1             {v1.h}[5], [x15]
3137       add             x13, x2, #0x8
3138     ld1             {v2.h}[5], [x19]
3139       add             x14, x2, #0x52
3140     ld1             {v3.h}[5], [x20]
3141       add             x3, x2, #0x2a
3142     ld1             {v0.h}[6], [x12]
3143       add             x9, x2, #0x14
3144     ld1             {v1.h}[6], [x13]
3145       add             x15, x2, #0xa
3146     ld1             {v2.h}[6], [x14]
3147       add             x19, x2, #0x44
3148     ld1             {v3.h}[6], [x3]
3149       add             x20, x2, #0x38
3150     ld1             {v0.h}[7], [x9]
3151       add             x12, x2, #0x46
3152     ld1             {v1.h}[7], [x15]
3153       add             x13, x2, #0x3a
3154     ld1             {v2.h}[7], [x19]
3155       add             x14, x2, #0x74
3156     ld1             {v3.h}[7], [x20]
3157       add             x3, x2, #0x6a
3158     ld1             {v4.h}[0], [x12]
3159       add             x9, x2, #0x54
3160     ld1             {v5.h}[0], [x13]
3161       add             x15, x2, #0x2c
3162     ld1             {v6.h}[0], [x14]
3163       add             x19, x2, #0x76
3164     ld1             {v7.h}[0], [x3]
3165       add             x20, x2, #0x78
3166     ld1             {v4.h}[1], [x9]
3167       add             x12, x2, #0x62
3168     ld1             {v5.h}[1], [x15]
3169       add             x13, x2, #0x1e
3170     ld1             {v6.h}[1], [x19]
3171       add             x14, x2, #0x68
3172     ld1             {v7.h}[1], [x20]
3173       add             x3, x2, #0x7a
3174     ld1             {v4.h}[2], [x12]
3175       add             x9, x2, #0x70
3176     ld1             {v5.h}[2], [x13]
3177       add             x15, x2, #0x2e
3178     ld1             {v6.h}[2], [x14]
3179       add             x19, x2, #0x5a
3180     ld1             {v7.h}[2], [x3]
3181       add             x20, x2, #0x6c
3182     ld1             {v4.h}[3], [x9]
3183       add             x12, x2, #0x72
3184     ld1             {v5.h}[3], [x15]
3185       add             x13, x2, #0x3c
3186     ld1             {v6.h}[3], [x19]
3187       add             x14, x2, #0x4c
3188     ld1             {v7.h}[3], [x20]
3189       add             x3, x2, #0x5e
3190     ld1             {v4.h}[4], [x12]
3191       add             x9, x2, #0x64
3192     ld1             {v5.h}[4], [x13]
3193       add             x15, x2, #0x4a
3194     ld1             {v6.h}[4], [x14]
3195       add             x19, x2, #0x3e
3196     ld1             {v7.h}[4], [x3]
3197       add             x20, x2, #0x6e
3198     ld1             {v4.h}[5], [x9]
3199       add             x12, x2, #0x56
3200     ld1             {v5.h}[5], [x15]
3201       add             x13, x2, #0x58
3202     ld1             {v6.h}[5], [x19]
3203       add             x14, x2, #0x4e
3204     ld1             {v7.h}[5], [x20]
3205       add             x3, x2, #0x7c
3206     ld1             {v4.h}[6], [x12]
3207       add             x9, x2, #0x48
3208     ld1             {v5.h}[6], [x13]
3209       add             x15, x2, #0x66
3210     ld1             {v6.h}[6], [x14]
3211       add             x19, x2, #0x5c
3212     ld1             {v7.h}[6], [x3]
3213       add             x20, x2, #0x7e
3214     ld1             {v4.h}[7], [x9]
3215     ld1             {v5.h}[7], [x15]
3216     ld1             {v6.h}[7], [x19]
3217     ld1             {v7.h}[7], [x20]
3218 .endif
         /* From here on v0-v7 hold the 64 reordered values (element 0 is the
            DC difference).  v24-v31 become sign-mask ^ |value| and v0-v7
            become |value|. */
3219     cmlt            v24.8h, v0.8h, #0
3220     cmlt            v25.8h, v1.8h, #0
3221     cmlt            v26.8h, v2.8h, #0
3222     cmlt            v27.8h, v3.8h, #0
3223     cmlt            v28.8h, v4.8h, #0
3224     cmlt            v29.8h, v5.8h, #0
3225     cmlt            v30.8h, v6.8h, #0
3226     cmlt            v31.8h, v7.8h, #0
3227     abs             v0.8h, v0.8h
3228     abs             v1.8h, v1.8h
3229     abs             v2.8h, v2.8h
3230     abs             v3.8h, v3.8h
3231     abs             v4.8h, v4.8h
3232     abs             v5.8h, v5.8h
3233     abs             v6.8h, v6.8h
3234     abs             v7.8h, v7.8h
3235     eor             v24.16b, v24.16b, v0.16b
3236     eor             v25.16b, v25.16b, v1.16b
3237     eor             v26.16b, v26.16b, v2.16b
3238     eor             v27.16b, v27.16b, v3.16b
3239     eor             v28.16b, v28.16b, v4.16b
3240     eor             v29.16b, v29.16b, v5.16b
3241     eor             v30.16b, v30.16b, v6.16b
3242     eor             v31.16b, v31.16b, v7.16b
         /* Build a 64-bit mask with one bit per element that is zero
            (interleaved with the scalar DC code/size lookup and output). */
3243     cmeq            v16.8h, v0.8h, #0
3244     cmeq            v17.8h, v1.8h, #0
3245     cmeq            v18.8h, v2.8h, #0
3246     cmeq            v19.8h, v3.8h, #0
3247     cmeq            v20.8h, v4.8h, #0
3248     cmeq            v21.8h, v5.8h, #0
3249     cmeq            v22.8h, v6.8h, #0
3250     xtn             v16.8b, v16.8h
3251     xtn             v18.8b, v18.8h
3252     xtn             v20.8b, v20.8h
3253     xtn             v22.8b, v22.8h
3254       umov            w14, v0.h[0]
3255     xtn2            v16.16b, v17.8h
3256       umov            w13, v24.h[0]
3257     xtn2            v18.16b, v19.8h
3258       clz             w14, w14
3259     xtn2            v20.16b, v21.8h
3260       lsl             w13, w13, w14
3261     cmeq            v17.8h, v7.8h, #0
3262       sub             w12, w14, #32
3263     xtn2            v22.16b, v17.8h
3264       lsr             w13, w13, w14
3265     and             v16.16b, v16.16b, v23.16b
3266       neg             w12, w12                 /* w12 = 32 - clz = bit count of DC diff */
3267     and             v18.16b, v18.16b, v23.16b
3268       add             x3, x4, #0x400           /* x3 = dctbl->ehufsi */
3269     and             v20.16b, v20.16b, v23.16b
3270       add             x15, sp, #0x90           /* x15 = t2 */
3271     and             v22.16b, v22.16b, v23.16b
3272       ldr             w10, [x4, x12, lsl #2]
3273     addp            v16.16b, v16.16b, v18.16b
3274       ldrb            w11, [x3, x12]
3275     addp            v20.16b, v20.16b, v22.16b
3276       checkbuf47
3277     addp            v16.16b, v16.16b, v20.16b
3278       put_bits        x10, x11
3279     addp            v16.16b, v16.16b, v18.16b
3280       checkbuf47
3281     umov            x9,v16.D[0]
3282       put_bits        x13, x12
3283     cnt             v17.8b, v16.8b
3284       mvn             x9, x9
3285     addv            B18, v17.8b
3286       add             x4, x5, #0x400   /* x4 = actbl->ehufsi */
3287     umov            w12, v18.b[0]      /* w12 = number of zero elements */
3288       lsr             x9, x9, #0x1     /* clear AC coeff */
3289     ldr             w13, [x5, #0x3c0]  /* x13 = actbl->ehufco[0xf0] */
3290     rbit            x9, x9             /* x9 = index0 */
3291     ldrb            w14, [x4, #0xf0]   /* x14 = actbl->ehufsi[0xf0] */
3292     cmp             w12, #(64-8)
3293     add             x11, sp, #16
3294     b.lt            4f                 /* fewer than 56 zeros: vector path */
3295     cbz             x9, 6f             /* no non-zero AC element at all */
         /* Sparse path: store |value| (t1) and sign-mask ^ |value| (t2) as is;
            bit counts are computed per non-zero element in the loop below. */
3296     st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
3297     st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
3298     st1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
3299     st1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
3300 1:
3301     clz             x2, x9                  /* x2 = run of zero elements */
3302     add             x15, x15, x2, lsl #1
3303     lsl             x9, x9, x2
3304     ldrh            w20, [x15, #-126]       /* t1 entry of the next non-zero element */
3305 2:
3306     cmp             x2, #0x10
3307     b.lt            3f
3308     sub             x2, x2, #0x10
3309     checkbuf47
3310     put_bits        x13, x14                /* run of 16: emit the 0xf0 code */
3311     b               2b
3312 3:
3313     clz             w20, w20
3314     ldrh            w3, [x15, #2]!          /* t2 entry; x15 advances to it */
3315     sub             w11, w20, #32
3316     lsl             w3, w3, w20
3317     neg             w11, w11                /* w11 = 32 - clz = bit count */
3318     lsr             w3, w3, w20             /* keep only the low w11 bits */
3319     add             x2, x11, x2, lsl #4     /* symbol = (run << 4) + bit count */
3320     lsl             x9, x9, #0x1
3321     ldr             w12, [x5, x2, lsl #2]
3322     ldrb            w10, [x4, x2]
3323     checkbuf31
3324     put_bits        x12, x10
3325     put_bits        x3, x11
3326     cbnz            x9, 1b
3327     b               6f
3328 4:
         /* Dense path: for all 64 elements at once, v0-v7 become the bit
            counts (16 - clz, stored as t1) and v24-v31 are reduced to their
            low bit-count bits by a left then right shift (stored as t2). */
3329     movi            v21.8h, #0x0010
3330     clz             v0.8h, v0.8h
3331     clz             v1.8h, v1.8h
3332     clz             v2.8h, v2.8h
3333     clz             v3.8h, v3.8h
3334     clz             v4.8h, v4.8h
3335     clz             v5.8h, v5.8h
3336     clz             v6.8h, v6.8h
3337     clz             v7.8h, v7.8h
3338     ushl            v24.8h, v24.8h, v0.8h
3339     ushl            v25.8h, v25.8h, v1.8h
3340     ushl            v26.8h, v26.8h, v2.8h
3341     ushl            v27.8h, v27.8h, v3.8h
3342     ushl            v28.8h, v28.8h, v4.8h
3343     ushl            v29.8h, v29.8h, v5.8h
3344     ushl            v30.8h, v30.8h, v6.8h
3345     ushl            v31.8h, v31.8h, v7.8h
3346     neg             v0.8h, v0.8h
3347     neg             v1.8h, v1.8h
3348     neg             v2.8h, v2.8h
3349     neg             v3.8h, v3.8h
3350     neg             v4.8h, v4.8h
3351     neg             v5.8h, v5.8h
3352     neg             v6.8h, v6.8h
3353     neg             v7.8h, v7.8h
3354     ushl            v24.8h, v24.8h, v0.8h
3355     ushl            v25.8h, v25.8h, v1.8h
3356     ushl            v26.8h, v26.8h, v2.8h
3357     ushl            v27.8h, v27.8h, v3.8h
3358     ushl            v28.8h, v28.8h, v4.8h
3359     ushl            v29.8h, v29.8h, v5.8h
3360     ushl            v30.8h, v30.8h, v6.8h
3361     ushl            v31.8h, v31.8h, v7.8h
3362     add             v0.8h, v21.8h, v0.8h
3363     add             v1.8h, v21.8h, v1.8h
3364     add             v2.8h, v21.8h, v2.8h
3365     add             v3.8h, v21.8h, v3.8h
3366     add             v4.8h, v21.8h, v4.8h
3367     add             v5.8h, v21.8h, v5.8h
3368     add             v6.8h, v21.8h, v6.8h
3369     add             v7.8h, v21.8h, v7.8h
3370     st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
3371     st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
3372     st1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
3373     st1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
3374 1:
3375     clz             x2, x9                  /* x2 = run of zero elements */
3376     add             x15, x15, x2, lsl #1
3377     lsl             x9, x9, x2
3378     ldrh            w11, [x15, #-126]       /* precomputed bit count from t1 */
3379 2:
3380     cmp             x2, #0x10
3381     b.lt            3f
3382     sub             x2, x2, #0x10
3383     checkbuf47
3384     put_bits        x13, x14                /* run of 16: emit the 0xf0 code */
3385     b               2b
3386 3:
3387     ldrh            w3, [x15, #2]!          /* precomputed bits from t2 */
3388     add             x2, x11, x2, lsl #4     /* symbol = (run << 4) + bit count */
3389     lsl             x9, x9, #0x1
3390     ldr             w12, [x5, x2, lsl #2]
3391     ldrb            w10, [x4, x2]
3392     checkbuf31
3393     put_bits        x12, x10
3394     put_bits        x3, x11
3395     cbnz            x9, 1b
3396 6:
3397     add             x13, sp, #0x10e         /* address of the last t2 entry */
3398     cmp             x15, x13
3399     b.hs            1f                      /* last element was coded: nothing to add */
3400     ldr             w12, [x5]               /* otherwise emit actbl entry 0 */
3401     ldrb            w14, [x4]
3402     checkbuf47
3403     put_bits        x12, x14
3404 1:
3405     str             PUT_BUFFER, [x0, #0x10]
3406     str             PUT_BITSw, [x0, #0x18]
3407     ldp             x19, x20, [sp], 16
3408     add             x0, BUFFER, #0x1
3409     add             sp, sp, 256
3410     ret             /* return through x30; unlike br x30 this is hinted as a return */
3411
3412 .endm
3413
3414 generate_jsimd_huff_encode_one_block 1  /* emits jsimd_huff_encode_one_block_neon */
3415 generate_jsimd_huff_encode_one_block 0  /* emits jsimd_huff_encode_one_block_neon_slowtbl */
3416
     /* Release the register aliases and helper macros that were defined for
        the two encoder expansions above, so the names can be reused. */
3417     .unreq          BUFFER
3418     .unreq          PUT_BUFFER
3419     .unreq          PUT_BITS
3420     .unreq          PUT_BITSw
3421
3422 .purgem emit_byte
3423 .purgem put_bits
3424 .purgem checkbuf31
3425 .purgem checkbuf47