Imported Upstream version 2.0.1
[platform/upstream/libjpeg-turbo.git] / simd / arm / jsimd_neon.S
similarity index 91%
rename from simd/jsimd_arm_neon.S
rename to simd/arm/jsimd_neon.S
index cd26127..af929fe 100644 (file)
@@ -2,12 +2,12 @@
  * ARMv7 NEON optimizations for libjpeg-turbo
  *
  * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
- * All Rights Reserved.
- * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+ *                          All Rights Reserved.
+ * Author:  Siarhei Siamashka <siarhei.siamashka@nokia.com>
  * Copyright (C) 2014, Siarhei Siamashka.  All Rights Reserved.
  * Copyright (C) 2014, Linaro Limited.  All Rights Reserved.
  * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
- * Copyright (C) 2015-2016, Matthieu Darbois.  All Rights Reserved.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.  All Rights Reserved.
  *
  * This software is provided 'as-is', without any express or implied
  * warranty.  In no event will the authors be held liable for any damages
@@ -38,7 +38,7 @@
 .syntax unified
 
 
-#define RESPECT_STRICT_ALIGNMENT 1
+#define RESPECT_STRICT_ALIGNMENT  1
 
 
 /*****************************************************************************/
@@ -46,6 +46,7 @@
 /* Supplementary macro for setting function attributes */
 .macro asm_function fname
 #ifdef __APPLE__
+    .private_extern _\fname
     .globl _\fname
 _\fname:
 #else
@@ -67,7 +68,7 @@ _\fname:
 .endm
 
 
-#define CENTERJSAMPLE 128
+#define CENTERJSAMPLE  128
 
 /*****************************************************************************/
 
@@ -75,115 +76,114 @@ _\fname:
  * Perform dequantization and inverse DCT on one block of coefficients.
  *
  * GLOBAL(void)
- * jsimd_idct_islow_neon (void *dct_table, JCOEFPTR coef_block,
- *                        JSAMPARRAY output_buf, JDIMENSION output_col)
+ * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
+ *                       JSAMPARRAY output_buf, JDIMENSION output_col)
  */
 
-#define FIX_0_298631336 (2446)
-#define FIX_0_390180644 (3196)
-#define FIX_0_541196100 (4433)
-#define FIX_0_765366865 (6270)
-#define FIX_0_899976223 (7373)
-#define FIX_1_175875602 (9633)
-#define FIX_1_501321110 (12299)
-#define FIX_1_847759065 (15137)
-#define FIX_1_961570560 (16069)
-#define FIX_2_053119869 (16819)
-#define FIX_2_562915447 (20995)
-#define FIX_3_072711026 (25172)
-
-#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
-#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
-#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
-#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
-#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
-#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
-#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
-#define FIX_0_541196100_PLUS_0_765366865  (FIX_0_541196100 + FIX_0_765366865)
+#define FIX_0_298631336  (2446)   /* FIX(0.298631336) */
+#define FIX_0_390180644  (3196)   /* FIX(0.390180644) */
+#define FIX_0_541196100  (4433)   /* FIX(0.541196100) */
+#define FIX_0_765366865  (6270)   /* FIX(0.765366865) */
+#define FIX_0_899976223  (7373)   /* FIX(0.899976223) */
+#define FIX_1_175875602  (9633)   /* FIX(1.175875602) */
+#define FIX_1_501321110  (12299)  /* FIX(1.501321110) */
+#define FIX_1_847759065  (15137)  /* FIX(1.847759065) */
+#define FIX_1_961570560  (16069)  /* FIX(1.961570560) */
+#define FIX_2_053119869  (16819)  /* FIX(2.053119869) */
+#define FIX_2_562915447  (20995)  /* FIX(2.562915447) */
+#define FIX_3_072711026  (25172)  /* FIX(3.072711026) */
+
+#define FIX_1_175875602_MINUS_1_961570560  (FIX_1_175875602 - FIX_1_961570560)
+#define FIX_1_175875602_MINUS_0_390180644  (FIX_1_175875602 - FIX_0_390180644)
+#define FIX_0_541196100_MINUS_1_847759065  (FIX_0_541196100 - FIX_1_847759065)
+#define FIX_3_072711026_MINUS_2_562915447  (FIX_3_072711026 - FIX_2_562915447)
+#define FIX_0_298631336_MINUS_0_899976223  (FIX_0_298631336 - FIX_0_899976223)
+#define FIX_1_501321110_MINUS_0_899976223  (FIX_1_501321110 - FIX_0_899976223)
+#define FIX_2_053119869_MINUS_2_562915447  (FIX_2_053119869 - FIX_2_562915447)
+#define FIX_0_541196100_PLUS_0_765366865   (FIX_0_541196100 + FIX_0_765366865)
 
 /*
  * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
  * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
  */
-#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7)   \
-{                                                                             \
-    DCTELEM row0, row1, row2, row3, row4, row5, row6, row7;                   \
-    JLONG   q1, q2, q3, q4, q5, q6, q7;                                       \
-    JLONG   tmp11_plus_tmp2, tmp11_minus_tmp2;                                \
-                                                                              \
-    /* 1-D iDCT input data */                                                 \
-    row0 = xrow0;                                                             \
-    row1 = xrow1;                                                             \
-    row2 = xrow2;                                                             \
-    row3 = xrow3;                                                             \
-    row4 = xrow4;                                                             \
-    row5 = xrow5;                                                             \
-    row6 = xrow6;                                                             \
-    row7 = xrow7;                                                             \
-                                                                              \
-    q5 = row7 + row3;                                                         \
-    q4 = row5 + row1;                                                         \
-    q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) +                    \
-         MULTIPLY(q4, FIX_1_175875602);                                       \
-    q7 = MULTIPLY(q5, FIX_1_175875602) +                                      \
-         MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644);                     \
-    q2 = MULTIPLY(row2, FIX_0_541196100) +                                    \
-         MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065);                   \
-    q4 = q6;                                                                  \
-    q3 = ((JLONG) row0 - (JLONG) row4) << 13;                                 \
-    q6 += MULTIPLY(row5, -FIX_2_562915447) +                                  \
-          MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447);                  \
-    /* now we can use q1 (reloadable constants have been used up) */          \
-    q1 = q3 + q2;                                                             \
-    q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) +                 \
-          MULTIPLY(row1, -FIX_0_899976223);                                   \
-    q5 = q7;                                                                  \
-    q1 = q1 + q6;                                                             \
-    q7 += MULTIPLY(row7, -FIX_0_899976223) +                                  \
-          MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223);                  \
-                                                                              \
-    /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */        \
-    tmp11_plus_tmp2 = q1;                                                     \
-    row1 = 0;                                                                 \
-                                                                              \
-    q1 = q1 - q6;                                                             \
-    q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) +                 \
-          MULTIPLY(row3, -FIX_2_562915447);                                   \
-    q1 = q1 - q6;                                                             \
-    q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) +                   \
-         MULTIPLY(row6, FIX_0_541196100);                                     \
-    q3 = q3 - q2;                                                             \
-                                                                              \
-    /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */        \
-    tmp11_minus_tmp2 = q1;                                                    \
-                                                                              \
-    q1 = ((JLONG) row0 + (JLONG) row4) << 13;                                 \
-    q2 = q1 + q6;                                                             \
-    q1 = q1 - q6;                                                             \
-                                                                              \
-    /* pick up the results */                                                 \
-    tmp0  = q4;                                                               \
-    tmp1  = q5;                                                               \
-    tmp2  = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2;                         \
-    tmp3  = q7;                                                               \
-    tmp10 = q2;                                                               \
-    tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2;                         \
-    tmp12 = q3;                                                               \
-    tmp13 = q1;                                                               \
+#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) { \
+    DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
+    JLONG   q1, q2, q3, q4, q5, q6, q7; \
+    JLONG   tmp11_plus_tmp2, tmp11_minus_tmp2; \
+    \
+    /* 1-D iDCT input data (each macro argument is expanded exactly once) */ \
+    row0 = xrow0; \
+    row1 = xrow1; \
+    row2 = xrow2; \
+    row3 = xrow3; \
+    row4 = xrow4; \
+    row5 = xrow5; \
+    row6 = xrow6; \
+    row7 = xrow7; \
+    \
+    q5 = row7 + row3; \
+    q4 = row5 + row1; \
+    q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
+         MULTIPLY(q4, FIX_1_175875602); \
+    q7 = MULTIPLY(q5, FIX_1_175875602) + \
+         MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
+    q2 = MULTIPLY(row2, FIX_0_541196100) + \
+         MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
+    q4 = q6; \
+    q3 = ((JLONG)row0 - (JLONG)row4) << 13; \
+    q6 += MULTIPLY(row5, -FIX_2_562915447) + \
+          MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
+    /* now we can use q1 (reloadable constants have been used up) */ \
+    q1 = q3 + q2; \
+    q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
+          MULTIPLY(row1, -FIX_0_899976223); \
+    q5 = q7; \
+    q1 = q1 + q6; \
+    q7 += MULTIPLY(row7, -FIX_0_899976223) + \
+          MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
+    \
+    /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
+    tmp11_plus_tmp2 = q1; \
+    row1 = 0;  /* NOTE(review): row1 is not read again in this macro */ \
+    \
+    q1 = q1 - q6;  /* q1 is back to q3 + q2 */ \
+    q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
+          MULTIPLY(row3, -FIX_2_562915447); \
+    q1 = q1 - q6;  /* q1 = q3 + q2 - q6 */ \
+    q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
+         MULTIPLY(row6, FIX_0_541196100); \
+    q3 = q3 - q2; \
+    \
+    /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
+    tmp11_minus_tmp2 = q1; \
+    \
+    q1 = ((JLONG)row0 + (JLONG)row4) << 13; \
+    q2 = q1 + q6; \
+    q1 = q1 - q6; \
+    \
+    /* pick up the results; tmp0..tmp3/tmp10..tmp13 come from the enclosing scope */ \
+    tmp0  = q4; \
+    tmp1  = q5; \
+    tmp2  = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
+    tmp3  = q7; \
+    tmp10 = q2; \
+    tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
+    tmp12 = q3; \
+    tmp13 = q1; \
 }
 
-#define XFIX_0_899976223                   d0[0]
-#define XFIX_0_541196100                   d0[1]
-#define XFIX_2_562915447                   d0[2]
-#define XFIX_0_298631336_MINUS_0_899976223 d0[3]
-#define XFIX_1_501321110_MINUS_0_899976223 d1[0]
-#define XFIX_2_053119869_MINUS_2_562915447 d1[1]
-#define XFIX_0_541196100_PLUS_0_765366865  d1[2]
-#define XFIX_1_175875602                   d1[3]
-#define XFIX_1_175875602_MINUS_0_390180644 d2[0]
-#define XFIX_0_541196100_MINUS_1_847759065 d2[1]
-#define XFIX_3_072711026_MINUS_2_562915447 d2[2]
-#define XFIX_1_175875602_MINUS_1_961570560 d2[3]
+#define XFIX_0_899976223                    d0[0]
+#define XFIX_0_541196100                    d0[1]
+#define XFIX_2_562915447                    d0[2]
+#define XFIX_0_298631336_MINUS_0_899976223  d0[3]
+#define XFIX_1_501321110_MINUS_0_899976223  d1[0]
+#define XFIX_2_053119869_MINUS_2_562915447  d1[1]
+#define XFIX_0_541196100_PLUS_0_765366865   d1[2]
+#define XFIX_1_175875602                    d1[3]
+#define XFIX_1_175875602_MINUS_0_390180644  d2[0]
+#define XFIX_0_541196100_MINUS_1_847759065  d2[1]
+#define XFIX_3_072711026_MINUS_2_562915447  d2[2]
+#define XFIX_1_175875602_MINUS_1_961570560  d2[3]
 
 .balign 16
 jsimd_idct_islow_neon_consts:
@@ -695,10 +695,10 @@ asm_function jsimd_idct_islow_neon
  * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions.
  */
 
-#define XFIX_1_082392200 d0[0]
-#define XFIX_1_414213562 d0[1]
-#define XFIX_1_847759065 d0[2]
-#define XFIX_2_613125930 d0[3]
+#define XFIX_1_082392200  d0[0]
+#define XFIX_1_414213562  d0[1]
+#define XFIX_1_847759065  d0[2]
+#define XFIX_2_613125930  d0[3]
 
 .balign 16
 jsimd_idct_ifast_neon_consts:
@@ -923,35 +923,35 @@ asm_function jsimd_idct_ifast_neon
 
 #define CONST_BITS  13
 
-#define FIX_0_211164243 (1730)   /* FIX(0.211164243) */
-#define FIX_0_509795579 (4176)   /* FIX(0.509795579) */
-#define FIX_0_601344887 (4926)   /* FIX(0.601344887) */
-#define FIX_0_720959822 (5906)   /* FIX(0.720959822) */
-#define FIX_0_765366865 (6270)   /* FIX(0.765366865) */
-#define FIX_0_850430095 (6967)   /* FIX(0.850430095) */
-#define FIX_0_899976223 (7373)   /* FIX(0.899976223) */
-#define FIX_1_061594337 (8697)   /* FIX(1.061594337) */
-#define FIX_1_272758580 (10426)  /* FIX(1.272758580) */
-#define FIX_1_451774981 (11893)  /* FIX(1.451774981) */
-#define FIX_1_847759065 (15137)  /* FIX(1.847759065) */
-#define FIX_2_172734803 (17799)  /* FIX(2.172734803) */
-#define FIX_2_562915447 (20995)  /* FIX(2.562915447) */
-#define FIX_3_624509785 (29692)  /* FIX(3.624509785) */
+#define FIX_0_211164243  (1730)   /* FIX(0.211164243) */
+#define FIX_0_509795579  (4176)   /* FIX(0.509795579) */
+#define FIX_0_601344887  (4926)   /* FIX(0.601344887) */
+#define FIX_0_720959822  (5906)   /* FIX(0.720959822) */
+#define FIX_0_765366865  (6270)   /* FIX(0.765366865) */
+#define FIX_0_850430095  (6967)   /* FIX(0.850430095) */
+#define FIX_0_899976223  (7373)   /* FIX(0.899976223) */
+#define FIX_1_061594337  (8697)   /* FIX(1.061594337) */
+#define FIX_1_272758580  (10426)  /* FIX(1.272758580) */
+#define FIX_1_451774981  (11893)  /* FIX(1.451774981) */
+#define FIX_1_847759065  (15137)  /* FIX(1.847759065) */
+#define FIX_2_172734803  (17799)  /* FIX(2.172734803) */
+#define FIX_2_562915447  (20995)  /* FIX(2.562915447) */
+#define FIX_3_624509785  (29692)  /* FIX(3.624509785) */
 
 .balign 16
 jsimd_idct_4x4_neon_consts:
-  .short FIX_1_847759065      /* d0[0] */
-  .short -FIX_0_765366865     /* d0[1] */
-  .short -FIX_0_211164243     /* d0[2] */
-  .short FIX_1_451774981      /* d0[3] */
-  .short -FIX_2_172734803     /* d1[0] */
-  .short FIX_1_061594337      /* d1[1] */
-  .short -FIX_0_509795579     /* d1[2] */
-  .short -FIX_0_601344887     /* d1[3] */
-  .short FIX_0_899976223      /* d2[0] */
-  .short FIX_2_562915447      /* d2[1] */
-  .short 1 << (CONST_BITS+1)  /* d2[2] */
-  .short 0                    /* d2[3] */
+  .short FIX_1_847759065        /* d0[0] */
+  .short -FIX_0_765366865       /* d0[1] */
+  .short -FIX_0_211164243       /* d0[2] */
+  .short FIX_1_451774981        /* d0[3] */
+  .short -FIX_2_172734803       /* d1[0] */
+  .short FIX_1_061594337        /* d1[1] */
+  .short -FIX_0_509795579       /* d1[2] */
+  .short -FIX_0_601344887       /* d1[3] */
+  .short FIX_0_899976223        /* d2[0] */
+  .short FIX_2_562915447        /* d2[1] */
+  .short 1 << (CONST_BITS + 1)  /* d2[2] */
+  .short 0                      /* d2[3] */
 
 .macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
     vmull.s16       q14, \x4, d2[2]
@@ -1994,10 +1994,10 @@ asm_function jsimd_convsamp_neon
  *       rid of a bunch of VLD1.16 instructions
  */
 
-#define XFIX_0_382683433 d0[0]
-#define XFIX_0_541196100 d0[1]
-#define XFIX_0_707106781 d0[2]
-#define XFIX_1_306562965 d0[3]
+#define XFIX_0_382683433  d0[0]
+#define XFIX_0_541196100  d0[1]
+#define XFIX_0_707106781  d0[2]
+#define XFIX_1_306562965  d0[3]
 
 .balign 16
 jsimd_fdct_ifast_neon_consts:
@@ -2107,8 +2107,8 @@ asm_function jsimd_fdct_ifast_neon
 
 /*
  * GLOBAL(void)
- * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM *divisors,
- *                      DCTELEM *workspace);
+ * jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
+ *                     DCTELEM *workspace);
  *
  * Note: the code uses 2 stage pipelining in order to improve instructions
  *       scheduling and eliminate stalls (this provides ~15% better
@@ -2208,10 +2208,10 @@ asm_function jsimd_quantize_neon
 
 /*
  * GLOBAL(void)
- * jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor,
- *                                 JDIMENSION downsampled_width,
- *                                 JSAMPARRAY input_data,
- *                                 JSAMPARRAY *output_data_ptr);
+ * jsimd_h2v1_fancy_upsample_neon(int max_v_samp_factor,
+ *                                JDIMENSION downsampled_width,
+ *                                JSAMPARRAY input_data,
+ *                                JSAMPARRAY *output_data_ptr);
  *
  * Note: the use of unaligned writes is the main remaining bottleneck in
  *       this code, which can be potentially solved to get up to tens
@@ -2444,10 +2444,10 @@ asm_function jsimd_h2v1_fancy_upsample_neon
 /*****************************************************************************/
 
 /*
- * GLOBAL(JOCTET*)
- * jsimd_huff_encode_one_block (working_state *state, JOCTET *buffer,
- *                              JCOEFPTR block, int last_dc_val,
- *                              c_derived_tbl *dctbl, c_derived_tbl *actbl)
+ * GLOBAL(JOCTET *)
+ * jsimd_huff_encode_one_block_neon(working_state *state, JOCTET *buffer,
+ *                                  JCOEFPTR block, int last_dc_val,
+ *                                  c_derived_tbl *dctbl, c_derived_tbl *actbl)
  *
  */
 
@@ -2731,7 +2731,7 @@ asm_function jsimd_huff_encode_one_block_neon
     ldr             r11, [r0, #0x8]       /* r11 = put_buffer */
     ldr             r4, [r0, #0xc]        /* r4  = put_bits */
     ldrh            r2, [r6, #-128]       /* r2  = nbits */
-    ldrh            r3, [r6]              /* r3  = temp2 & (((JLONG) 1)<<nbits) - 1; */
+    ldrh            r3, [r6]              /* r3  = temp2 & (((JLONG)1)<<nbits) - 1; */
     ldr             r0, [lr, r2, lsl #2]
     ldrb            r5, [r1, r2]
     put_bits        r11, r4, r0, r5