simd/arm/jcphuff-neon.c

   1 /*
   2  * jcphuff-neon.c - prepare data for progressive Huffman encoding (Arm Neon)
   3  *
   4  * Copyright (C) 2020-2021, Arm Limited.  All Rights Reserved.
   5  * Copyright (C) 2022, Matthieu Darbois.  All Rights Reserved.
   6  * Copyright (C) 2022, D. R. Commander.  All Rights Reserved.
   7  *
   8  * This software is provided 'as-is', without any express or implied
   9  * warranty.  In no event will the authors be held liable for any damages
  10  * arising from the use of this software.
  11  *
  12  * Permission is granted to anyone to use this software for any purpose,
  13  * including commercial applications, and to alter it and redistribute it
  14  * freely, subject to the following restrictions:
  15  *
  16  * 1. The origin of this software must not be misrepresented; you must not
  17  *    claim that you wrote the original software. If you use this software
  18  *    in a product, an acknowledgment in the product documentation would be
  19  *    appreciated but is not required.
  20  * 2. Altered source versions must be plainly marked as such, and must not be
  21  *    misrepresented as being the original software.
  22  * 3. This notice may not be removed or altered from any source distribution.
  23  */
  24
  25 #define JPEG_INTERNALS
  26 #include "../../jinclude.h"
  27 #include "../../jpeglib.h"
  28 #include "../../jsimd.h"
  29 #include "../../jdct.h"
  30 #include "../../jsimddct.h"
  31 #include "../jsimd.h"
  32 #include "neon-compat.h"
  33
  34 #include <arm_neon.h>
  35
  36
  37 /* Data preparation for encode_mcu_AC_first().
  38  *
  39  * The equivalent scalar C function (encode_mcu_AC_first_prepare()) can be
  40  * found in jcphuff.c.
  41  */
  42
  43 void jsimd_encode_mcu_AC_first_prepare_neon
  44   (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
  45    UJCOEF *values, size_t *zerobits)
  46 {
  47   UJCOEF *values_ptr = values;
  48   UJCOEF *diff_values_ptr = values + DCTSIZE2;
  49
  50   /* Rows of coefficients to zero (since they haven't been processed) */
  51   int i, rows_to_zero = 8;
  52
  53   for (i = 0; i < Sl / 16; i++) {
  54     int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
  55     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
  56     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
  57     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
  58     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
  59     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
  60     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
  61     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
  62     int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
  63     coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
  64     coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
  65     coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
  66     coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
  67     coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
  68     coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
  69     coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
  70
  71     /* Isolate sign of coefficients. */
  72     uint16x8_t sign_coefs1 = vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15));
  73     uint16x8_t sign_coefs2 = vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15));
  74     /* Compute absolute value of coefficients and apply point transform Al. */
  75     uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
  76     uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
  77     abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
  78     abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
  79
  80     /* Compute diff values. */
  81     uint16x8_t diff1 = veorq_u16(abs_coefs1, sign_coefs1);
  82     uint16x8_t diff2 = veorq_u16(abs_coefs2, sign_coefs2);
  83
  84     /* Store transformed coefficients and diff values. */
  85     vst1q_u16(values_ptr, abs_coefs1);
  86     vst1q_u16(values_ptr + DCTSIZE, abs_coefs2);
  87     vst1q_u16(diff_values_ptr, diff1);
  88     vst1q_u16(diff_values_ptr + DCTSIZE, diff2);
  89     values_ptr += 16;
  90     diff_values_ptr += 16;
  91     jpeg_natural_order_start += 16;
  92     rows_to_zero -= 2;
  93   }
  94
  95   /* Same operation but for remaining partial vector */
  96   int remaining_coefs = Sl % 16;
  97   if (remaining_coefs > 8) {
  98     int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
  99     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
 100     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
 101     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
 102     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
 103     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
 104     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
 105     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
 106     int16x8_t coefs2 = vdupq_n_s16(0);
 107     switch (remaining_coefs) {
 108     case 15:
 109       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
 110       FALLTHROUGH               /*FALLTHROUGH*/
 111     case 14:
 112       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
 113       FALLTHROUGH               /*FALLTHROUGH*/
 114     case 13:
 115       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
 116       FALLTHROUGH               /*FALLTHROUGH*/
 117     case 12:
 118       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
 119       FALLTHROUGH               /*FALLTHROUGH*/
 120     case 11:
 121       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
 122       FALLTHROUGH               /*FALLTHROUGH*/
 123     case 10:
 124       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
 125       FALLTHROUGH               /*FALLTHROUGH*/
 126     case 9:
 127       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
 128       FALLTHROUGH               /*FALLTHROUGH*/
 129     default:
 130       break;
 131     }
 132
 133     /* Isolate sign of coefficients. */
 134     uint16x8_t sign_coefs1 = vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15));
 135     uint16x8_t sign_coefs2 = vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15));
 136     /* Compute absolute value of coefficients and apply point transform Al. */
 137     uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
 138     uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
 139     abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
 140     abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
 141
 142     /* Compute diff values. */
 143     uint16x8_t diff1 = veorq_u16(abs_coefs1, sign_coefs1);
 144     uint16x8_t diff2 = veorq_u16(abs_coefs2, sign_coefs2);
 145
 146     /* Store transformed coefficients and diff values. */
 147     vst1q_u16(values_ptr, abs_coefs1);
 148     vst1q_u16(values_ptr + DCTSIZE, abs_coefs2);
 149     vst1q_u16(diff_values_ptr, diff1);
 150     vst1q_u16(diff_values_ptr + DCTSIZE, diff2);
 151     values_ptr += 16;
 152     diff_values_ptr += 16;
 153     rows_to_zero -= 2;
 154
 155   } else if (remaining_coefs > 0) {
 156     int16x8_t coefs = vdupq_n_s16(0);
 157
 158     switch (remaining_coefs) {
 159     case 8:
 160       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
 161       FALLTHROUGH               /*FALLTHROUGH*/
 162     case 7:
 163       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
 164       FALLTHROUGH               /*FALLTHROUGH*/
 165     case 6:
 166       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
 167       FALLTHROUGH               /*FALLTHROUGH*/
 168     case 5:
 169       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
 170       FALLTHROUGH               /*FALLTHROUGH*/
 171     case 4:
 172       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
 173       FALLTHROUGH               /*FALLTHROUGH*/
 174     case 3:
 175       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
 176       FALLTHROUGH               /*FALLTHROUGH*/
 177     case 2:
 178       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
 179       FALLTHROUGH               /*FALLTHROUGH*/
 180     case 1:
 181       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
 182       FALLTHROUGH               /*FALLTHROUGH*/
 183     default:
 184       break;
 185     }
 186
 187     /* Isolate sign of coefficients. */
 188     uint16x8_t sign_coefs = vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15));
 189     /* Compute absolute value of coefficients and apply point transform Al. */
 190     uint16x8_t abs_coefs = vreinterpretq_u16_s16(vabsq_s16(coefs));
 191     abs_coefs = vshlq_u16(abs_coefs, vdupq_n_s16(-Al));
 192
 193     /* Compute diff values. */
 194     uint16x8_t diff = veorq_u16(abs_coefs, sign_coefs);
 195
 196     /* Store transformed coefficients and diff values. */
 197     vst1q_u16(values_ptr, abs_coefs);
 198     vst1q_u16(diff_values_ptr, diff);
 199     values_ptr += 8;
 200     diff_values_ptr += 8;
 201     rows_to_zero--;
 202   }
 203
 204   /* Zero remaining memory in the values and diff_values blocks. */
 205   for (i = 0; i < rows_to_zero; i++) {
 206     vst1q_u16(values_ptr, vdupq_n_u16(0));
 207     vst1q_u16(diff_values_ptr, vdupq_n_u16(0));
 208     values_ptr += 8;
 209     diff_values_ptr += 8;
 210   }
 211
 212   /* Construct zerobits bitmap.  A set bit means that the corresponding
 213    * coefficient != 0.
 214    */
 215   uint16x8_t row0 = vld1q_u16(values + 0 * DCTSIZE);
 216   uint16x8_t row1 = vld1q_u16(values + 1 * DCTSIZE);
 217   uint16x8_t row2 = vld1q_u16(values + 2 * DCTSIZE);
 218   uint16x8_t row3 = vld1q_u16(values + 3 * DCTSIZE);
 219   uint16x8_t row4 = vld1q_u16(values + 4 * DCTSIZE);
 220   uint16x8_t row5 = vld1q_u16(values + 5 * DCTSIZE);
 221   uint16x8_t row6 = vld1q_u16(values + 6 * DCTSIZE);
 222   uint16x8_t row7 = vld1q_u16(values + 7 * DCTSIZE);
 223
 224   uint8x8_t row0_eq0 = vmovn_u16(vceqq_u16(row0, vdupq_n_u16(0)));
 225   uint8x8_t row1_eq0 = vmovn_u16(vceqq_u16(row1, vdupq_n_u16(0)));
 226   uint8x8_t row2_eq0 = vmovn_u16(vceqq_u16(row2, vdupq_n_u16(0)));
 227   uint8x8_t row3_eq0 = vmovn_u16(vceqq_u16(row3, vdupq_n_u16(0)));
 228   uint8x8_t row4_eq0 = vmovn_u16(vceqq_u16(row4, vdupq_n_u16(0)));
 229   uint8x8_t row5_eq0 = vmovn_u16(vceqq_u16(row5, vdupq_n_u16(0)));
 230   uint8x8_t row6_eq0 = vmovn_u16(vceqq_u16(row6, vdupq_n_u16(0)));
 231   uint8x8_t row7_eq0 = vmovn_u16(vceqq_u16(row7, vdupq_n_u16(0)));
 232
 233   /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
 234   const uint8x8_t bitmap_mask =
 235     vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));
 236
 237   row0_eq0 = vand_u8(row0_eq0, bitmap_mask);
 238   row1_eq0 = vand_u8(row1_eq0, bitmap_mask);
 239   row2_eq0 = vand_u8(row2_eq0, bitmap_mask);
 240   row3_eq0 = vand_u8(row3_eq0, bitmap_mask);
 241   row4_eq0 = vand_u8(row4_eq0, bitmap_mask);
 242   row5_eq0 = vand_u8(row5_eq0, bitmap_mask);
 243   row6_eq0 = vand_u8(row6_eq0, bitmap_mask);
 244   row7_eq0 = vand_u8(row7_eq0, bitmap_mask);
 245
 246   uint8x8_t bitmap_rows_01 = vpadd_u8(row0_eq0, row1_eq0);
 247   uint8x8_t bitmap_rows_23 = vpadd_u8(row2_eq0, row3_eq0);
 248   uint8x8_t bitmap_rows_45 = vpadd_u8(row4_eq0, row5_eq0);
 249   uint8x8_t bitmap_rows_67 = vpadd_u8(row6_eq0, row7_eq0);
 250   uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
 251   uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
 252   uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
 253
 254 #if defined(__aarch64__) || defined(_M_ARM64)
 255   /* Move bitmap to a 64-bit scalar register. */
 256   uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
 257   /* Store zerobits bitmap. */
 258   *zerobits = ~bitmap;
 259 #else
 260   /* Move bitmap to two 32-bit scalar registers. */
 261   uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
 262   uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
 263   /* Store zerobits bitmap. */
 264   zerobits[0] = ~bitmap0;
 265   zerobits[1] = ~bitmap1;
 266 #endif
 267 }
 268
 269
 270 /* Data preparation for encode_mcu_AC_refine().
 271  *
 272  * The equivalent scalar C function (encode_mcu_AC_refine_prepare()) can be
 273  * found in jcphuff.c.
 274  */
 275
 276 int jsimd_encode_mcu_AC_refine_prepare_neon
 277   (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
 278    UJCOEF *absvalues, size_t *bits)
 279 {
 280   /* Temporary storage buffers for data used to compute the signbits bitmap and
 281    * the end-of-block (EOB) position
 282    */
 283   uint8_t coef_sign_bits[64];
 284   uint8_t coef_eq1_bits[64];
 285
 286   UJCOEF *absvalues_ptr = absvalues;
 287   uint8_t *coef_sign_bits_ptr = coef_sign_bits;
 288   uint8_t *eq1_bits_ptr = coef_eq1_bits;
 289
 290   /* Rows of coefficients to zero (since they haven't been processed) */
 291   int i, rows_to_zero = 8;
 292
 293   for (i = 0; i < Sl / 16; i++) {
 294     int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
 295     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
 296     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
 297     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
 298     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
 299     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
 300     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
 301     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
 302     int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
 303     coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
 304     coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
 305     coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
 306     coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
 307     coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
 308     coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
 309     coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
 310
 311     /* Compute and store data for signbits bitmap. */
 312     uint8x8_t sign_coefs1 =
 313       vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
 314     uint8x8_t sign_coefs2 =
 315       vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
 316     vst1_u8(coef_sign_bits_ptr, sign_coefs1);
 317     vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
 318
 319     /* Compute absolute value of coefficients and apply point transform Al. */
 320     uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
 321     uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
 322     abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
 323     abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
 324     vst1q_u16(absvalues_ptr, abs_coefs1);
 325     vst1q_u16(absvalues_ptr + DCTSIZE, abs_coefs2);
 326
 327     /* Test whether transformed coefficient values == 1 (used to find EOB
 328      * position.)
 329      */
 330     uint8x8_t coefs_eq11 = vmovn_u16(vceqq_u16(abs_coefs1, vdupq_n_u16(1)));
 331     uint8x8_t coefs_eq12 = vmovn_u16(vceqq_u16(abs_coefs2, vdupq_n_u16(1)));
 332     vst1_u8(eq1_bits_ptr, coefs_eq11);
 333     vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
 334
 335     absvalues_ptr += 16;
 336     coef_sign_bits_ptr += 16;
 337     eq1_bits_ptr += 16;
 338     jpeg_natural_order_start += 16;
 339     rows_to_zero -= 2;
 340   }
 341
 342   /* Same operation but for remaining partial vector */
 343   int remaining_coefs = Sl % 16;
 344   if (remaining_coefs > 8) {
 345     int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
 346     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
 347     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
 348     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
 349     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
 350     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
 351     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
 352     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
 353     int16x8_t coefs2 = vdupq_n_s16(0);
 354     switch (remaining_coefs) {
 355     case 15:
 356       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
 357       FALLTHROUGH               /*FALLTHROUGH*/
 358     case 14:
 359       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
 360       FALLTHROUGH               /*FALLTHROUGH*/
 361     case 13:
 362       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
 363       FALLTHROUGH               /*FALLTHROUGH*/
 364     case 12:
 365       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
 366       FALLTHROUGH               /*FALLTHROUGH*/
 367     case 11:
 368       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
 369       FALLTHROUGH               /*FALLTHROUGH*/
 370     case 10:
 371       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
 372       FALLTHROUGH               /*FALLTHROUGH*/
 373     case 9:
 374       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
 375       FALLTHROUGH               /*FALLTHROUGH*/
 376     default:
 377       break;
 378     }
 379
 380     /* Compute and store data for signbits bitmap. */
 381     uint8x8_t sign_coefs1 =
 382       vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
 383     uint8x8_t sign_coefs2 =
 384       vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
 385     vst1_u8(coef_sign_bits_ptr, sign_coefs1);
 386     vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
 387
 388     /* Compute absolute value of coefficients and apply point transform Al. */
 389     uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
 390     uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
 391     abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
 392     abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
 393     vst1q_u16(absvalues_ptr, abs_coefs1);
 394     vst1q_u16(absvalues_ptr + DCTSIZE, abs_coefs2);
 395
 396     /* Test whether transformed coefficient values == 1 (used to find EOB
 397      * position.)
 398      */
 399     uint8x8_t coefs_eq11 = vmovn_u16(vceqq_u16(abs_coefs1, vdupq_n_u16(1)));
 400     uint8x8_t coefs_eq12 = vmovn_u16(vceqq_u16(abs_coefs2, vdupq_n_u16(1)));
 401     vst1_u8(eq1_bits_ptr, coefs_eq11);
 402     vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
 403
 404     absvalues_ptr += 16;
 405     coef_sign_bits_ptr += 16;
 406     eq1_bits_ptr += 16;
 407     jpeg_natural_order_start += 16;
 408     rows_to_zero -= 2;
 409
 410   } else if (remaining_coefs > 0) {
 411     int16x8_t coefs = vdupq_n_s16(0);
 412
 413     switch (remaining_coefs) {
 414     case 8:
 415       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
 416       FALLTHROUGH               /*FALLTHROUGH*/
 417     case 7:
 418       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
 419       FALLTHROUGH               /*FALLTHROUGH*/
 420     case 6:
 421       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
 422       FALLTHROUGH               /*FALLTHROUGH*/
 423     case 5:
 424       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
 425       FALLTHROUGH               /*FALLTHROUGH*/
 426     case 4:
 427       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
 428       FALLTHROUGH               /*FALLTHROUGH*/
 429     case 3:
 430       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
 431       FALLTHROUGH               /*FALLTHROUGH*/
 432     case 2:
 433       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
 434       FALLTHROUGH               /*FALLTHROUGH*/
 435     case 1:
 436       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
 437       FALLTHROUGH               /*FALLTHROUGH*/
 438     default:
 439       break;
 440     }
 441
 442     /* Compute and store data for signbits bitmap. */
 443     uint8x8_t sign_coefs =
 444       vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15)));
 445     vst1_u8(coef_sign_bits_ptr, sign_coefs);
 446
 447     /* Compute absolute value of coefficients and apply point transform Al. */
 448     uint16x8_t abs_coefs = vreinterpretq_u16_s16(vabsq_s16(coefs));
 449     abs_coefs = vshlq_u16(abs_coefs, vdupq_n_s16(-Al));
 450     vst1q_u16(absvalues_ptr, abs_coefs);
 451
 452     /* Test whether transformed coefficient values == 1 (used to find EOB
 453      * position.)
 454      */
 455     uint8x8_t coefs_eq1 = vmovn_u16(vceqq_u16(abs_coefs, vdupq_n_u16(1)));
 456     vst1_u8(eq1_bits_ptr, coefs_eq1);
 457
 458     absvalues_ptr += 8;
 459     coef_sign_bits_ptr += 8;
 460     eq1_bits_ptr += 8;
 461     rows_to_zero--;
 462   }
 463
 464   /* Zero remaining memory in blocks. */
 465   for (i = 0; i < rows_to_zero; i++) {
 466     vst1q_u16(absvalues_ptr, vdupq_n_u16(0));
 467     vst1_u8(coef_sign_bits_ptr, vdup_n_u8(0));
 468     vst1_u8(eq1_bits_ptr, vdup_n_u8(0));
 469     absvalues_ptr += 8;
 470     coef_sign_bits_ptr += 8;
 471     eq1_bits_ptr += 8;
 472   }
 473
 474   /* Construct zerobits bitmap. */
 475   uint16x8_t abs_row0 = vld1q_u16(absvalues + 0 * DCTSIZE);
 476   uint16x8_t abs_row1 = vld1q_u16(absvalues + 1 * DCTSIZE);
 477   uint16x8_t abs_row2 = vld1q_u16(absvalues + 2 * DCTSIZE);
 478   uint16x8_t abs_row3 = vld1q_u16(absvalues + 3 * DCTSIZE);
 479   uint16x8_t abs_row4 = vld1q_u16(absvalues + 4 * DCTSIZE);
 480   uint16x8_t abs_row5 = vld1q_u16(absvalues + 5 * DCTSIZE);
 481   uint16x8_t abs_row6 = vld1q_u16(absvalues + 6 * DCTSIZE);
 482   uint16x8_t abs_row7 = vld1q_u16(absvalues + 7 * DCTSIZE);
 483
 484   uint8x8_t abs_row0_eq0 = vmovn_u16(vceqq_u16(abs_row0, vdupq_n_u16(0)));
 485   uint8x8_t abs_row1_eq0 = vmovn_u16(vceqq_u16(abs_row1, vdupq_n_u16(0)));
 486   uint8x8_t abs_row2_eq0 = vmovn_u16(vceqq_u16(abs_row2, vdupq_n_u16(0)));
 487   uint8x8_t abs_row3_eq0 = vmovn_u16(vceqq_u16(abs_row3, vdupq_n_u16(0)));
 488   uint8x8_t abs_row4_eq0 = vmovn_u16(vceqq_u16(abs_row4, vdupq_n_u16(0)));
 489   uint8x8_t abs_row5_eq0 = vmovn_u16(vceqq_u16(abs_row5, vdupq_n_u16(0)));
 490   uint8x8_t abs_row6_eq0 = vmovn_u16(vceqq_u16(abs_row6, vdupq_n_u16(0)));
 491   uint8x8_t abs_row7_eq0 = vmovn_u16(vceqq_u16(abs_row7, vdupq_n_u16(0)));
 492
 493   /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
 494   const uint8x8_t bitmap_mask =
 495     vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));
 496
 497   abs_row0_eq0 = vand_u8(abs_row0_eq0, bitmap_mask);
 498   abs_row1_eq0 = vand_u8(abs_row1_eq0, bitmap_mask);
 499   abs_row2_eq0 = vand_u8(abs_row2_eq0, bitmap_mask);
 500   abs_row3_eq0 = vand_u8(abs_row3_eq0, bitmap_mask);
 501   abs_row4_eq0 = vand_u8(abs_row4_eq0, bitmap_mask);
 502   abs_row5_eq0 = vand_u8(abs_row5_eq0, bitmap_mask);
 503   abs_row6_eq0 = vand_u8(abs_row6_eq0, bitmap_mask);
 504   abs_row7_eq0 = vand_u8(abs_row7_eq0, bitmap_mask);
 505
 506   uint8x8_t bitmap_rows_01 = vpadd_u8(abs_row0_eq0, abs_row1_eq0);
 507   uint8x8_t bitmap_rows_23 = vpadd_u8(abs_row2_eq0, abs_row3_eq0);
 508   uint8x8_t bitmap_rows_45 = vpadd_u8(abs_row4_eq0, abs_row5_eq0);
 509   uint8x8_t bitmap_rows_67 = vpadd_u8(abs_row6_eq0, abs_row7_eq0);
 510   uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
 511   uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
 512   uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
 513
 514 #if defined(__aarch64__) || defined(_M_ARM64)
 515   /* Move bitmap to a 64-bit scalar register. */
 516   uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
 517   /* Store zerobits bitmap. */
 518   bits[0] = ~bitmap;
 519 #else
 520   /* Move bitmap to two 32-bit scalar registers. */
 521   uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
 522   uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
 523   /* Store zerobits bitmap. */
 524   bits[0] = ~bitmap0;
 525   bits[1] = ~bitmap1;
 526 #endif
 527
 528   /* Construct signbits bitmap. */
 529   uint8x8_t signbits_row0 = vld1_u8(coef_sign_bits + 0 * DCTSIZE);
 530   uint8x8_t signbits_row1 = vld1_u8(coef_sign_bits + 1 * DCTSIZE);
 531   uint8x8_t signbits_row2 = vld1_u8(coef_sign_bits + 2 * DCTSIZE);
 532   uint8x8_t signbits_row3 = vld1_u8(coef_sign_bits + 3 * DCTSIZE);
 533   uint8x8_t signbits_row4 = vld1_u8(coef_sign_bits + 4 * DCTSIZE);
 534   uint8x8_t signbits_row5 = vld1_u8(coef_sign_bits + 5 * DCTSIZE);
 535   uint8x8_t signbits_row6 = vld1_u8(coef_sign_bits + 6 * DCTSIZE);
 536   uint8x8_t signbits_row7 = vld1_u8(coef_sign_bits + 7 * DCTSIZE);
 537
 538   signbits_row0 = vand_u8(signbits_row0, bitmap_mask);
 539   signbits_row1 = vand_u8(signbits_row1, bitmap_mask);
 540   signbits_row2 = vand_u8(signbits_row2, bitmap_mask);
 541   signbits_row3 = vand_u8(signbits_row3, bitmap_mask);
 542   signbits_row4 = vand_u8(signbits_row4, bitmap_mask);
 543   signbits_row5 = vand_u8(signbits_row5, bitmap_mask);
 544   signbits_row6 = vand_u8(signbits_row6, bitmap_mask);
 545   signbits_row7 = vand_u8(signbits_row7, bitmap_mask);
 546
 547   bitmap_rows_01 = vpadd_u8(signbits_row0, signbits_row1);
 548   bitmap_rows_23 = vpadd_u8(signbits_row2, signbits_row3);
 549   bitmap_rows_45 = vpadd_u8(signbits_row4, signbits_row5);
 550   bitmap_rows_67 = vpadd_u8(signbits_row6, signbits_row7);
 551   bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
 552   bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
 553   bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
 554
 555 #if defined(__aarch64__) || defined(_M_ARM64)
 556   /* Move bitmap to a 64-bit scalar register. */
 557   bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
 558   /* Store signbits bitmap. */
 559   bits[1] = ~bitmap;
 560 #else
 561   /* Move bitmap to two 32-bit scalar registers. */
 562   bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
 563   bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
 564   /* Store signbits bitmap. */
 565   bits[2] = ~bitmap0;
 566   bits[3] = ~bitmap1;
 567 #endif
 568
 569   /* Construct bitmap to find EOB position (the index of the last coefficient
 570    * equal to 1.)
 571    */
 572   uint8x8_t row0_eq1 = vld1_u8(coef_eq1_bits + 0 * DCTSIZE);
 573   uint8x8_t row1_eq1 = vld1_u8(coef_eq1_bits + 1 * DCTSIZE);
 574   uint8x8_t row2_eq1 = vld1_u8(coef_eq1_bits + 2 * DCTSIZE);
 575   uint8x8_t row3_eq1 = vld1_u8(coef_eq1_bits + 3 * DCTSIZE);
 576   uint8x8_t row4_eq1 = vld1_u8(coef_eq1_bits + 4 * DCTSIZE);
 577   uint8x8_t row5_eq1 = vld1_u8(coef_eq1_bits + 5 * DCTSIZE);
 578   uint8x8_t row6_eq1 = vld1_u8(coef_eq1_bits + 6 * DCTSIZE);
 579   uint8x8_t row7_eq1 = vld1_u8(coef_eq1_bits + 7 * DCTSIZE);
 580
 581   row0_eq1 = vand_u8(row0_eq1, bitmap_mask);
 582   row1_eq1 = vand_u8(row1_eq1, bitmap_mask);
 583   row2_eq1 = vand_u8(row2_eq1, bitmap_mask);
 584   row3_eq1 = vand_u8(row3_eq1, bitmap_mask);
 585   row4_eq1 = vand_u8(row4_eq1, bitmap_mask);
 586   row5_eq1 = vand_u8(row5_eq1, bitmap_mask);
 587   row6_eq1 = vand_u8(row6_eq1, bitmap_mask);
 588   row7_eq1 = vand_u8(row7_eq1, bitmap_mask);
 589
 590   bitmap_rows_01 = vpadd_u8(row0_eq1, row1_eq1);
 591   bitmap_rows_23 = vpadd_u8(row2_eq1, row3_eq1);
 592   bitmap_rows_45 = vpadd_u8(row4_eq1, row5_eq1);
 593   bitmap_rows_67 = vpadd_u8(row6_eq1, row7_eq1);
 594   bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
 595   bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
 596   bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
 597
 598 #if defined(__aarch64__) || defined(_M_ARM64)
 599   /* Move bitmap to a 64-bit scalar register. */
 600   bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
 601
 602   /* Return EOB position. */
 603   if (bitmap == 0) {
 604     /* EOB position is defined to be 0 if all coefficients != 1. */
 605     return 0;
 606   } else {
 607     return 63 - BUILTIN_CLZLL(bitmap);
 608   }
 609 #else
 610   /* Move bitmap to two 32-bit scalar registers. */
 611   bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
 612   bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
 613
 614   /* Return EOB position. */
 615   if (bitmap0 == 0 && bitmap1 == 0) {
 616     return 0;
 617   } else if (bitmap1 != 0) {
 618     return 63 - BUILTIN_CLZ(bitmap1);
 619   } else {
 620     return 31 - BUILTIN_CLZ(bitmap0);
 621   }
 622 #endif
 623 }