simd/arm/jcphuff-neon.c

   1 /*
   2  * jcphuff-neon.c - prepare data for progressive Huffman encoding (Arm Neon)
   3  *
   4  * Copyright (C) 2020-2021, Arm Limited.  All Rights Reserved.
   5  *
   6  * This software is provided 'as-is', without any express or implied
   7  * warranty.  In no event will the authors be held liable for any damages
   8  * arising from the use of this software.
   9  *
  10  * Permission is granted to anyone to use this software for any purpose,
  11  * including commercial applications, and to alter it and redistribute it
  12  * freely, subject to the following restrictions:
  13  *
  14  * 1. The origin of this software must not be misrepresented; you must not
  15  *    claim that you wrote the original software. If you use this software
  16  *    in a product, an acknowledgment in the product documentation would be
  17  *    appreciated but is not required.
  18  * 2. Altered source versions must be plainly marked as such, and must not be
  19  *    misrepresented as being the original software.
  20  * 3. This notice may not be removed or altered from any source distribution.
  21  */
  22
  23 #define JPEG_INTERNALS
  24 #include "jconfigint.h"
  25 #include "../../jinclude.h"
  26 #include "../../jpeglib.h"
  27 #include "../../jsimd.h"
  28 #include "../../jdct.h"
  29 #include "../../jsimddct.h"
  30 #include "../jsimd.h"
  31 #include "neon-compat.h"
  32
  33 #include <arm_neon.h>
  34
  35
  36 /* Data preparation for encode_mcu_AC_first().
  37  *
  38  * The equivalent scalar C function (encode_mcu_AC_first_prepare()) can be
  39  * found in jcphuff.c.
  40  */
  41
  42 void jsimd_encode_mcu_AC_first_prepare_neon
  43   (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
  44    JCOEF *values, size_t *zerobits)
  45 {
  46   JCOEF *values_ptr = values;
  47   JCOEF *diff_values_ptr = values + DCTSIZE2;
  48
  49   /* Rows of coefficients to zero (since they haven't been processed) */
  50   int i, rows_to_zero = 8;
  51
  52   for (i = 0; i < Sl / 16; i++) {
  53     int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
  54     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
  55     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
  56     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
  57     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
  58     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
  59     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
  60     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
  61     int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
  62     coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
  63     coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
  64     coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
  65     coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
  66     coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
  67     coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
  68     coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
  69
  70     /* Isolate sign of coefficients. */
  71     int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15);
  72     int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15);
  73     /* Compute absolute value of coefficients and apply point transform Al. */
  74     int16x8_t abs_coefs1 = vabsq_s16(coefs1);
  75     int16x8_t abs_coefs2 = vabsq_s16(coefs2);
  76     coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
  77     coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
  78
  79     /* Compute diff values. */
  80     int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1);
  81     int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2);
  82
  83     /* Store transformed coefficients and diff values. */
  84     vst1q_s16(values_ptr, coefs1);
  85     vst1q_s16(values_ptr + DCTSIZE, coefs2);
  86     vst1q_s16(diff_values_ptr, diff1);
  87     vst1q_s16(diff_values_ptr + DCTSIZE, diff2);
  88     values_ptr += 16;
  89     diff_values_ptr += 16;
  90     jpeg_natural_order_start += 16;
  91     rows_to_zero -= 2;
  92   }
  93
  94   /* Same operation but for remaining partial vector */
  95   int remaining_coefs = Sl % 16;
  96   if (remaining_coefs > 8) {
  97     int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
  98     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
  99     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
 100     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
 101     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
 102     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
 103     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
 104     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
 105     int16x8_t coefs2 = vdupq_n_s16(0);
 106     switch (remaining_coefs) {
 107     case 15:
 108       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
 109       FALLTHROUGH               /*FALLTHROUGH*/
 110     case 14:
 111       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
 112       FALLTHROUGH               /*FALLTHROUGH*/
 113     case 13:
 114       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
 115       FALLTHROUGH               /*FALLTHROUGH*/
 116     case 12:
 117       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
 118       FALLTHROUGH               /*FALLTHROUGH*/
 119     case 11:
 120       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
 121       FALLTHROUGH               /*FALLTHROUGH*/
 122     case 10:
 123       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
 124       FALLTHROUGH               /*FALLTHROUGH*/
 125     case 9:
 126       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
 127       FALLTHROUGH               /*FALLTHROUGH*/
 128     default:
 129       break;
 130     }
 131
 132     /* Isolate sign of coefficients. */
 133     int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15);
 134     int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15);
 135     /* Compute absolute value of coefficients and apply point transform Al. */
 136     int16x8_t abs_coefs1 = vabsq_s16(coefs1);
 137     int16x8_t abs_coefs2 = vabsq_s16(coefs2);
 138     coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
 139     coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
 140
 141     /* Compute diff values. */
 142     int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1);
 143     int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2);
 144
 145     /* Store transformed coefficients and diff values. */
 146     vst1q_s16(values_ptr, coefs1);
 147     vst1q_s16(values_ptr + DCTSIZE, coefs2);
 148     vst1q_s16(diff_values_ptr, diff1);
 149     vst1q_s16(diff_values_ptr + DCTSIZE, diff2);
 150     values_ptr += 16;
 151     diff_values_ptr += 16;
 152     rows_to_zero -= 2;
 153
 154   } else if (remaining_coefs > 0) {
 155     int16x8_t coefs = vdupq_n_s16(0);
 156
 157     switch (remaining_coefs) {
 158     case 8:
 159       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
 160       FALLTHROUGH               /*FALLTHROUGH*/
 161     case 7:
 162       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
 163       FALLTHROUGH               /*FALLTHROUGH*/
 164     case 6:
 165       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
 166       FALLTHROUGH               /*FALLTHROUGH*/
 167     case 5:
 168       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
 169       FALLTHROUGH               /*FALLTHROUGH*/
 170     case 4:
 171       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
 172       FALLTHROUGH               /*FALLTHROUGH*/
 173     case 3:
 174       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
 175       FALLTHROUGH               /*FALLTHROUGH*/
 176     case 2:
 177       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
 178       FALLTHROUGH               /*FALLTHROUGH*/
 179     case 1:
 180       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
 181       FALLTHROUGH               /*FALLTHROUGH*/
 182     default:
 183       break;
 184     }
 185
 186     /* Isolate sign of coefficients. */
 187     int16x8_t sign_coefs = vshrq_n_s16(coefs, 15);
 188     /* Compute absolute value of coefficients and apply point transform Al. */
 189     int16x8_t abs_coefs = vabsq_s16(coefs);
 190     coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al));
 191
 192     /* Compute diff values. */
 193     int16x8_t diff = veorq_s16(coefs, sign_coefs);
 194
 195     /* Store transformed coefficients and diff values. */
 196     vst1q_s16(values_ptr, coefs);
 197     vst1q_s16(diff_values_ptr, diff);
 198     values_ptr += 8;
 199     diff_values_ptr += 8;
 200     rows_to_zero--;
 201   }
 202
 203   /* Zero remaining memory in the values and diff_values blocks. */
 204   for (i = 0; i < rows_to_zero; i++) {
 205     vst1q_s16(values_ptr, vdupq_n_s16(0));
 206     vst1q_s16(diff_values_ptr, vdupq_n_s16(0));
 207     values_ptr += 8;
 208     diff_values_ptr += 8;
 209   }
 210
 211   /* Construct zerobits bitmap.  A set bit means that the corresponding
 212    * coefficient != 0.
 213    */
 214   int16x8_t row0 = vld1q_s16(values + 0 * DCTSIZE);
 215   int16x8_t row1 = vld1q_s16(values + 1 * DCTSIZE);
 216   int16x8_t row2 = vld1q_s16(values + 2 * DCTSIZE);
 217   int16x8_t row3 = vld1q_s16(values + 3 * DCTSIZE);
 218   int16x8_t row4 = vld1q_s16(values + 4 * DCTSIZE);
 219   int16x8_t row5 = vld1q_s16(values + 5 * DCTSIZE);
 220   int16x8_t row6 = vld1q_s16(values + 6 * DCTSIZE);
 221   int16x8_t row7 = vld1q_s16(values + 7 * DCTSIZE);
 222
 223   uint8x8_t row0_eq0 = vmovn_u16(vceqq_s16(row0, vdupq_n_s16(0)));
 224   uint8x8_t row1_eq0 = vmovn_u16(vceqq_s16(row1, vdupq_n_s16(0)));
 225   uint8x8_t row2_eq0 = vmovn_u16(vceqq_s16(row2, vdupq_n_s16(0)));
 226   uint8x8_t row3_eq0 = vmovn_u16(vceqq_s16(row3, vdupq_n_s16(0)));
 227   uint8x8_t row4_eq0 = vmovn_u16(vceqq_s16(row4, vdupq_n_s16(0)));
 228   uint8x8_t row5_eq0 = vmovn_u16(vceqq_s16(row5, vdupq_n_s16(0)));
 229   uint8x8_t row6_eq0 = vmovn_u16(vceqq_s16(row6, vdupq_n_s16(0)));
 230   uint8x8_t row7_eq0 = vmovn_u16(vceqq_s16(row7, vdupq_n_s16(0)));
 231
 232   /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
 233   const uint8x8_t bitmap_mask =
 234     vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));
 235
 236   row0_eq0 = vand_u8(row0_eq0, bitmap_mask);
 237   row1_eq0 = vand_u8(row1_eq0, bitmap_mask);
 238   row2_eq0 = vand_u8(row2_eq0, bitmap_mask);
 239   row3_eq0 = vand_u8(row3_eq0, bitmap_mask);
 240   row4_eq0 = vand_u8(row4_eq0, bitmap_mask);
 241   row5_eq0 = vand_u8(row5_eq0, bitmap_mask);
 242   row6_eq0 = vand_u8(row6_eq0, bitmap_mask);
 243   row7_eq0 = vand_u8(row7_eq0, bitmap_mask);
 244
 245   uint8x8_t bitmap_rows_01 = vpadd_u8(row0_eq0, row1_eq0);
 246   uint8x8_t bitmap_rows_23 = vpadd_u8(row2_eq0, row3_eq0);
 247   uint8x8_t bitmap_rows_45 = vpadd_u8(row4_eq0, row5_eq0);
 248   uint8x8_t bitmap_rows_67 = vpadd_u8(row6_eq0, row7_eq0);
 249   uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
 250   uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
 251   uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
 252
 253 #if defined(__aarch64__) || defined(_M_ARM64)
 254   /* Move bitmap to a 64-bit scalar register. */
 255   uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
 256   /* Store zerobits bitmap. */
 257   *zerobits = ~bitmap;
 258 #else
 259   /* Move bitmap to two 32-bit scalar registers. */
 260   uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
 261   uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
 262   /* Store zerobits bitmap. */
 263   zerobits[0] = ~bitmap0;
 264   zerobits[1] = ~bitmap1;
 265 #endif
 266 }
 267
 268
 269 /* Data preparation for encode_mcu_AC_refine().
 270  *
 271  * The equivalent scalar C function (encode_mcu_AC_refine_prepare()) can be
 272  * found in jcphuff.c.
 273  */
 274
 275 int jsimd_encode_mcu_AC_refine_prepare_neon
 276   (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
 277    JCOEF *absvalues, size_t *bits)
 278 {
 279   /* Temporary storage buffers for data used to compute the signbits bitmap and
 280    * the end-of-block (EOB) position
 281    */
 282   uint8_t coef_sign_bits[64];
 283   uint8_t coef_eq1_bits[64];
 284
 285   JCOEF *absvalues_ptr = absvalues;
 286   uint8_t *coef_sign_bits_ptr = coef_sign_bits;
 287   uint8_t *eq1_bits_ptr = coef_eq1_bits;
 288
 289   /* Rows of coefficients to zero (since they haven't been processed) */
 290   int i, rows_to_zero = 8;
 291
 292   for (i = 0; i < Sl / 16; i++) {
 293     int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
 294     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
 295     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
 296     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
 297     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
 298     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
 299     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
 300     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
 301     int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
 302     coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
 303     coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
 304     coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
 305     coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
 306     coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
 307     coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
 308     coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
 309
 310     /* Compute and store data for signbits bitmap. */
 311     uint8x8_t sign_coefs1 =
 312       vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
 313     uint8x8_t sign_coefs2 =
 314       vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
 315     vst1_u8(coef_sign_bits_ptr, sign_coefs1);
 316     vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
 317
 318     /* Compute absolute value of coefficients and apply point transform Al. */
 319     int16x8_t abs_coefs1 = vabsq_s16(coefs1);
 320     int16x8_t abs_coefs2 = vabsq_s16(coefs2);
 321     coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
 322     coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
 323     vst1q_s16(absvalues_ptr, coefs1);
 324     vst1q_s16(absvalues_ptr + DCTSIZE, coefs2);
 325
 326     /* Test whether transformed coefficient values == 1 (used to find EOB
 327      * position.)
 328      */
 329     uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1)));
 330     uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1)));
 331     vst1_u8(eq1_bits_ptr, coefs_eq11);
 332     vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
 333
 334     absvalues_ptr += 16;
 335     coef_sign_bits_ptr += 16;
 336     eq1_bits_ptr += 16;
 337     jpeg_natural_order_start += 16;
 338     rows_to_zero -= 2;
 339   }
 340
 341   /* Same operation but for remaining partial vector */
 342   int remaining_coefs = Sl % 16;
 343   if (remaining_coefs > 8) {
 344     int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
 345     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
 346     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
 347     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
 348     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
 349     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
 350     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
 351     coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
 352     int16x8_t coefs2 = vdupq_n_s16(0);
 353     switch (remaining_coefs) {
 354     case 15:
 355       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
 356       FALLTHROUGH               /*FALLTHROUGH*/
 357     case 14:
 358       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
 359       FALLTHROUGH               /*FALLTHROUGH*/
 360     case 13:
 361       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
 362       FALLTHROUGH               /*FALLTHROUGH*/
 363     case 12:
 364       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
 365       FALLTHROUGH               /*FALLTHROUGH*/
 366     case 11:
 367       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
 368       FALLTHROUGH               /*FALLTHROUGH*/
 369     case 10:
 370       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
 371       FALLTHROUGH               /*FALLTHROUGH*/
 372     case 9:
 373       coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
 374       FALLTHROUGH               /*FALLTHROUGH*/
 375     default:
 376       break;
 377     }
 378
 379     /* Compute and store data for signbits bitmap. */
 380     uint8x8_t sign_coefs1 =
 381       vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
 382     uint8x8_t sign_coefs2 =
 383       vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
 384     vst1_u8(coef_sign_bits_ptr, sign_coefs1);
 385     vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
 386
 387     /* Compute absolute value of coefficients and apply point transform Al. */
 388     int16x8_t abs_coefs1 = vabsq_s16(coefs1);
 389     int16x8_t abs_coefs2 = vabsq_s16(coefs2);
 390     coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
 391     coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
 392     vst1q_s16(absvalues_ptr, coefs1);
 393     vst1q_s16(absvalues_ptr + DCTSIZE, coefs2);
 394
 395     /* Test whether transformed coefficient values == 1 (used to find EOB
 396      * position.)
 397      */
 398     uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1)));
 399     uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1)));
 400     vst1_u8(eq1_bits_ptr, coefs_eq11);
 401     vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
 402
 403     absvalues_ptr += 16;
 404     coef_sign_bits_ptr += 16;
 405     eq1_bits_ptr += 16;
 406     jpeg_natural_order_start += 16;
 407     rows_to_zero -= 2;
 408
 409   } else if (remaining_coefs > 0) {
 410     int16x8_t coefs = vdupq_n_s16(0);
 411
 412     switch (remaining_coefs) {
 413     case 8:
 414       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
 415       FALLTHROUGH               /*FALLTHROUGH*/
 416     case 7:
 417       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
 418       FALLTHROUGH               /*FALLTHROUGH*/
 419     case 6:
 420       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
 421       FALLTHROUGH               /*FALLTHROUGH*/
 422     case 5:
 423       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
 424       FALLTHROUGH               /*FALLTHROUGH*/
 425     case 4:
 426       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
 427       FALLTHROUGH               /*FALLTHROUGH*/
 428     case 3:
 429       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
 430       FALLTHROUGH               /*FALLTHROUGH*/
 431     case 2:
 432       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
 433       FALLTHROUGH               /*FALLTHROUGH*/
 434     case 1:
 435       coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
 436       FALLTHROUGH               /*FALLTHROUGH*/
 437     default:
 438       break;
 439     }
 440
 441     /* Compute and store data for signbits bitmap. */
 442     uint8x8_t sign_coefs =
 443       vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15)));
 444     vst1_u8(coef_sign_bits_ptr, sign_coefs);
 445
 446     /* Compute absolute value of coefficients and apply point transform Al. */
 447     int16x8_t abs_coefs = vabsq_s16(coefs);
 448     coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al));
 449     vst1q_s16(absvalues_ptr, coefs);
 450
 451     /* Test whether transformed coefficient values == 1 (used to find EOB
 452      * position.)
 453      */
 454     uint8x8_t coefs_eq1 = vmovn_u16(vceqq_s16(coefs, vdupq_n_s16(1)));
 455     vst1_u8(eq1_bits_ptr, coefs_eq1);
 456
 457     absvalues_ptr += 8;
 458     coef_sign_bits_ptr += 8;
 459     eq1_bits_ptr += 8;
 460     rows_to_zero--;
 461   }
 462
 463   /* Zero remaining memory in blocks. */
 464   for (i = 0; i < rows_to_zero; i++) {
 465     vst1q_s16(absvalues_ptr, vdupq_n_s16(0));
 466     vst1_u8(coef_sign_bits_ptr, vdup_n_u8(0));
 467     vst1_u8(eq1_bits_ptr, vdup_n_u8(0));
 468     absvalues_ptr += 8;
 469     coef_sign_bits_ptr += 8;
 470     eq1_bits_ptr += 8;
 471   }
 472
 473   /* Construct zerobits bitmap. */
 474   int16x8_t abs_row0 = vld1q_s16(absvalues + 0 * DCTSIZE);
 475   int16x8_t abs_row1 = vld1q_s16(absvalues + 1 * DCTSIZE);
 476   int16x8_t abs_row2 = vld1q_s16(absvalues + 2 * DCTSIZE);
 477   int16x8_t abs_row3 = vld1q_s16(absvalues + 3 * DCTSIZE);
 478   int16x8_t abs_row4 = vld1q_s16(absvalues + 4 * DCTSIZE);
 479   int16x8_t abs_row5 = vld1q_s16(absvalues + 5 * DCTSIZE);
 480   int16x8_t abs_row6 = vld1q_s16(absvalues + 6 * DCTSIZE);
 481   int16x8_t abs_row7 = vld1q_s16(absvalues + 7 * DCTSIZE);
 482
 483   uint8x8_t abs_row0_eq0 = vmovn_u16(vceqq_s16(abs_row0, vdupq_n_s16(0)));
 484   uint8x8_t abs_row1_eq0 = vmovn_u16(vceqq_s16(abs_row1, vdupq_n_s16(0)));
 485   uint8x8_t abs_row2_eq0 = vmovn_u16(vceqq_s16(abs_row2, vdupq_n_s16(0)));
 486   uint8x8_t abs_row3_eq0 = vmovn_u16(vceqq_s16(abs_row3, vdupq_n_s16(0)));
 487   uint8x8_t abs_row4_eq0 = vmovn_u16(vceqq_s16(abs_row4, vdupq_n_s16(0)));
 488   uint8x8_t abs_row5_eq0 = vmovn_u16(vceqq_s16(abs_row5, vdupq_n_s16(0)));
 489   uint8x8_t abs_row6_eq0 = vmovn_u16(vceqq_s16(abs_row6, vdupq_n_s16(0)));
 490   uint8x8_t abs_row7_eq0 = vmovn_u16(vceqq_s16(abs_row7, vdupq_n_s16(0)));
 491
 492   /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
 493   const uint8x8_t bitmap_mask =
 494     vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));
 495
 496   abs_row0_eq0 = vand_u8(abs_row0_eq0, bitmap_mask);
 497   abs_row1_eq0 = vand_u8(abs_row1_eq0, bitmap_mask);
 498   abs_row2_eq0 = vand_u8(abs_row2_eq0, bitmap_mask);
 499   abs_row3_eq0 = vand_u8(abs_row3_eq0, bitmap_mask);
 500   abs_row4_eq0 = vand_u8(abs_row4_eq0, bitmap_mask);
 501   abs_row5_eq0 = vand_u8(abs_row5_eq0, bitmap_mask);
 502   abs_row6_eq0 = vand_u8(abs_row6_eq0, bitmap_mask);
 503   abs_row7_eq0 = vand_u8(abs_row7_eq0, bitmap_mask);
 504
 505   uint8x8_t bitmap_rows_01 = vpadd_u8(abs_row0_eq0, abs_row1_eq0);
 506   uint8x8_t bitmap_rows_23 = vpadd_u8(abs_row2_eq0, abs_row3_eq0);
 507   uint8x8_t bitmap_rows_45 = vpadd_u8(abs_row4_eq0, abs_row5_eq0);
 508   uint8x8_t bitmap_rows_67 = vpadd_u8(abs_row6_eq0, abs_row7_eq0);
 509   uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
 510   uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
 511   uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
 512
 513 #if defined(__aarch64__) || defined(_M_ARM64)
 514   /* Move bitmap to a 64-bit scalar register. */
 515   uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
 516   /* Store zerobits bitmap. */
 517   bits[0] = ~bitmap;
 518 #else
 519   /* Move bitmap to two 32-bit scalar registers. */
 520   uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
 521   uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
 522   /* Store zerobits bitmap. */
 523   bits[0] = ~bitmap0;
 524   bits[1] = ~bitmap1;
 525 #endif
 526
 527   /* Construct signbits bitmap. */
 528   uint8x8_t signbits_row0 = vld1_u8(coef_sign_bits + 0 * DCTSIZE);
 529   uint8x8_t signbits_row1 = vld1_u8(coef_sign_bits + 1 * DCTSIZE);
 530   uint8x8_t signbits_row2 = vld1_u8(coef_sign_bits + 2 * DCTSIZE);
 531   uint8x8_t signbits_row3 = vld1_u8(coef_sign_bits + 3 * DCTSIZE);
 532   uint8x8_t signbits_row4 = vld1_u8(coef_sign_bits + 4 * DCTSIZE);
 533   uint8x8_t signbits_row5 = vld1_u8(coef_sign_bits + 5 * DCTSIZE);
 534   uint8x8_t signbits_row6 = vld1_u8(coef_sign_bits + 6 * DCTSIZE);
 535   uint8x8_t signbits_row7 = vld1_u8(coef_sign_bits + 7 * DCTSIZE);
 536
 537   signbits_row0 = vand_u8(signbits_row0, bitmap_mask);
 538   signbits_row1 = vand_u8(signbits_row1, bitmap_mask);
 539   signbits_row2 = vand_u8(signbits_row2, bitmap_mask);
 540   signbits_row3 = vand_u8(signbits_row3, bitmap_mask);
 541   signbits_row4 = vand_u8(signbits_row4, bitmap_mask);
 542   signbits_row5 = vand_u8(signbits_row5, bitmap_mask);
 543   signbits_row6 = vand_u8(signbits_row6, bitmap_mask);
 544   signbits_row7 = vand_u8(signbits_row7, bitmap_mask);
 545
 546   bitmap_rows_01 = vpadd_u8(signbits_row0, signbits_row1);
 547   bitmap_rows_23 = vpadd_u8(signbits_row2, signbits_row3);
 548   bitmap_rows_45 = vpadd_u8(signbits_row4, signbits_row5);
 549   bitmap_rows_67 = vpadd_u8(signbits_row6, signbits_row7);
 550   bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
 551   bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
 552   bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
 553
 554 #if defined(__aarch64__) || defined(_M_ARM64)
 555   /* Move bitmap to a 64-bit scalar register. */
 556   bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
 557   /* Store signbits bitmap. */
 558   bits[1] = ~bitmap;
 559 #else
 560   /* Move bitmap to two 32-bit scalar registers. */
 561   bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
 562   bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
 563   /* Store signbits bitmap. */
 564   bits[2] = ~bitmap0;
 565   bits[3] = ~bitmap1;
 566 #endif
 567
 568   /* Construct bitmap to find EOB position (the index of the last coefficient
 569    * equal to 1.)
 570    */
 571   uint8x8_t row0_eq1 = vld1_u8(coef_eq1_bits + 0 * DCTSIZE);
 572   uint8x8_t row1_eq1 = vld1_u8(coef_eq1_bits + 1 * DCTSIZE);
 573   uint8x8_t row2_eq1 = vld1_u8(coef_eq1_bits + 2 * DCTSIZE);
 574   uint8x8_t row3_eq1 = vld1_u8(coef_eq1_bits + 3 * DCTSIZE);
 575   uint8x8_t row4_eq1 = vld1_u8(coef_eq1_bits + 4 * DCTSIZE);
 576   uint8x8_t row5_eq1 = vld1_u8(coef_eq1_bits + 5 * DCTSIZE);
 577   uint8x8_t row6_eq1 = vld1_u8(coef_eq1_bits + 6 * DCTSIZE);
 578   uint8x8_t row7_eq1 = vld1_u8(coef_eq1_bits + 7 * DCTSIZE);
 579
 580   row0_eq1 = vand_u8(row0_eq1, bitmap_mask);
 581   row1_eq1 = vand_u8(row1_eq1, bitmap_mask);
 582   row2_eq1 = vand_u8(row2_eq1, bitmap_mask);
 583   row3_eq1 = vand_u8(row3_eq1, bitmap_mask);
 584   row4_eq1 = vand_u8(row4_eq1, bitmap_mask);
 585   row5_eq1 = vand_u8(row5_eq1, bitmap_mask);
 586   row6_eq1 = vand_u8(row6_eq1, bitmap_mask);
 587   row7_eq1 = vand_u8(row7_eq1, bitmap_mask);
 588
 589   bitmap_rows_01 = vpadd_u8(row0_eq1, row1_eq1);
 590   bitmap_rows_23 = vpadd_u8(row2_eq1, row3_eq1);
 591   bitmap_rows_45 = vpadd_u8(row4_eq1, row5_eq1);
 592   bitmap_rows_67 = vpadd_u8(row6_eq1, row7_eq1);
 593   bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
 594   bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
 595   bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
 596
 597 #if defined(__aarch64__) || defined(_M_ARM64)
 598   /* Move bitmap to a 64-bit scalar register. */
 599   bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
 600
 601   /* Return EOB position. */
 602   if (bitmap == 0) {
 603     /* EOB position is defined to be 0 if all coefficients != 1. */
 604     return 0;
 605   } else {
 606     return 63 - BUILTIN_CLZLL(bitmap);
 607   }
 608 #else
 609   /* Move bitmap to two 32-bit scalar registers. */
 610   bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
 611   bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
 612
 613   /* Return EOB position. */
 614   if (bitmap0 == 0 && bitmap1 == 0) {
 615     return 0;
 616   } else if (bitmap1 != 0) {
 617     return 63 - BUILTIN_CLZ(bitmap1);
 618   } else {
 619     return 31 - BUILTIN_CLZ(bitmap0);
 620   }
 621 #endif
 622 }