/*
 * jcphuff-neon.c - prepare data for progressive Huffman encoding (Arm Neon)
 *
 * Copyright (C) 2020-2021, Arm Limited.  All Rights Reserved.
 * Copyright (C) 2022, Matthieu Darbois.  All Rights Reserved.
 * Copyright (C) 2022, D. R. Commander.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */
#define JPEG_INTERNALS
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "neon-compat.h"

#include <arm_neon.h>

/* Data preparation for encode_mcu_AC_first().
 *
 * The equivalent scalar C function (encode_mcu_AC_first_prepare()) can be
 * found in jcphuff.c.
 */

void jsimd_encode_mcu_AC_first_prepare_neon
  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
   UJCOEF *values, size_t *zerobits)
{
  UJCOEF *values_ptr = values;
  UJCOEF *diff_values_ptr = values + DCTSIZE2;

  /* Rows of coefficients to zero (since they haven't been processed) */
  int i, rows_to_zero = 8;
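  /* Note: values[] and the diff values stored after it each hold
   * DCTSIZE2 = 64 entries (8 rows of 8).  Each iteration of the main loop
   * below produces two rows (16 values); any rows not written by the loops
   * are zeroed at the end.
   */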
  for (i = 0; i < Sl / 16; i++) {
    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
    int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
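    /* Note: the coefficients are gathered lane by lane because they are
     * visited in zigzag (jpeg_natural_order) order, for which Neon provides
     * no gather-load instruction.
     */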
    /* Isolate sign of coefficients. */
    uint16x8_t sign_coefs1 = vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15));
    uint16x8_t sign_coefs2 = vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15));
    /* Compute absolute value of coefficients and apply point transform Al. */
    uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
    uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
    abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
    abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));

    /* Compute diff values. */
    uint16x8_t diff1 = veorq_u16(abs_coefs1, sign_coefs1);
    uint16x8_t diff2 = veorq_u16(abs_coefs2, sign_coefs2);
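    /* XORing with the sign mask is equivalent to
     *   diff = (coef < 0) ? (UJCOEF)~abs_coef : abs_coef;
     * i.e. a negative coefficient stores the one's complement of its shifted
     * absolute value, matching the diff values produced by the scalar
     * encode_mcu_AC_first_prepare().
     */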
    /* Store transformed coefficients and diff values. */
    vst1q_u16(values_ptr, abs_coefs1);
    vst1q_u16(values_ptr + DCTSIZE, abs_coefs2);
    vst1q_u16(diff_values_ptr, diff1);
    vst1q_u16(diff_values_ptr + DCTSIZE, diff2);

    values_ptr += 16;
    diff_values_ptr += 16;
    jpeg_natural_order_start += 16;
    rows_to_zero -= 2;
  }
  /* Same operation but for remaining partial vector */
  int remaining_coefs = Sl % 16;
  if (remaining_coefs > 8) {
    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
    int16x8_t coefs2 = vdupq_n_s16(0);
    switch (remaining_coefs) {
    case 15:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
      FALLTHROUGH /*FALLTHROUGH*/
    case 14:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
      FALLTHROUGH /*FALLTHROUGH*/
    case 13:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
      FALLTHROUGH /*FALLTHROUGH*/
    case 12:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
      FALLTHROUGH /*FALLTHROUGH*/
    case 11:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
      FALLTHROUGH /*FALLTHROUGH*/
    case 10:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
      FALLTHROUGH /*FALLTHROUGH*/
    case 9:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
      FALLTHROUGH /*FALLTHROUGH*/
    default:
      break;
    }
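    /* The descending case labels with fallthrough load exactly
     * (remaining_coefs - 8) coefficients into the low lanes of coefs2; the
     * unwritten lanes keep the zero value they were initialized with.
     */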
    /* Isolate sign of coefficients. */
    uint16x8_t sign_coefs1 = vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15));
    uint16x8_t sign_coefs2 = vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15));
    /* Compute absolute value of coefficients and apply point transform Al. */
    uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
    uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
    abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
    abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));

    /* Compute diff values. */
    uint16x8_t diff1 = veorq_u16(abs_coefs1, sign_coefs1);
    uint16x8_t diff2 = veorq_u16(abs_coefs2, sign_coefs2);

    /* Store transformed coefficients and diff values. */
    vst1q_u16(values_ptr, abs_coefs1);
    vst1q_u16(values_ptr + DCTSIZE, abs_coefs2);
    vst1q_u16(diff_values_ptr, diff1);
    vst1q_u16(diff_values_ptr + DCTSIZE, diff2);

    values_ptr += 16;
    diff_values_ptr += 16;
    rows_to_zero -= 2;
  } else if (remaining_coefs > 0) {
    int16x8_t coefs = vdupq_n_s16(0);

    switch (remaining_coefs) {
    case 8:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
      FALLTHROUGH /*FALLTHROUGH*/
    case 7:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
      FALLTHROUGH /*FALLTHROUGH*/
    case 6:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
      FALLTHROUGH /*FALLTHROUGH*/
    case 5:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
      FALLTHROUGH /*FALLTHROUGH*/
    case 4:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
      FALLTHROUGH /*FALLTHROUGH*/
    case 3:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
      FALLTHROUGH /*FALLTHROUGH*/
    case 2:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
      FALLTHROUGH /*FALLTHROUGH*/
    case 1:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
      FALLTHROUGH /*FALLTHROUGH*/
    default:
      break;
    }
    /* Isolate sign of coefficients. */
    uint16x8_t sign_coefs = vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15));
    /* Compute absolute value of coefficients and apply point transform Al. */
    uint16x8_t abs_coefs = vreinterpretq_u16_s16(vabsq_s16(coefs));
    abs_coefs = vshlq_u16(abs_coefs, vdupq_n_s16(-Al));

    /* Compute diff values. */
    uint16x8_t diff = veorq_u16(abs_coefs, sign_coefs);

    /* Store transformed coefficients and diff values. */
    vst1q_u16(values_ptr, abs_coefs);
    vst1q_u16(diff_values_ptr, diff);

    values_ptr += 8;
    diff_values_ptr += 8;
    rows_to_zero--;
  }
  /* Zero remaining memory in the values and diff_values blocks. */
  for (i = 0; i < rows_to_zero; i++) {
    vst1q_u16(values_ptr, vdupq_n_u16(0));
    vst1q_u16(diff_values_ptr, vdupq_n_u16(0));
    values_ptr += 8;
    diff_values_ptr += 8;
  }
  /* Construct zerobits bitmap.  A set bit means that the corresponding
   * coefficient != 0.
   */
  uint16x8_t row0 = vld1q_u16(values + 0 * DCTSIZE);
  uint16x8_t row1 = vld1q_u16(values + 1 * DCTSIZE);
  uint16x8_t row2 = vld1q_u16(values + 2 * DCTSIZE);
  uint16x8_t row3 = vld1q_u16(values + 3 * DCTSIZE);
  uint16x8_t row4 = vld1q_u16(values + 4 * DCTSIZE);
  uint16x8_t row5 = vld1q_u16(values + 5 * DCTSIZE);
  uint16x8_t row6 = vld1q_u16(values + 6 * DCTSIZE);
  uint16x8_t row7 = vld1q_u16(values + 7 * DCTSIZE);
  uint8x8_t row0_eq0 = vmovn_u16(vceqq_u16(row0, vdupq_n_u16(0)));
  uint8x8_t row1_eq0 = vmovn_u16(vceqq_u16(row1, vdupq_n_u16(0)));
  uint8x8_t row2_eq0 = vmovn_u16(vceqq_u16(row2, vdupq_n_u16(0)));
  uint8x8_t row3_eq0 = vmovn_u16(vceqq_u16(row3, vdupq_n_u16(0)));
  uint8x8_t row4_eq0 = vmovn_u16(vceqq_u16(row4, vdupq_n_u16(0)));
  uint8x8_t row5_eq0 = vmovn_u16(vceqq_u16(row5, vdupq_n_u16(0)));
  uint8x8_t row6_eq0 = vmovn_u16(vceqq_u16(row6, vdupq_n_u16(0)));
  uint8x8_t row7_eq0 = vmovn_u16(vceqq_u16(row7, vdupq_n_u16(0)));

  /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
  const uint8x8_t bitmap_mask =
    vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));

  row0_eq0 = vand_u8(row0_eq0, bitmap_mask);
  row1_eq0 = vand_u8(row1_eq0, bitmap_mask);
  row2_eq0 = vand_u8(row2_eq0, bitmap_mask);
  row3_eq0 = vand_u8(row3_eq0, bitmap_mask);
  row4_eq0 = vand_u8(row4_eq0, bitmap_mask);
  row5_eq0 = vand_u8(row5_eq0, bitmap_mask);
  row6_eq0 = vand_u8(row6_eq0, bitmap_mask);
  row7_eq0 = vand_u8(row7_eq0, bitmap_mask);

  uint8x8_t bitmap_rows_01 = vpadd_u8(row0_eq0, row1_eq0);
  uint8x8_t bitmap_rows_23 = vpadd_u8(row2_eq0, row3_eq0);
  uint8x8_t bitmap_rows_45 = vpadd_u8(row4_eq0, row5_eq0);
  uint8x8_t bitmap_rows_67 = vpadd_u8(row6_eq0, row7_eq0);
  uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
  uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
  uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
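  /* After the three rounds of pairwise addition, byte i of bitmap_all is the
   * bitmap for row i, with bit j set if values[i * DCTSIZE + j] == 0.  The
   * inversion below flips the meaning so that a set bit in zerobits marks a
   * nonzero coefficient.
   */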
#if defined(__aarch64__) || defined(_M_ARM64)
  /* Move bitmap to a 64-bit scalar register. */
  uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
  /* Store zerobits bitmap. */
  *zerobits = ~bitmap;
#else
  /* Move bitmap to two 32-bit scalar registers. */
  uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
  uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
  /* Store zerobits bitmap. */
  zerobits[0] = ~bitmap0;
  zerobits[1] = ~bitmap1;
#endif
}

/* Data preparation for encode_mcu_AC_refine().
 *
 * The equivalent scalar C function (encode_mcu_AC_refine_prepare()) can be
 * found in jcphuff.c.
 */

int jsimd_encode_mcu_AC_refine_prepare_neon
  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
   UJCOEF *absvalues, size_t *bits)
{
  /* Temporary storage buffers for data used to compute the signbits bitmap and
   * the end-of-block (EOB) position
   */
  uint8_t coef_sign_bits[64];
  uint8_t coef_eq1_bits[64];

  UJCOEF *absvalues_ptr = absvalues;
  uint8_t *coef_sign_bits_ptr = coef_sign_bits;
  uint8_t *eq1_bits_ptr = coef_eq1_bits;

  /* Rows of coefficients to zero (since they haven't been processed) */
  int i, rows_to_zero = 8;
  for (i = 0; i < Sl / 16; i++) {
    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
    int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
    /* Compute and store data for signbits bitmap. */
    uint8x8_t sign_coefs1 =
      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
    uint8x8_t sign_coefs2 =
      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
    vst1_u8(coef_sign_bits_ptr, sign_coefs1);
    vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
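    /* Each stored byte is 0xFF for a negative coefficient and 0x00 otherwise;
     * these bytes are folded into the signbits bitmap after the loops.
     */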
    /* Compute absolute value of coefficients and apply point transform Al. */
    uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
    uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
    abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
    abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
    vst1q_u16(absvalues_ptr, abs_coefs1);
    vst1q_u16(absvalues_ptr + DCTSIZE, abs_coefs2);

    /* Test whether transformed coefficient values == 1 (used to find EOB
     * position.)
     */
    uint8x8_t coefs_eq11 = vmovn_u16(vceqq_u16(abs_coefs1, vdupq_n_u16(1)));
    uint8x8_t coefs_eq12 = vmovn_u16(vceqq_u16(abs_coefs2, vdupq_n_u16(1)));
    vst1_u8(eq1_bits_ptr, coefs_eq11);
    vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
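    /* Rationale: in a refinement scan, the EOB position is the index of the
     * last coefficient that the scan newly makes nonzero, and such a
     * coefficient has an absolute value of exactly 1 after the point
     * transform.
     */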
    absvalues_ptr += 16;
    coef_sign_bits_ptr += 16;
    eq1_bits_ptr += 16;
    jpeg_natural_order_start += 16;
    rows_to_zero -= 2;
  }
  /* Same operation but for remaining partial vector */
  int remaining_coefs = Sl % 16;
  if (remaining_coefs > 8) {
    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
    int16x8_t coefs2 = vdupq_n_s16(0);
    switch (remaining_coefs) {
    case 15:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
      FALLTHROUGH /*FALLTHROUGH*/
    case 14:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
      FALLTHROUGH /*FALLTHROUGH*/
    case 13:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
      FALLTHROUGH /*FALLTHROUGH*/
    case 12:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
      FALLTHROUGH /*FALLTHROUGH*/
    case 11:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
      FALLTHROUGH /*FALLTHROUGH*/
    case 10:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
      FALLTHROUGH /*FALLTHROUGH*/
    case 9:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
      FALLTHROUGH /*FALLTHROUGH*/
    default:
      break;
    }
    /* Compute and store data for signbits bitmap. */
    uint8x8_t sign_coefs1 =
      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
    uint8x8_t sign_coefs2 =
      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
    vst1_u8(coef_sign_bits_ptr, sign_coefs1);
    vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);

    /* Compute absolute value of coefficients and apply point transform Al. */
    uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
    uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
    abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
    abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
    vst1q_u16(absvalues_ptr, abs_coefs1);
    vst1q_u16(absvalues_ptr + DCTSIZE, abs_coefs2);

    /* Test whether transformed coefficient values == 1 (used to find EOB
     * position.)
     */
    uint8x8_t coefs_eq11 = vmovn_u16(vceqq_u16(abs_coefs1, vdupq_n_u16(1)));
    uint8x8_t coefs_eq12 = vmovn_u16(vceqq_u16(abs_coefs2, vdupq_n_u16(1)));
    vst1_u8(eq1_bits_ptr, coefs_eq11);
    vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
    absvalues_ptr += 16;
    coef_sign_bits_ptr += 16;
    eq1_bits_ptr += 16;
    jpeg_natural_order_start += 16;
    rows_to_zero -= 2;
  } else if (remaining_coefs > 0) {
    int16x8_t coefs = vdupq_n_s16(0);

    switch (remaining_coefs) {
    case 8:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
      FALLTHROUGH /*FALLTHROUGH*/
    case 7:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
      FALLTHROUGH /*FALLTHROUGH*/
    case 6:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
      FALLTHROUGH /*FALLTHROUGH*/
    case 5:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
      FALLTHROUGH /*FALLTHROUGH*/
    case 4:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
      FALLTHROUGH /*FALLTHROUGH*/
    case 3:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
      FALLTHROUGH /*FALLTHROUGH*/
    case 2:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
      FALLTHROUGH /*FALLTHROUGH*/
    case 1:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
      FALLTHROUGH /*FALLTHROUGH*/
    default:
      break;
    }
    /* Compute and store data for signbits bitmap. */
    uint8x8_t sign_coefs =
      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15)));
    vst1_u8(coef_sign_bits_ptr, sign_coefs);

    /* Compute absolute value of coefficients and apply point transform Al. */
    uint16x8_t abs_coefs = vreinterpretq_u16_s16(vabsq_s16(coefs));
    abs_coefs = vshlq_u16(abs_coefs, vdupq_n_s16(-Al));
    vst1q_u16(absvalues_ptr, abs_coefs);

    /* Test whether transformed coefficient values == 1 (used to find EOB
     * position.)
     */
    uint8x8_t coefs_eq1 = vmovn_u16(vceqq_u16(abs_coefs, vdupq_n_u16(1)));
    vst1_u8(eq1_bits_ptr, coefs_eq1);

    absvalues_ptr += 8;
    coef_sign_bits_ptr += 8;
    eq1_bits_ptr += 8;
    rows_to_zero--;
  }
  /* Zero remaining memory in blocks. */
  for (i = 0; i < rows_to_zero; i++) {
    vst1q_u16(absvalues_ptr, vdupq_n_u16(0));
    vst1_u8(coef_sign_bits_ptr, vdup_n_u8(0));
    vst1_u8(eq1_bits_ptr, vdup_n_u8(0));
    absvalues_ptr += 8;
    coef_sign_bits_ptr += 8;
    eq1_bits_ptr += 8;
  }
  /* Construct zerobits bitmap. */
  uint16x8_t abs_row0 = vld1q_u16(absvalues + 0 * DCTSIZE);
  uint16x8_t abs_row1 = vld1q_u16(absvalues + 1 * DCTSIZE);
  uint16x8_t abs_row2 = vld1q_u16(absvalues + 2 * DCTSIZE);
  uint16x8_t abs_row3 = vld1q_u16(absvalues + 3 * DCTSIZE);
  uint16x8_t abs_row4 = vld1q_u16(absvalues + 4 * DCTSIZE);
  uint16x8_t abs_row5 = vld1q_u16(absvalues + 5 * DCTSIZE);
  uint16x8_t abs_row6 = vld1q_u16(absvalues + 6 * DCTSIZE);
  uint16x8_t abs_row7 = vld1q_u16(absvalues + 7 * DCTSIZE);

  uint8x8_t abs_row0_eq0 = vmovn_u16(vceqq_u16(abs_row0, vdupq_n_u16(0)));
  uint8x8_t abs_row1_eq0 = vmovn_u16(vceqq_u16(abs_row1, vdupq_n_u16(0)));
  uint8x8_t abs_row2_eq0 = vmovn_u16(vceqq_u16(abs_row2, vdupq_n_u16(0)));
  uint8x8_t abs_row3_eq0 = vmovn_u16(vceqq_u16(abs_row3, vdupq_n_u16(0)));
  uint8x8_t abs_row4_eq0 = vmovn_u16(vceqq_u16(abs_row4, vdupq_n_u16(0)));
  uint8x8_t abs_row5_eq0 = vmovn_u16(vceqq_u16(abs_row5, vdupq_n_u16(0)));
  uint8x8_t abs_row6_eq0 = vmovn_u16(vceqq_u16(abs_row6, vdupq_n_u16(0)));
  uint8x8_t abs_row7_eq0 = vmovn_u16(vceqq_u16(abs_row7, vdupq_n_u16(0)));

  /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
  const uint8x8_t bitmap_mask =
    vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));

  abs_row0_eq0 = vand_u8(abs_row0_eq0, bitmap_mask);
  abs_row1_eq0 = vand_u8(abs_row1_eq0, bitmap_mask);
  abs_row2_eq0 = vand_u8(abs_row2_eq0, bitmap_mask);
  abs_row3_eq0 = vand_u8(abs_row3_eq0, bitmap_mask);
  abs_row4_eq0 = vand_u8(abs_row4_eq0, bitmap_mask);
  abs_row5_eq0 = vand_u8(abs_row5_eq0, bitmap_mask);
  abs_row6_eq0 = vand_u8(abs_row6_eq0, bitmap_mask);
  abs_row7_eq0 = vand_u8(abs_row7_eq0, bitmap_mask);

  uint8x8_t bitmap_rows_01 = vpadd_u8(abs_row0_eq0, abs_row1_eq0);
  uint8x8_t bitmap_rows_23 = vpadd_u8(abs_row2_eq0, abs_row3_eq0);
  uint8x8_t bitmap_rows_45 = vpadd_u8(abs_row4_eq0, abs_row5_eq0);
  uint8x8_t bitmap_rows_67 = vpadd_u8(abs_row6_eq0, abs_row7_eq0);
  uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
  uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
  uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
#if defined(__aarch64__) || defined(_M_ARM64)
  /* Move bitmap to a 64-bit scalar register. */
  uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
  /* Store zerobits bitmap. */
  bits[0] = ~bitmap;
#else
  /* Move bitmap to two 32-bit scalar registers. */
  uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
  uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
  /* Store zerobits bitmap. */
  bits[0] = ~bitmap0;
  bits[1] = ~bitmap1;
#endif
  /* Construct signbits bitmap. */
  uint8x8_t signbits_row0 = vld1_u8(coef_sign_bits + 0 * DCTSIZE);
  uint8x8_t signbits_row1 = vld1_u8(coef_sign_bits + 1 * DCTSIZE);
  uint8x8_t signbits_row2 = vld1_u8(coef_sign_bits + 2 * DCTSIZE);
  uint8x8_t signbits_row3 = vld1_u8(coef_sign_bits + 3 * DCTSIZE);
  uint8x8_t signbits_row4 = vld1_u8(coef_sign_bits + 4 * DCTSIZE);
  uint8x8_t signbits_row5 = vld1_u8(coef_sign_bits + 5 * DCTSIZE);
  uint8x8_t signbits_row6 = vld1_u8(coef_sign_bits + 6 * DCTSIZE);
  uint8x8_t signbits_row7 = vld1_u8(coef_sign_bits + 7 * DCTSIZE);

  signbits_row0 = vand_u8(signbits_row0, bitmap_mask);
  signbits_row1 = vand_u8(signbits_row1, bitmap_mask);
  signbits_row2 = vand_u8(signbits_row2, bitmap_mask);
  signbits_row3 = vand_u8(signbits_row3, bitmap_mask);
  signbits_row4 = vand_u8(signbits_row4, bitmap_mask);
  signbits_row5 = vand_u8(signbits_row5, bitmap_mask);
  signbits_row6 = vand_u8(signbits_row6, bitmap_mask);
  signbits_row7 = vand_u8(signbits_row7, bitmap_mask);

  bitmap_rows_01 = vpadd_u8(signbits_row0, signbits_row1);
  bitmap_rows_23 = vpadd_u8(signbits_row2, signbits_row3);
  bitmap_rows_45 = vpadd_u8(signbits_row4, signbits_row5);
  bitmap_rows_67 = vpadd_u8(signbits_row6, signbits_row7);
  bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
  bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
  bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
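  /* In bitmap_all, a set bit marks a negative coefficient.  The stored
   * signbits bitmap below is the inverse: a set bit marks a non-negative
   * coefficient.
   */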
#if defined(__aarch64__) || defined(_M_ARM64)
  /* Move bitmap to a 64-bit scalar register. */
  bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
  /* Store signbits bitmap. */
  bits[1] = ~bitmap;
#else
  /* Move bitmap to two 32-bit scalar registers. */
  bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
  bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
  /* Store signbits bitmap. */
  bits[2] = ~bitmap0;
  bits[3] = ~bitmap1;
#endif
  /* Construct bitmap to find EOB position (the index of the last coefficient
   * equal to 1 or -1)
   */
  uint8x8_t row0_eq1 = vld1_u8(coef_eq1_bits + 0 * DCTSIZE);
  uint8x8_t row1_eq1 = vld1_u8(coef_eq1_bits + 1 * DCTSIZE);
  uint8x8_t row2_eq1 = vld1_u8(coef_eq1_bits + 2 * DCTSIZE);
  uint8x8_t row3_eq1 = vld1_u8(coef_eq1_bits + 3 * DCTSIZE);
  uint8x8_t row4_eq1 = vld1_u8(coef_eq1_bits + 4 * DCTSIZE);
  uint8x8_t row5_eq1 = vld1_u8(coef_eq1_bits + 5 * DCTSIZE);
  uint8x8_t row6_eq1 = vld1_u8(coef_eq1_bits + 6 * DCTSIZE);
  uint8x8_t row7_eq1 = vld1_u8(coef_eq1_bits + 7 * DCTSIZE);
  row0_eq1 = vand_u8(row0_eq1, bitmap_mask);
  row1_eq1 = vand_u8(row1_eq1, bitmap_mask);
  row2_eq1 = vand_u8(row2_eq1, bitmap_mask);
  row3_eq1 = vand_u8(row3_eq1, bitmap_mask);
  row4_eq1 = vand_u8(row4_eq1, bitmap_mask);
  row5_eq1 = vand_u8(row5_eq1, bitmap_mask);
  row6_eq1 = vand_u8(row6_eq1, bitmap_mask);
  row7_eq1 = vand_u8(row7_eq1, bitmap_mask);

  bitmap_rows_01 = vpadd_u8(row0_eq1, row1_eq1);
  bitmap_rows_23 = vpadd_u8(row2_eq1, row3_eq1);
  bitmap_rows_45 = vpadd_u8(row4_eq1, row5_eq1);
  bitmap_rows_67 = vpadd_u8(row6_eq1, row7_eq1);
  bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
  bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
  bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
#if defined(__aarch64__) || defined(_M_ARM64)
  /* Move bitmap to a 64-bit scalar register. */
  bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);

  /* Return EOB position. */
  if (bitmap == 0) {
    /* EOB position is defined to be 0 if all coefficients != 1. */
    return 0;
  } else {
    return 63 - BUILTIN_CLZLL(bitmap);
  }
#else
  /* Move bitmap to two 32-bit scalar registers. */
  bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
  bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);

  /* Return EOB position. */
  if (bitmap0 == 0 && bitmap1 == 0) {
    return 0;
  } else if (bitmap1 != 0) {
    return 63 - BUILTIN_CLZ(bitmap1);
  } else {
    return 31 - BUILTIN_CLZ(bitmap0);
  }
#endif
}