/*
 * jquanti-neon.c - sample data conversion and quantization (Arm Neon)
 *
 * Copyright (C) 2020-2021, Arm Limited. All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */
23 #define JPEG_INTERNALS
24 #include "../../jinclude.h"
25 #include "../../jpeglib.h"
26 #include "../../jsimd.h"
27 #include "../../jdct.h"
28 #include "../../jsimddct.h"
34 /* After downsampling, the resulting sample values are in the range [0, 255],
35 * but the Discrete Cosine Transform (DCT) operates on values centered around
38 * To prepare sample values for the DCT, load samples into a DCT workspace,
39 * subtracting CENTERJSAMPLE (128). The samples, now in the range [-128, 127],
40 * are also widened from 8- to 16-bit.
42 * The equivalent scalar C function convsamp() can be found in jcdctmgr.c.
45 void jsimd_convsamp_neon(JSAMPARRAY sample_data, JDIMENSION start_col,
48 uint8x8_t samp_row0 = vld1_u8(sample_data[0] + start_col);
49 uint8x8_t samp_row1 = vld1_u8(sample_data[1] + start_col);
50 uint8x8_t samp_row2 = vld1_u8(sample_data[2] + start_col);
51 uint8x8_t samp_row3 = vld1_u8(sample_data[3] + start_col);
52 uint8x8_t samp_row4 = vld1_u8(sample_data[4] + start_col);
53 uint8x8_t samp_row5 = vld1_u8(sample_data[5] + start_col);
54 uint8x8_t samp_row6 = vld1_u8(sample_data[6] + start_col);
55 uint8x8_t samp_row7 = vld1_u8(sample_data[7] + start_col);
58 vreinterpretq_s16_u16(vsubl_u8(samp_row0, vdup_n_u8(CENTERJSAMPLE)));
60 vreinterpretq_s16_u16(vsubl_u8(samp_row1, vdup_n_u8(CENTERJSAMPLE)));
62 vreinterpretq_s16_u16(vsubl_u8(samp_row2, vdup_n_u8(CENTERJSAMPLE)));
64 vreinterpretq_s16_u16(vsubl_u8(samp_row3, vdup_n_u8(CENTERJSAMPLE)));
66 vreinterpretq_s16_u16(vsubl_u8(samp_row4, vdup_n_u8(CENTERJSAMPLE)));
68 vreinterpretq_s16_u16(vsubl_u8(samp_row5, vdup_n_u8(CENTERJSAMPLE)));
70 vreinterpretq_s16_u16(vsubl_u8(samp_row6, vdup_n_u8(CENTERJSAMPLE)));
72 vreinterpretq_s16_u16(vsubl_u8(samp_row7, vdup_n_u8(CENTERJSAMPLE)));
74 vst1q_s16(workspace + 0 * DCTSIZE, row0);
75 vst1q_s16(workspace + 1 * DCTSIZE, row1);
76 vst1q_s16(workspace + 2 * DCTSIZE, row2);
77 vst1q_s16(workspace + 3 * DCTSIZE, row3);
78 vst1q_s16(workspace + 4 * DCTSIZE, row4);
79 vst1q_s16(workspace + 5 * DCTSIZE, row5);
80 vst1q_s16(workspace + 6 * DCTSIZE, row6);
81 vst1q_s16(workspace + 7 * DCTSIZE, row7);
85 /* After the DCT, the resulting array of coefficient values needs to be divided
86 * by an array of quantization values.
88 * To avoid a slow division operation, the DCT coefficients are multiplied by
89 * the (scaled) reciprocals of the quantization values and then right-shifted.
91 * The equivalent scalar C function quantize() can be found in jcdctmgr.c.
94 void jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
97 JCOEFPTR out_ptr = coef_block;
98 UDCTELEM *recip_ptr = (UDCTELEM *)divisors;
99 UDCTELEM *corr_ptr = (UDCTELEM *)divisors + DCTSIZE2;
100 DCTELEM *shift_ptr = divisors + 3 * DCTSIZE2;
103 #if defined(__clang__) && (defined(__aarch64__) || defined(_M_ARM64))
106 for (i = 0; i < DCTSIZE; i += DCTSIZE / 2) {
107 /* Load DCT coefficients. */
108 int16x8_t row0 = vld1q_s16(workspace + (i + 0) * DCTSIZE);
109 int16x8_t row1 = vld1q_s16(workspace + (i + 1) * DCTSIZE);
110 int16x8_t row2 = vld1q_s16(workspace + (i + 2) * DCTSIZE);
111 int16x8_t row3 = vld1q_s16(workspace + (i + 3) * DCTSIZE);
112 /* Load reciprocals of quantization values. */
113 uint16x8_t recip0 = vld1q_u16(recip_ptr + (i + 0) * DCTSIZE);
114 uint16x8_t recip1 = vld1q_u16(recip_ptr + (i + 1) * DCTSIZE);
115 uint16x8_t recip2 = vld1q_u16(recip_ptr + (i + 2) * DCTSIZE);
116 uint16x8_t recip3 = vld1q_u16(recip_ptr + (i + 3) * DCTSIZE);
117 uint16x8_t corr0 = vld1q_u16(corr_ptr + (i + 0) * DCTSIZE);
118 uint16x8_t corr1 = vld1q_u16(corr_ptr + (i + 1) * DCTSIZE);
119 uint16x8_t corr2 = vld1q_u16(corr_ptr + (i + 2) * DCTSIZE);
120 uint16x8_t corr3 = vld1q_u16(corr_ptr + (i + 3) * DCTSIZE);
121 int16x8_t shift0 = vld1q_s16(shift_ptr + (i + 0) * DCTSIZE);
122 int16x8_t shift1 = vld1q_s16(shift_ptr + (i + 1) * DCTSIZE);
123 int16x8_t shift2 = vld1q_s16(shift_ptr + (i + 2) * DCTSIZE);
124 int16x8_t shift3 = vld1q_s16(shift_ptr + (i + 3) * DCTSIZE);
126 /* Extract sign from coefficients. */
127 int16x8_t sign_row0 = vshrq_n_s16(row0, 15);
128 int16x8_t sign_row1 = vshrq_n_s16(row1, 15);
129 int16x8_t sign_row2 = vshrq_n_s16(row2, 15);
130 int16x8_t sign_row3 = vshrq_n_s16(row3, 15);
131 /* Get absolute value of DCT coefficients. */
132 uint16x8_t abs_row0 = vreinterpretq_u16_s16(vabsq_s16(row0));
133 uint16x8_t abs_row1 = vreinterpretq_u16_s16(vabsq_s16(row1));
134 uint16x8_t abs_row2 = vreinterpretq_u16_s16(vabsq_s16(row2));
135 uint16x8_t abs_row3 = vreinterpretq_u16_s16(vabsq_s16(row3));
136 /* Add correction. */
137 abs_row0 = vaddq_u16(abs_row0, corr0);
138 abs_row1 = vaddq_u16(abs_row1, corr1);
139 abs_row2 = vaddq_u16(abs_row2, corr2);
140 abs_row3 = vaddq_u16(abs_row3, corr3);
142 /* Multiply DCT coefficients by quantization reciprocals. */
143 int32x4_t row0_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row0),
144 vget_low_u16(recip0)));
145 int32x4_t row0_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row0),
146 vget_high_u16(recip0)));
147 int32x4_t row1_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row1),
148 vget_low_u16(recip1)));
149 int32x4_t row1_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row1),
150 vget_high_u16(recip1)));
151 int32x4_t row2_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row2),
152 vget_low_u16(recip2)));
153 int32x4_t row2_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row2),
154 vget_high_u16(recip2)));
155 int32x4_t row3_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row3),
156 vget_low_u16(recip3)));
157 int32x4_t row3_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row3),
158 vget_high_u16(recip3)));
159 /* Narrow back to 16-bit. */
160 row0 = vcombine_s16(vshrn_n_s32(row0_l, 16), vshrn_n_s32(row0_h, 16));
161 row1 = vcombine_s16(vshrn_n_s32(row1_l, 16), vshrn_n_s32(row1_h, 16));
162 row2 = vcombine_s16(vshrn_n_s32(row2_l, 16), vshrn_n_s32(row2_h, 16));
163 row3 = vcombine_s16(vshrn_n_s32(row3_l, 16), vshrn_n_s32(row3_h, 16));
165 /* Since VSHR only supports an immediate as its second argument, negate the
166 * shift value and shift left.
168 row0 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row0),
170 row1 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row1),
172 row2 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row2),
174 row3 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row3),
177 /* Restore sign to original product. */
178 row0 = veorq_s16(row0, sign_row0);
179 row0 = vsubq_s16(row0, sign_row0);
180 row1 = veorq_s16(row1, sign_row1);
181 row1 = vsubq_s16(row1, sign_row1);
182 row2 = veorq_s16(row2, sign_row2);
183 row2 = vsubq_s16(row2, sign_row2);
184 row3 = veorq_s16(row3, sign_row3);
185 row3 = vsubq_s16(row3, sign_row3);
187 /* Store quantized coefficients to memory. */
188 vst1q_s16(out_ptr + (i + 0) * DCTSIZE, row0);
189 vst1q_s16(out_ptr + (i + 1) * DCTSIZE, row1);
190 vst1q_s16(out_ptr + (i + 2) * DCTSIZE, row2);
191 vst1q_s16(out_ptr + (i + 3) * DCTSIZE, row3);