simd/arm/jquanti-neon.c

   1 /*
   2  * jquanti-neon.c - sample data conversion and quantization (Arm Neon)
   3  *
   4  * Copyright (C) 2020-2021, Arm Limited.  All Rights Reserved.
   5  *
   6  * This software is provided 'as-is', without any express or implied
   7  * warranty.  In no event will the authors be held liable for any damages
   8  * arising from the use of this software.
   9  *
  10  * Permission is granted to anyone to use this software for any purpose,
  11  * including commercial applications, and to alter it and redistribute it
  12  * freely, subject to the following restrictions:
  13  *
  14  * 1. The origin of this software must not be misrepresented; you must not
  15  *    claim that you wrote the original software. If you use this software
  16  *    in a product, an acknowledgment in the product documentation would be
  17  *    appreciated but is not required.
  18  * 2. Altered source versions must be plainly marked as such, and must not be
  19  *    misrepresented as being the original software.
  20  * 3. This notice may not be removed or altered from any source distribution.
  21  */
  22
  23 #define JPEG_INTERNALS
  24 #include "../../jinclude.h"
  25 #include "../../jpeglib.h"
  26 #include "../../jsimd.h"
  27 #include "../../jdct.h"
  28 #include "../../jsimddct.h"
  29 #include "../jsimd.h"
  30
  31 #include <arm_neon.h>
  32
  33
  34 /* After downsampling, the resulting sample values are in the range [0, 255],
  35  * but the Discrete Cosine Transform (DCT) operates on values centered around
  36  * 0.
  37  *
  38  * To prepare sample values for the DCT, load samples into a DCT workspace,
  39  * subtracting CENTERJSAMPLE (128).  The samples, now in the range [-128, 127],
  40  * are also widened from 8- to 16-bit.
  41  *
  42  * The equivalent scalar C function convsamp() can be found in jcdctmgr.c.
  43  */
  44
  45 void jsimd_convsamp_neon(JSAMPARRAY sample_data, JDIMENSION start_col,
  46                          DCTELEM *workspace)
  47 {
  48   uint8x8_t samp_row0 = vld1_u8(sample_data[0] + start_col);
  49   uint8x8_t samp_row1 = vld1_u8(sample_data[1] + start_col);
  50   uint8x8_t samp_row2 = vld1_u8(sample_data[2] + start_col);
  51   uint8x8_t samp_row3 = vld1_u8(sample_data[3] + start_col);
  52   uint8x8_t samp_row4 = vld1_u8(sample_data[4] + start_col);
  53   uint8x8_t samp_row5 = vld1_u8(sample_data[5] + start_col);
  54   uint8x8_t samp_row6 = vld1_u8(sample_data[6] + start_col);
  55   uint8x8_t samp_row7 = vld1_u8(sample_data[7] + start_col);
  56
  57   int16x8_t row0 =
  58     vreinterpretq_s16_u16(vsubl_u8(samp_row0, vdup_n_u8(CENTERJSAMPLE)));
  59   int16x8_t row1 =
  60     vreinterpretq_s16_u16(vsubl_u8(samp_row1, vdup_n_u8(CENTERJSAMPLE)));
  61   int16x8_t row2 =
  62     vreinterpretq_s16_u16(vsubl_u8(samp_row2, vdup_n_u8(CENTERJSAMPLE)));
  63   int16x8_t row3 =
  64     vreinterpretq_s16_u16(vsubl_u8(samp_row3, vdup_n_u8(CENTERJSAMPLE)));
  65   int16x8_t row4 =
  66     vreinterpretq_s16_u16(vsubl_u8(samp_row4, vdup_n_u8(CENTERJSAMPLE)));
  67   int16x8_t row5 =
  68     vreinterpretq_s16_u16(vsubl_u8(samp_row5, vdup_n_u8(CENTERJSAMPLE)));
  69   int16x8_t row6 =
  70     vreinterpretq_s16_u16(vsubl_u8(samp_row6, vdup_n_u8(CENTERJSAMPLE)));
  71   int16x8_t row7 =
  72     vreinterpretq_s16_u16(vsubl_u8(samp_row7, vdup_n_u8(CENTERJSAMPLE)));
  73
  74   vst1q_s16(workspace + 0 * DCTSIZE, row0);
  75   vst1q_s16(workspace + 1 * DCTSIZE, row1);
  76   vst1q_s16(workspace + 2 * DCTSIZE, row2);
  77   vst1q_s16(workspace + 3 * DCTSIZE, row3);
  78   vst1q_s16(workspace + 4 * DCTSIZE, row4);
  79   vst1q_s16(workspace + 5 * DCTSIZE, row5);
  80   vst1q_s16(workspace + 6 * DCTSIZE, row6);
  81   vst1q_s16(workspace + 7 * DCTSIZE, row7);
  82 }
  83
  84
  85 /* After the DCT, the resulting array of coefficient values needs to be divided
  86  * by an array of quantization values.
  87  *
  88  * To avoid a slow division operation, the DCT coefficients are multiplied by
  89  * the (scaled) reciprocals of the quantization values and then right-shifted.
  90  *
  91  * The equivalent scalar C function quantize() can be found in jcdctmgr.c.
  92  */
  93
  94 void jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
  95                          DCTELEM *workspace)
  96 {
  97   JCOEFPTR out_ptr = coef_block;
  98   UDCTELEM *recip_ptr = (UDCTELEM *)divisors;
  99   UDCTELEM *corr_ptr = (UDCTELEM *)divisors + DCTSIZE2;
 100   DCTELEM *shift_ptr = divisors + 3 * DCTSIZE2;
 101   int i;
 102
 103 #if defined(__clang__) && (defined(__aarch64__) || defined(_M_ARM64))
 104 #pragma unroll
 105 #endif
 106   for (i = 0; i < DCTSIZE; i += DCTSIZE / 2) {
 107     /* Load DCT coefficients. */
 108     int16x8_t row0 = vld1q_s16(workspace + (i + 0) * DCTSIZE);
 109     int16x8_t row1 = vld1q_s16(workspace + (i + 1) * DCTSIZE);
 110     int16x8_t row2 = vld1q_s16(workspace + (i + 2) * DCTSIZE);
 111     int16x8_t row3 = vld1q_s16(workspace + (i + 3) * DCTSIZE);
 112     /* Load reciprocals of quantization values. */
 113     uint16x8_t recip0 = vld1q_u16(recip_ptr + (i + 0) * DCTSIZE);
 114     uint16x8_t recip1 = vld1q_u16(recip_ptr + (i + 1) * DCTSIZE);
 115     uint16x8_t recip2 = vld1q_u16(recip_ptr + (i + 2) * DCTSIZE);
 116     uint16x8_t recip3 = vld1q_u16(recip_ptr + (i + 3) * DCTSIZE);
 117     uint16x8_t corr0 = vld1q_u16(corr_ptr + (i + 0) * DCTSIZE);
 118     uint16x8_t corr1 = vld1q_u16(corr_ptr + (i + 1) * DCTSIZE);
 119     uint16x8_t corr2 = vld1q_u16(corr_ptr + (i + 2) * DCTSIZE);
 120     uint16x8_t corr3 = vld1q_u16(corr_ptr + (i + 3) * DCTSIZE);
 121     int16x8_t shift0 = vld1q_s16(shift_ptr + (i + 0) * DCTSIZE);
 122     int16x8_t shift1 = vld1q_s16(shift_ptr + (i + 1) * DCTSIZE);
 123     int16x8_t shift2 = vld1q_s16(shift_ptr + (i + 2) * DCTSIZE);
 124     int16x8_t shift3 = vld1q_s16(shift_ptr + (i + 3) * DCTSIZE);
 125
 126     /* Extract sign from coefficients. */
 127     int16x8_t sign_row0 = vshrq_n_s16(row0, 15);
 128     int16x8_t sign_row1 = vshrq_n_s16(row1, 15);
 129     int16x8_t sign_row2 = vshrq_n_s16(row2, 15);
 130     int16x8_t sign_row3 = vshrq_n_s16(row3, 15);
 131     /* Get absolute value of DCT coefficients. */
 132     uint16x8_t abs_row0 = vreinterpretq_u16_s16(vabsq_s16(row0));
 133     uint16x8_t abs_row1 = vreinterpretq_u16_s16(vabsq_s16(row1));
 134     uint16x8_t abs_row2 = vreinterpretq_u16_s16(vabsq_s16(row2));
 135     uint16x8_t abs_row3 = vreinterpretq_u16_s16(vabsq_s16(row3));
 136     /* Add correction. */
 137     abs_row0 = vaddq_u16(abs_row0, corr0);
 138     abs_row1 = vaddq_u16(abs_row1, corr1);
 139     abs_row2 = vaddq_u16(abs_row2, corr2);
 140     abs_row3 = vaddq_u16(abs_row3, corr3);
 141
 142     /* Multiply DCT coefficients by quantization reciprocals. */
 143     int32x4_t row0_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row0),
 144                                                        vget_low_u16(recip0)));
 145     int32x4_t row0_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row0),
 146                                                        vget_high_u16(recip0)));
 147     int32x4_t row1_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row1),
 148                                                        vget_low_u16(recip1)));
 149     int32x4_t row1_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row1),
 150                                                        vget_high_u16(recip1)));
 151     int32x4_t row2_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row2),
 152                                                        vget_low_u16(recip2)));
 153     int32x4_t row2_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row2),
 154                                                        vget_high_u16(recip2)));
 155     int32x4_t row3_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row3),
 156                                                        vget_low_u16(recip3)));
 157     int32x4_t row3_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row3),
 158                                                        vget_high_u16(recip3)));
 159     /* Narrow back to 16-bit. */
 160     row0 = vcombine_s16(vshrn_n_s32(row0_l, 16), vshrn_n_s32(row0_h, 16));
 161     row1 = vcombine_s16(vshrn_n_s32(row1_l, 16), vshrn_n_s32(row1_h, 16));
 162     row2 = vcombine_s16(vshrn_n_s32(row2_l, 16), vshrn_n_s32(row2_h, 16));
 163     row3 = vcombine_s16(vshrn_n_s32(row3_l, 16), vshrn_n_s32(row3_h, 16));
 164
 165     /* Since VSHR only supports an immediate as its second argument, negate the
 166      * shift value and shift left.
 167      */
 168     row0 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row0),
 169                                            vnegq_s16(shift0)));
 170     row1 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row1),
 171                                            vnegq_s16(shift1)));
 172     row2 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row2),
 173                                            vnegq_s16(shift2)));
 174     row3 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row3),
 175                                            vnegq_s16(shift3)));
 176
 177     /* Restore sign to original product. */
 178     row0 = veorq_s16(row0, sign_row0);
 179     row0 = vsubq_s16(row0, sign_row0);
 180     row1 = veorq_s16(row1, sign_row1);
 181     row1 = vsubq_s16(row1, sign_row1);
 182     row2 = veorq_s16(row2, sign_row2);
 183     row2 = vsubq_s16(row2, sign_row2);
 184     row3 = veorq_s16(row3, sign_row3);
 185     row3 = vsubq_s16(row3, sign_row3);
 186
 187     /* Store quantized coefficients to memory. */
 188     vst1q_s16(out_ptr + (i + 0) * DCTSIZE, row0);
 189     vst1q_s16(out_ptr + (i + 1) * DCTSIZE, row1);
 190     vst1q_s16(out_ptr + (i + 2) * DCTSIZE, row2);
 191     vst1q_s16(out_ptr + (i + 3) * DCTSIZE, row3);
 192   }
 193 }