4 * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5 * Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies).
6 * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, D. R. Commander.
7 * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
9 * Based on the x86 SIMD extension for IJG JPEG library,
10 * Copyright (C) 1999-2006, MIYASAKA Masaru.
11 * For conditions of distribution and use, see copyright notice in jsimdext.inc
13 * This file contains the interface between the "normal" portions
14 * of the library and the SIMD implementations when running on a
15 * 64-bit ARM architecture.
18 #define JPEG_INTERNALS
19 #include "../../jinclude.h"
20 #include "../../jpeglib.h"
21 #include "../../jsimd.h"
22 #include "../../jdct.h"
23 #include "../../jsimddct.h"
30 #define JSIMD_FASTLD3 1
31 #define JSIMD_FASTST3 2
32 #define JSIMD_FASTTBL 4
34 static unsigned int simd_support = ~0;
35 static unsigned int simd_huffman = 1;
36 static unsigned int simd_features = JSIMD_FASTLD3 | JSIMD_FASTST3 |
39 #if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
41 #define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
/* Tests whether the text in buffer starts with the given field name and,
   after that name, contains value as a whitespace-delimited word.
   NOTE(review): the local declarations, branch bodies, and return statements
   are not visible in this excerpt; the callers in this file use the result
   as a truth value. */
44 check_cpuinfo(char *buffer, const char *field, char *value)
/* Prefix comparison: does the buffer begin with the field name?  (The
   branch body is not shown.) */
50 if (strncmp(buffer, field, strlen(field)) != 0)
/* Step over the field name, then over any whitespace that follows it.
   NOTE(review): isspace() receives a plain char here and below; casting the
   argument to unsigned char would avoid undefined behavior for negative
   character values. */
52 buffer += strlen(field);
53 while (isspace(*buffer))
56 /* Check if 'value' is present in the buffer as a separate word */
57 while ((p = strstr(buffer, value))) {
/* True when the match is not at the start of the remaining text and the
   character just before it is not whitespace, i.e. the match is the tail of
   a longer word. */
58 if (p > buffer && !isspace(*(p - 1))) {
/* True when the character at p is neither NUL nor whitespace.  Presumably p
   has been advanced past the match by a statement not shown, so that this
   tests the character following the word -- TODO confirm. */
63 if (*p != 0 && !isspace(*p)) {
/* Reads /proc/cpuinfo one line at a time through a heap buffer of bufsize
   bytes and adjusts the file-scope tuning flags according to the CPU part
   field of each line.
   NOTE(review): the allocation and fopen() failure checks, the cleanup, and
   the return statements are not visible in this excerpt; the caller in this
   file keeps calling this function while the result is zero. */
73 parse_proc_cpuinfo(int bufsize)
/* Line buffer whose size is chosen by the caller. */
75 char *buffer = (char *)malloc(bufsize);
81 fd = fopen("/proc/cpuinfo", "r");
/* Each fgets() call stores at most bufsize - 1 characters of a line. */
83 while (fgets(buffer, bufsize, fd)) {
/* No newline in the buffer and not at end of file: the line was longer than
   the buffer could hold. */
84 if (!strchr(buffer, '\n') && !feof(fd)) {
85 /* "impossible" happened - insufficient size of the buffer! */
/* CPU part 0xd03 or 0xd07: clear the JSIMD_FASTTBL feature bit. */
90 if (check_cpuinfo(buffer, "CPU part", "0xd03") ||
91 check_cpuinfo(buffer, "CPU part", "0xd07"))
92 /* The Cortex-A53 has a slow tbl implementation. We can gain a few
93 percent speedup by disabling the use of that instruction. The
94 speedup on Cortex-A57 is more subtle but still measurable. */
95 simd_features &= ~JSIMD_FASTTBL;
/* CPU part 0x0a1 is handled by the branch that follows. */
96 else if (check_cpuinfo(buffer, "CPU part", "0x0a1"))
97 /* The SIMD version of Huffman encoding is slower than the C version on
98 Cavium ThunderX. Also, ld3 and st3 are abysmally slow on that
100 simd_huffman = simd_features = 0;
111 * Check what SIMD accelerations are supported.
113 * FIXME: This code is racy in a multi-threaded environment.
117 * ARMv8 architectures support NEON extensions by default.
118 * It is no longer optional as it was with ARMv7.
128 #if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
129 int bufsize = 1024; /* an initial guess for the line buffer size limit */
132 if (simd_support != ~0U)
137 simd_support |= JSIMD_NEON;
138 #if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
139 while (!parse_proc_cpuinfo(bufsize)) {
141 if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
147 /* Force different settings through environment variables */
148 env = getenv("JSIMD_FORCENEON");
149 if ((env != NULL) && (strcmp(env, "1") == 0))
150 simd_support = JSIMD_NEON;
151 env = getenv("JSIMD_FORCENONE");
152 if ((env != NULL) && (strcmp(env, "1") == 0))
154 env = getenv("JSIMD_NOHUFFENC");
155 if ((env != NULL) && (strcmp(env, "1") == 0))
157 env = getenv("JSIMD_FASTLD3");
158 if ((env != NULL) && (strcmp(env, "1") == 0))
159 simd_features |= JSIMD_FASTLD3;
160 if ((env != NULL) && (strcmp(env, "0") == 0))
161 simd_features &= ~JSIMD_FASTLD3;
162 env = getenv("JSIMD_FASTST3");
163 if ((env != NULL) && (strcmp(env, "1") == 0))
164 simd_features |= JSIMD_FASTST3;
165 if ((env != NULL) && (strcmp(env, "0") == 0))
166 simd_features &= ~JSIMD_FASTST3;
/* Capability query with no parameters.  The visible checks are three
   compile-time properties (8-bit samples, 4-byte JDIMENSION, RGB pixel size
   of 3 or 4) followed by a test of the JSIMD_NEON bit in simd_support.
   NOTE(review): the return statements are elided from this excerpt;
   presumably a nonzero result means the NEON RGB-to-YCC converter may be
   used -- confirm. */
171 jsimd_can_rgb_ycc(void)
175 /* The code is optimised for these values only */
176 if (BITS_IN_JSAMPLE != 8)
178 if (sizeof(JDIMENSION) != 4)
180 if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
/* Run-time condition: NEON must be flagged in simd_support. */
183 if (simd_support & JSIMD_NEON)
190 jsimd_can_rgb_gray(void)
/* Capability query with no parameters.  The visible checks are three
   compile-time properties (8-bit samples, 4-byte JDIMENSION, RGB pixel size
   of 3 or 4) followed by a test of the JSIMD_NEON bit in simd_support.
   NOTE(review): the return statements are elided from this excerpt;
   presumably a nonzero result means the NEON YCC-to-RGB converter may be
   used -- confirm. */
196 jsimd_can_ycc_rgb(void)
200 /* The code is optimised for these values only */
201 if (BITS_IN_JSAMPLE != 8)
203 if (sizeof(JDIMENSION) != 4)
205 if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
/* Run-time condition: NEON must be flagged in simd_support. */
208 if (simd_support & JSIMD_NEON)
/* Capability query with no parameters.  The visible checks are two
   compile-time properties (8-bit samples, 4-byte JDIMENSION) followed by a
   test of the JSIMD_NEON bit in simd_support.
   NOTE(review): the return statements are elided from this excerpt. */
215 jsimd_can_ycc_rgb565(void)
219 /* The code is optimised for these values only */
220 if (BITS_IN_JSAMPLE != 8)
222 if (sizeof(JDIMENSION) != 4)
/* Run-time condition: NEON must be flagged in simd_support. */
225 if (simd_support & JSIMD_NEON)
/* Picks a NEON conversion routine according to cinfo->in_color_space and
   calls it once with cinfo->image_width, input_buf, output_buf, output_row
   and num_rows.  For the extrgb and extbgr routines, the _slowld3 variant is
   selected when the JSIMD_FASTLD3 bit of simd_features is clear.
   NOTE(review): the case labels, break statements, and the rest of the
   parameter list are elided from this excerpt, so the mapping from color
   space to routine cannot be read off here. */
232 jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
233 JSAMPIMAGE output_buf, JDIMENSION output_row,
/* Pointer to the routine chosen by the switch below. */
236 void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
238 switch (cinfo->in_color_space) {
/* extrgb: fast or slow ld3 variant. */
240 if (simd_features & JSIMD_FASTLD3)
241 neonfct = jsimd_extrgb_ycc_convert_neon;
243 neonfct = jsimd_extrgb_ycc_convert_neon_slowld3;
247 neonfct = jsimd_extrgbx_ycc_convert_neon;
/* extbgr: fast or slow ld3 variant. */
250 if (simd_features & JSIMD_FASTLD3)
251 neonfct = jsimd_extbgr_ycc_convert_neon;
253 neonfct = jsimd_extbgr_ycc_convert_neon_slowld3;
257 neonfct = jsimd_extbgrx_ycc_convert_neon;
261 neonfct = jsimd_extxbgr_ycc_convert_neon;
265 neonfct = jsimd_extxrgb_ycc_convert_neon;
/* Second selection of the extrgb routines; presumably the default case --
   TODO confirm. */
268 if (simd_features & JSIMD_FASTLD3)
269 neonfct = jsimd_extrgb_ycc_convert_neon;
271 neonfct = jsimd_extrgb_ycc_convert_neon_slowld3;
/* Single call through the selected routine. */
275 neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
279 jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
280 JSAMPIMAGE output_buf, JDIMENSION output_row,
/* Picks a NEON conversion routine according to cinfo->out_color_space and
   calls it once with cinfo->output_width, input_buf, input_row, output_buf
   and num_rows.  For the extrgb and extbgr routines, the _slowst3 variant is
   selected when the JSIMD_FASTST3 bit of simd_features is clear.
   NOTE(review): the case labels, break statements, and the rest of the
   parameter list are elided from this excerpt, so the mapping from color
   space to routine cannot be read off here. */
286 jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
287 JDIMENSION input_row, JSAMPARRAY output_buf,
/* Pointer to the routine chosen by the switch below. */
290 void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
292 switch (cinfo->out_color_space) {
/* extrgb: fast or slow st3 variant. */
294 if (simd_features & JSIMD_FASTST3)
295 neonfct = jsimd_ycc_extrgb_convert_neon;
297 neonfct = jsimd_ycc_extrgb_convert_neon_slowst3;
301 neonfct = jsimd_ycc_extrgbx_convert_neon;
/* extbgr: fast or slow st3 variant. */
304 if (simd_features & JSIMD_FASTST3)
305 neonfct = jsimd_ycc_extbgr_convert_neon;
307 neonfct = jsimd_ycc_extbgr_convert_neon_slowst3;
311 neonfct = jsimd_ycc_extbgrx_convert_neon;
315 neonfct = jsimd_ycc_extxbgr_convert_neon;
319 neonfct = jsimd_ycc_extxrgb_convert_neon;
/* Second selection of the extrgb routines; presumably the default case --
   TODO confirm. */
322 if (simd_features & JSIMD_FASTST3)
323 neonfct = jsimd_ycc_extrgb_convert_neon;
325 neonfct = jsimd_ycc_extrgb_convert_neon_slowst3;
/* Single call through the selected routine. */
329 neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
/* Forwards to jsimd_ycc_rgb565_convert_neon() with cinfo->output_width and
   the input_buf, input_row, output_buf and num_rows arguments.  No run-time
   feature test is visible in this wrapper.
   NOTE(review): the end of the parameter list is elided from this
   excerpt. */
333 jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
334 JDIMENSION input_row, JSAMPARRAY output_buf,
337 jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
338 output_buf, num_rows);
/* Capability query with no parameters.  The visible checks are two
   compile-time properties (8-bit samples, 4-byte JDIMENSION) followed by a
   test of the JSIMD_NEON bit in simd_support.
   NOTE(review): the return statements, and possibly further checks, are
   elided from this excerpt. */
342 jsimd_can_h2v2_downsample(void)
346 /* The code is optimised for these values only */
347 if (BITS_IN_JSAMPLE != 8)
351 if (sizeof(JDIMENSION) != 4)
/* Run-time condition: NEON must be flagged in simd_support. */
354 if (simd_support & JSIMD_NEON)
/* Capability query with no parameters.  The visible checks are two
   compile-time properties (8-bit samples, 4-byte JDIMENSION) followed by a
   test of the JSIMD_NEON bit in simd_support.
   NOTE(review): the return statements, and possibly further checks, are
   elided from this excerpt. */
361 jsimd_can_h2v1_downsample(void)
365 /* The code is optimised for these values only */
366 if (BITS_IN_JSAMPLE != 8)
370 if (sizeof(JDIMENSION) != 4)
/* Run-time condition: NEON must be flagged in simd_support. */
373 if (simd_support & JSIMD_NEON)
/* Forwards to jsimd_h2v2_downsample_neon().  The image width and maximum
   vertical sampling factor are read from cinfo, the vertical sampling factor
   and width in blocks from compptr, and input_data and output_data are
   passed through unchanged. */
380 jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
381 JSAMPARRAY input_data, JSAMPARRAY output_data)
383 jsimd_h2v2_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
384 compptr->v_samp_factor, compptr->width_in_blocks,
385 input_data, output_data);
/* Forwards to jsimd_h2v1_downsample_neon().  The image width and maximum
   vertical sampling factor are read from cinfo, the vertical sampling factor
   and width in blocks from compptr, and input_data and output_data are
   passed through unchanged. */
389 jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
390 JSAMPARRAY input_data, JSAMPARRAY output_data)
392 jsimd_h2v1_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
393 compptr->v_samp_factor, compptr->width_in_blocks,
394 input_data, output_data);
398 jsimd_can_h2v2_upsample(void)
404 jsimd_can_h2v1_upsample(void)
410 jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
411 JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
416 jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
417 JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
422 jsimd_can_h2v2_fancy_upsample(void)
428 jsimd_can_h2v1_fancy_upsample(void)
434 jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
435 JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
440 jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
441 JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
446 jsimd_can_h2v2_merged_upsample(void)
452 jsimd_can_h2v1_merged_upsample(void)
458 jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
459 JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
464 jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
465 JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
/* Capability query with no parameters.  The visible checks are three
   compile-time properties (8-bit samples, 4-byte JDIMENSION, 2-byte DCTELEM)
   followed by a test of the JSIMD_NEON bit in simd_support.
   NOTE(review): the return statements, and possibly further checks, are
   elided from this excerpt. */
470 jsimd_can_convsamp(void)
474 /* The code is optimised for these values only */
477 if (BITS_IN_JSAMPLE != 8)
479 if (sizeof(JDIMENSION) != 4)
481 if (sizeof(DCTELEM) != 2)
/* Run-time condition: NEON must be flagged in simd_support. */
484 if (simd_support & JSIMD_NEON)
491 jsimd_can_convsamp_float(void)
/* Forwards sample_data, start_col and workspace to jsimd_convsamp_neon().
   NOTE(review): the end of the parameter list is elided from this
   excerpt. */
497 jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
500 jsimd_convsamp_neon(sample_data, start_col, workspace);
504 jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
505 FAST_FLOAT *workspace)
/* Capability query with no parameters.  The visible checks are one
   compile-time property (2-byte DCTELEM) followed by a test of the
   JSIMD_NEON bit in simd_support.
   NOTE(review): the return statements, and possibly further checks, are
   elided from this excerpt. */
510 jsimd_can_fdct_islow(void)
514 /* The code is optimised for these values only */
517 if (sizeof(DCTELEM) != 2)
/* Run-time condition: NEON must be flagged in simd_support. */
520 if (simd_support & JSIMD_NEON)
/* Capability query with no parameters.  The visible checks are one
   compile-time property (2-byte DCTELEM) followed by a test of the
   JSIMD_NEON bit in simd_support.
   NOTE(review): the return statements, and possibly further checks, are
   elided from this excerpt. */
527 jsimd_can_fdct_ifast(void)
531 /* The code is optimised for these values only */
534 if (sizeof(DCTELEM) != 2)
/* Run-time condition: NEON must be flagged in simd_support. */
537 if (simd_support & JSIMD_NEON)
544 jsimd_can_fdct_float(void)
/* Forwards the data pointer to jsimd_fdct_islow_neon(). */
550 jsimd_fdct_islow(DCTELEM *data)
552 jsimd_fdct_islow_neon(data);
/* Forwards the data pointer to jsimd_fdct_ifast_neon(). */
556 jsimd_fdct_ifast(DCTELEM *data)
558 jsimd_fdct_ifast_neon(data);
562 jsimd_fdct_float(FAST_FLOAT *data)
/* Capability query with no parameters.  The visible checks are two
   compile-time properties (2-byte JCOEF, 2-byte DCTELEM) followed by a test
   of the JSIMD_NEON bit in simd_support.
   NOTE(review): the return statements, and possibly further checks, are
   elided from this excerpt. */
567 jsimd_can_quantize(void)
571 /* The code is optimised for these values only */
574 if (sizeof(JCOEF) != 2)
576 if (sizeof(DCTELEM) != 2)
/* Run-time condition: NEON must be flagged in simd_support. */
579 if (simd_support & JSIMD_NEON)
586 jsimd_can_quantize_float(void)
/* Forwards coef_block, divisors and workspace to jsimd_quantize_neon(). */
592 jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
594 jsimd_quantize_neon(coef_block, divisors, workspace);
598 jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
599 FAST_FLOAT *workspace)
/* Capability query with no parameters.  The visible checks are four
   compile-time properties (2-byte JCOEF, 8-bit samples, 4-byte JDIMENSION,
   2-byte ISLOW_MULT_TYPE) followed by a test of the JSIMD_NEON bit in
   simd_support.
   NOTE(review): the return statements, and possibly further checks, are
   elided from this excerpt. */
604 jsimd_can_idct_2x2(void)
608 /* The code is optimised for these values only */
611 if (sizeof(JCOEF) != 2)
613 if (BITS_IN_JSAMPLE != 8)
615 if (sizeof(JDIMENSION) != 4)
617 if (sizeof(ISLOW_MULT_TYPE) != 2)
/* Run-time condition: NEON must be flagged in simd_support. */
620 if (simd_support & JSIMD_NEON)
/* Capability query with no parameters.  The visible checks are four
   compile-time properties (2-byte JCOEF, 8-bit samples, 4-byte JDIMENSION,
   2-byte ISLOW_MULT_TYPE) followed by a test of the JSIMD_NEON bit in
   simd_support.
   NOTE(review): the return statements, and possibly further checks, are
   elided from this excerpt. */
627 jsimd_can_idct_4x4(void)
631 /* The code is optimised for these values only */
634 if (sizeof(JCOEF) != 2)
636 if (BITS_IN_JSAMPLE != 8)
638 if (sizeof(JDIMENSION) != 4)
640 if (sizeof(ISLOW_MULT_TYPE) != 2)
/* Run-time condition: NEON must be flagged in simd_support. */
643 if (simd_support & JSIMD_NEON)
/* Forwards to jsimd_idct_2x2_neon() with the dct_table of compptr plus
   coef_block, output_buf and output_col.  The cinfo parameter is not used
   in the visible code. */
650 jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
651 JCOEFPTR coef_block, JSAMPARRAY output_buf,
652 JDIMENSION output_col)
654 jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf, output_col);
/* Forwards to jsimd_idct_4x4_neon() with the dct_table of compptr plus
   coef_block, output_buf and output_col.  The cinfo parameter is not used
   in the visible code. */
658 jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
659 JCOEFPTR coef_block, JSAMPARRAY output_buf,
660 JDIMENSION output_col)
662 jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf, output_col);
/* Capability query with no parameters.  The visible checks are four
   compile-time properties (2-byte JCOEF, 8-bit samples, 4-byte JDIMENSION,
   2-byte ISLOW_MULT_TYPE) followed by a test of the JSIMD_NEON bit in
   simd_support.
   NOTE(review): the return statements, and possibly further checks, are
   elided from this excerpt. */
666 jsimd_can_idct_islow(void)
670 /* The code is optimised for these values only */
673 if (sizeof(JCOEF) != 2)
675 if (BITS_IN_JSAMPLE != 8)
677 if (sizeof(JDIMENSION) != 4)
679 if (sizeof(ISLOW_MULT_TYPE) != 2)
/* Run-time condition: NEON must be flagged in simd_support. */
682 if (simd_support & JSIMD_NEON)
/* Capability query with no parameters.  The visible checks are five
   compile-time properties (2-byte JCOEF, 8-bit samples, 4-byte JDIMENSION,
   2-byte IFAST_MULT_TYPE, IFAST_SCALE_BITS equal to 2) followed by a test of
   the JSIMD_NEON bit in simd_support.
   NOTE(review): the return statements, and possibly further checks, are
   elided from this excerpt. */
689 jsimd_can_idct_ifast(void)
693 /* The code is optimised for these values only */
696 if (sizeof(JCOEF) != 2)
698 if (BITS_IN_JSAMPLE != 8)
700 if (sizeof(JDIMENSION) != 4)
702 if (sizeof(IFAST_MULT_TYPE) != 2)
704 if (IFAST_SCALE_BITS != 2)
/* Run-time condition: NEON must be flagged in simd_support. */
707 if (simd_support & JSIMD_NEON)
714 jsimd_can_idct_float(void)
/* Forwards to jsimd_idct_islow_neon() with the dct_table of compptr,
   coef_block and output_buf.  The cinfo parameter is not used in the
   visible code.
   NOTE(review): the end of the call is on a line elided from this excerpt;
   presumably output_col is the final argument -- TODO confirm. */
720 jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
721 JCOEFPTR coef_block, JSAMPARRAY output_buf,
722 JDIMENSION output_col)
724 jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
/* Forwards to jsimd_idct_ifast_neon() with the dct_table of compptr,
   coef_block and output_buf.  The cinfo parameter is not used in the
   visible code.
   NOTE(review): the end of the call is on a line elided from this excerpt;
   presumably output_col is the final argument -- TODO confirm. */
729 jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
730 JCOEFPTR coef_block, JSAMPARRAY output_buf,
731 JDIMENSION output_col)
733 jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
738 jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
739 JCOEFPTR coef_block, JSAMPARRAY output_buf,
740 JDIMENSION output_col)
/* Capability query with no parameters.  The visible checks are one
   compile-time property (2-byte JCOEF) followed by a run-time test that
   needs both the JSIMD_NEON bit in simd_support and a nonzero simd_huffman.
   NOTE(review): the return statements, and possibly further checks, are
   elided from this excerpt. */
745 jsimd_can_huff_encode_one_block(void)
751 if (sizeof(JCOEF) != 2)
/* The bitwise & binds tighter than &&, so this reads as
   (simd_support & JSIMD_NEON) && simd_huffman. */
754 if (simd_support & JSIMD_NEON && simd_huffman)
/* Encodes through one of two NEON routines and returns that routine's
   result: jsimd_huff_encode_one_block_neon() when the JSIMD_FASTTBL bit of
   simd_features is set, otherwise the _slowtbl variant.  The state, buffer,
   block, last_dc_val, dctbl and actbl arguments are passed to the slowtbl
   call unchanged.
   NOTE(review): the tail of the first call and the else keyword are on
   lines elided from this excerpt; presumably the first call takes the same
   argument list -- TODO confirm. */
761 jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
762 int last_dc_val, c_derived_tbl *dctbl,
763 c_derived_tbl *actbl)
765 if (simd_features & JSIMD_FASTTBL)
766 return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
769 return jsimd_huff_encode_one_block_neon_slowtbl(state, buffer, block,
770 last_dc_val, dctbl, actbl);
774 jsimd_can_encode_mcu_AC_first_prepare(void)
780 jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
781 const int *jpeg_natural_order_start, int Sl,
782 int Al, JCOEF *values, size_t *zerobits)
787 jsimd_can_encode_mcu_AC_refine_prepare(void)
793 jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
794 const int *jpeg_natural_order_start, int Sl,
795 int Al, JCOEF *absvalues, size_t *bits)