4 * This file was part of the Independent JPEG Group's software:
5 * Copyright (C) 1994-1996, Thomas G. Lane.
6 * libjpeg-turbo Modifications:
7 * Copyright (C) 1999-2006, MIYASAKA Masaru.
8 * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
9 * Copyright (C) 2011, 2014-2015, 2022, D. R. Commander.
10 * For conditions of distribution and use, see the accompanying README.ijg
13 * This file contains the forward-DCT management logic.
14 * This code selects a particular DCT implementation to be used,
15 * and it performs related housekeeping chores including coefficient
19 #define JPEG_INTERNALS
22 #include "jdct.h" /* Private declarations for DCT subsystem */
26 /* Private subobject for this module */
28 typedef void (*forward_DCT_method_ptr) (DCTELEM *data);
29 typedef void (*float_DCT_method_ptr) (FAST_FLOAT *data);
31 typedef void (*convsamp_method_ptr) (_JSAMPARRAY sample_data,
34 typedef void (*float_convsamp_method_ptr) (_JSAMPARRAY sample_data,
36 FAST_FLOAT *workspace);
38 typedef void (*quantize_method_ptr) (JCOEFPTR coef_block, DCTELEM *divisors,
40 typedef void (*float_quantize_method_ptr) (JCOEFPTR coef_block,
42 FAST_FLOAT *workspace);
44 METHODDEF(void) quantize(JCOEFPTR, DCTELEM *, DCTELEM *);
47 struct jpeg_forward_dct pub; /* public fields */
49 /* Pointer to the DCT routine actually in use */
50 forward_DCT_method_ptr dct;
51 convsamp_method_ptr convsamp;
52 quantize_method_ptr quantize;
54 /* The actual post-DCT divisors --- not identical to the quant table
55 * entries, because of scaling (especially for an unnormalized DCT).
56 * Each table is given in normal array order.
58 DCTELEM *divisors[NUM_QUANT_TBLS];
60 /* work area for FDCT subroutine */
63 #ifdef DCT_FLOAT_SUPPORTED
64 /* Same as above for the floating-point case. */
65 float_DCT_method_ptr float_dct;
66 float_convsamp_method_ptr float_convsamp;
67 float_quantize_method_ptr float_quantize;
68 FAST_FLOAT *float_divisors[NUM_QUANT_TBLS];
69 FAST_FLOAT *float_workspace;
73 typedef my_fdct_controller *my_fdct_ptr;
76 #if BITS_IN_JSAMPLE == 8
79 * Find the highest bit in an integer through binary search.
92 if (!(val & 0xff00)) {
96 if (!(val & 0xf000)) {
100 if (!(val & 0xc000)) {
104 if (!(val & 0x8000)) {
114 * Compute values to do a division using reciprocal.
116 * This implementation is based on an algorithm described in
117 * "How to optimize for the Pentium family of microprocessors"
118 * (http://www.agner.org/assem/).
119 * More information about the basic algorithm can be found in
120 * the paper "Integer Division Using Reciprocals" by Robert Alverson.
122 * The basic idea is to replace x/d by x * d^-1. In order to store
123 * d^-1 with enough precision we shift it left a few places. It turns
124 * out that this algoright gives just enough precision, and also fits
127 * b = (the number of significant bits in divisor) - 1
128 * r = (word size) + b
131 * f will not be an integer for most cases, so we need to compensate
132 * for the rounding error introduced:
134 * no fractional part:
136 * result = input >> r
138 * fractional part of f < 0.5:
140 * round f down to nearest integer
141 * result = ((input + 1) * f) >> r
143 * fractional part of f > 0.5:
145 * round f up to nearest integer
146 * result = (input * f) >> r
148 * This is the original algorithm that gives truncated results. But we
149 * want properly rounded results, so we replace "input" with
150 * "input + divisor/2".
152 * In order to allow SIMD implementations we also tweak the values to
153 * allow the same calculation to be made at all times:
155 * dctbl[0] = f rounded to nearest integer
156 * dctbl[1] = divisor / 2 (+ 1 if fractional part of f < 0.5)
157 * dctbl[2] = 1 << ((word size) * 2 - r)
158 * dctbl[3] = r - (word size)
160 * dctbl[2] is for stupid instruction sets where the shift operation
161 * isn't member wise (e.g. MMX).
163 * The reason dctbl[2] and dctbl[3] reduce the shift with (word size)
164 * is that most SIMD implementations have a "multiply and store top
167 * Lastly, we store each of the values in their own table instead
168 * of in a consecutive manner, yet again in order to allow SIMD
173 compute_reciprocal(UINT16 divisor, DCTELEM *dtbl)
180 /* divisor == 1 means unquantized, so these reciprocal/correction/shift
181 * values will cause the C quantization algorithm to act like the
182 * identity function. Since only the C quantization algorithm is used in
183 * these cases, the scale value is irrelevant.
185 dtbl[DCTSIZE2 * 0] = (DCTELEM)1; /* reciprocal */
186 dtbl[DCTSIZE2 * 1] = (DCTELEM)0; /* correction */
187 dtbl[DCTSIZE2 * 2] = (DCTELEM)1; /* scale */
188 dtbl[DCTSIZE2 * 3] = -(DCTELEM)(sizeof(DCTELEM) * 8); /* shift */
192 b = flss(divisor) - 1;
193 r = sizeof(DCTELEM) * 8 + b;
195 fq = ((UDCTELEM2)1 << r) / divisor;
196 fr = ((UDCTELEM2)1 << r) % divisor;
198 c = divisor / 2; /* for rounding */
200 if (fr == 0) { /* divisor is power of two */
201 /* fq will be one bit too large to fit in DCTELEM, so adjust */
204 } else if (fr <= (divisor / 2U)) { /* fractional part is < 0.5 */
206 } else { /* fractional part is > 0.5 */
210 dtbl[DCTSIZE2 * 0] = (DCTELEM)fq; /* reciprocal */
211 dtbl[DCTSIZE2 * 1] = (DCTELEM)c; /* correction + roundfactor */
213 dtbl[DCTSIZE2 * 2] = (DCTELEM)(1 << (sizeof(DCTELEM) * 8 * 2 - r)); /* scale */
215 dtbl[DCTSIZE2 * 2] = 1;
217 dtbl[DCTSIZE2 * 3] = (DCTELEM)r - sizeof(DCTELEM) * 8; /* shift */
219 if (r <= 16) return 0;
227 * Initialize for a processing pass.
228 * Verify that all referenced Q-tables are present, and set up
229 * the divisor table for each one.
230 * In the current implementation, DCT of all components is done during
231 * the first pass, even if only some components will be output in the
232 * first scan. Hence all components should be examined here.
236 start_pass_fdctmgr(j_compress_ptr cinfo)
238 my_fdct_ptr fdct = (my_fdct_ptr)cinfo->fdct;
240 jpeg_component_info *compptr;
244 for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
246 qtblno = compptr->quant_tbl_no;
247 /* Make sure specified quantization table is present */
248 if (qtblno < 0 || qtblno >= NUM_QUANT_TBLS ||
249 cinfo->quant_tbl_ptrs[qtblno] == NULL)
250 ERREXIT1(cinfo, JERR_NO_QUANT_TABLE, qtblno);
251 qtbl = cinfo->quant_tbl_ptrs[qtblno];
252 /* Compute divisors for this quant table */
253 /* We may do this more than once for same table, but it's not a big deal */
254 switch (cinfo->dct_method) {
255 #ifdef DCT_ISLOW_SUPPORTED
257 /* For LL&M IDCT method, divisors are equal to raw quantization
258 * coefficients multiplied by 8 (to counteract scaling).
260 if (fdct->divisors[qtblno] == NULL) {
261 fdct->divisors[qtblno] = (DCTELEM *)
262 (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
263 (DCTSIZE2 * 4) * sizeof(DCTELEM));
265 dtbl = fdct->divisors[qtblno];
266 for (i = 0; i < DCTSIZE2; i++) {
267 #if BITS_IN_JSAMPLE == 8
269 if (!compute_reciprocal(qtbl->quantval[i] << 3, &dtbl[i]) &&
270 fdct->quantize == jsimd_quantize)
271 fdct->quantize = quantize;
273 compute_reciprocal(qtbl->quantval[i] << 3, &dtbl[i]);
276 dtbl[i] = ((DCTELEM)qtbl->quantval[i]) << 3;
281 #ifdef DCT_IFAST_SUPPORTED
284 /* For AA&N IDCT method, divisors are equal to quantization
285 * coefficients scaled by scalefactor[row]*scalefactor[col], where
287 * scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7
288 * We apply a further scale factor of 8.
290 #define CONST_BITS 14
291 static const INT16 aanscales[DCTSIZE2] = {
292 /* precomputed values scaled up by 14 bits */
293 16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520,
294 22725, 31521, 29692, 26722, 22725, 17855, 12299, 6270,
295 21407, 29692, 27969, 25172, 21407, 16819, 11585, 5906,
296 19266, 26722, 25172, 22654, 19266, 15137, 10426, 5315,
297 16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520,
298 12873, 17855, 16819, 15137, 12873, 10114, 6967, 3552,
299 8867, 12299, 11585, 10426, 8867, 6967, 4799, 2446,
300 4520, 6270, 5906, 5315, 4520, 3552, 2446, 1247
304 if (fdct->divisors[qtblno] == NULL) {
305 fdct->divisors[qtblno] = (DCTELEM *)
306 (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
307 (DCTSIZE2 * 4) * sizeof(DCTELEM));
309 dtbl = fdct->divisors[qtblno];
310 for (i = 0; i < DCTSIZE2; i++) {
311 #if BITS_IN_JSAMPLE == 8
313 if (!compute_reciprocal(
314 DESCALE(MULTIPLY16V16((JLONG)qtbl->quantval[i],
315 (JLONG)aanscales[i]),
316 CONST_BITS - 3), &dtbl[i]) &&
317 fdct->quantize == jsimd_quantize)
318 fdct->quantize = quantize;
321 DESCALE(MULTIPLY16V16((JLONG)qtbl->quantval[i],
322 (JLONG)aanscales[i]),
323 CONST_BITS-3), &dtbl[i]);
327 DESCALE(MULTIPLY16V16((JLONG)qtbl->quantval[i],
328 (JLONG)aanscales[i]),
335 #ifdef DCT_FLOAT_SUPPORTED
338 /* For float AA&N IDCT method, divisors are equal to quantization
339 * coefficients scaled by scalefactor[row]*scalefactor[col], where
341 * scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7
342 * We apply a further scale factor of 8.
343 * What's actually stored is 1/divisor so that the inner loop can
344 * use a multiplication rather than a division.
348 static const double aanscalefactor[DCTSIZE] = {
349 1.0, 1.387039845, 1.306562965, 1.175875602,
350 1.0, 0.785694958, 0.541196100, 0.275899379
353 if (fdct->float_divisors[qtblno] == NULL) {
354 fdct->float_divisors[qtblno] = (FAST_FLOAT *)
355 (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
356 DCTSIZE2 * sizeof(FAST_FLOAT));
358 fdtbl = fdct->float_divisors[qtblno];
360 for (row = 0; row < DCTSIZE; row++) {
361 for (col = 0; col < DCTSIZE; col++) {
362 fdtbl[i] = (FAST_FLOAT)
363 (1.0 / (((double)qtbl->quantval[i] *
364 aanscalefactor[row] * aanscalefactor[col] * 8.0)));
372 ERREXIT(cinfo, JERR_NOT_COMPILED);
380 * Load data into workspace, applying unsigned->signed conversion.
384 convsamp(_JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace)
386 register DCTELEM *workspaceptr;
387 register _JSAMPROW elemptr;
390 workspaceptr = workspace;
391 for (elemr = 0; elemr < DCTSIZE; elemr++) {
392 elemptr = sample_data[elemr] + start_col;
394 #if DCTSIZE == 8 /* unroll the inner loop */
395 *workspaceptr++ = (*elemptr++) - _CENTERJSAMPLE;
396 *workspaceptr++ = (*elemptr++) - _CENTERJSAMPLE;
397 *workspaceptr++ = (*elemptr++) - _CENTERJSAMPLE;
398 *workspaceptr++ = (*elemptr++) - _CENTERJSAMPLE;
399 *workspaceptr++ = (*elemptr++) - _CENTERJSAMPLE;
400 *workspaceptr++ = (*elemptr++) - _CENTERJSAMPLE;
401 *workspaceptr++ = (*elemptr++) - _CENTERJSAMPLE;
402 *workspaceptr++ = (*elemptr++) - _CENTERJSAMPLE;
406 for (elemc = DCTSIZE; elemc > 0; elemc--)
407 *workspaceptr++ = (*elemptr++) - _CENTERJSAMPLE;
415 * Quantize/descale the coefficients, and store into coef_blocks[].
419 quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
423 JCOEFPTR output_ptr = coef_block;
425 #if BITS_IN_JSAMPLE == 8
427 UDCTELEM recip, corr;
431 for (i = 0; i < DCTSIZE2; i++) {
433 recip = divisors[i + DCTSIZE2 * 0];
434 corr = divisors[i + DCTSIZE2 * 1];
435 shift = divisors[i + DCTSIZE2 * 3];
439 product = (UDCTELEM2)(temp + corr) * recip;
440 product >>= shift + sizeof(DCTELEM) * 8;
441 temp = (DCTELEM)product;
444 product = (UDCTELEM2)(temp + corr) * recip;
445 product >>= shift + sizeof(DCTELEM) * 8;
446 temp = (DCTELEM)product;
448 output_ptr[i] = (JCOEF)temp;
453 register DCTELEM qval;
455 for (i = 0; i < DCTSIZE2; i++) {
458 /* Divide the coefficient value by qval, ensuring proper rounding.
459 * Since C does not specify the direction of rounding for negative
460 * quotients, we have to force the dividend positive for portability.
462 * In most files, at least half of the output values will be zero
463 * (at default quantization settings, more like three-quarters...)
464 * so we should ensure that this case is fast. On many machines,
465 * a comparison is enough cheaper than a divide to make a special test
466 * a win. Since both inputs will be nonnegative, we need only test
467 * for a < b to discover whether a/b is 0.
468 * If your machine's division is fast enough, define FAST_DIVIDE.
471 #define DIVIDE_BY(a, b) a /= b
473 #define DIVIDE_BY(a, b) if (a >= b) a /= b; else a = 0
477 temp += qval >> 1; /* for rounding */
478 DIVIDE_BY(temp, qval);
481 temp += qval >> 1; /* for rounding */
482 DIVIDE_BY(temp, qval);
484 output_ptr[i] = (JCOEF)temp;
493 * Perform forward DCT on one or more blocks of a component.
495 * The input samples are taken from the sample_data[] array starting at
496 * position start_row/start_col, and moving to the right for any additional
497 * blocks. The quantized coefficients are returned in coef_blocks[].
501 forward_DCT(j_compress_ptr cinfo, jpeg_component_info *compptr,
502 _JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
503 JDIMENSION start_row, JDIMENSION start_col, JDIMENSION num_blocks)
504 /* This version is used for integer DCT implementations. */
506 /* This routine is heavily used, so it's worth coding it tightly. */
507 my_fdct_ptr fdct = (my_fdct_ptr)cinfo->fdct;
508 DCTELEM *divisors = fdct->divisors[compptr->quant_tbl_no];
512 /* Make sure the compiler doesn't look up these every pass */
513 forward_DCT_method_ptr do_dct = fdct->dct;
514 convsamp_method_ptr do_convsamp = fdct->convsamp;
515 quantize_method_ptr do_quantize = fdct->quantize;
516 workspace = fdct->workspace;
518 sample_data += start_row; /* fold in the vertical offset once */
520 for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) {
521 /* Load data into workspace, applying unsigned->signed conversion */
522 (*do_convsamp) (sample_data, start_col, workspace);
524 /* Perform the DCT */
525 (*do_dct) (workspace);
527 /* Quantize/descale the coefficients, and store into coef_blocks[] */
528 (*do_quantize) (coef_blocks[bi], divisors, workspace);
533 #ifdef DCT_FLOAT_SUPPORTED
536 convsamp_float(_JSAMPARRAY sample_data, JDIMENSION start_col,
537 FAST_FLOAT *workspace)
539 register FAST_FLOAT *workspaceptr;
540 register _JSAMPROW elemptr;
543 workspaceptr = workspace;
544 for (elemr = 0; elemr < DCTSIZE; elemr++) {
545 elemptr = sample_data[elemr] + start_col;
546 #if DCTSIZE == 8 /* unroll the inner loop */
547 *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - _CENTERJSAMPLE);
548 *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - _CENTERJSAMPLE);
549 *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - _CENTERJSAMPLE);
550 *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - _CENTERJSAMPLE);
551 *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - _CENTERJSAMPLE);
552 *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - _CENTERJSAMPLE);
553 *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - _CENTERJSAMPLE);
554 *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - _CENTERJSAMPLE);
558 for (elemc = DCTSIZE; elemc > 0; elemc--)
559 *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - _CENTERJSAMPLE);
567 quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
568 FAST_FLOAT *workspace)
570 register FAST_FLOAT temp;
572 register JCOEFPTR output_ptr = coef_block;
574 for (i = 0; i < DCTSIZE2; i++) {
575 /* Apply the quantization and scaling factor */
576 temp = workspace[i] * divisors[i];
578 /* Round to nearest integer.
579 * Since C does not specify the direction of rounding for negative
580 * quotients, we have to force the dividend positive for portability.
581 * The maximum coefficient size is +-16K (for 12-bit data), so this
582 * code should work for either 16-bit or 32-bit ints.
584 output_ptr[i] = (JCOEF)((int)(temp + (FAST_FLOAT)16384.5) - 16384);
590 forward_DCT_float(j_compress_ptr cinfo, jpeg_component_info *compptr,
591 _JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
592 JDIMENSION start_row, JDIMENSION start_col,
593 JDIMENSION num_blocks)
594 /* This version is used for floating-point DCT implementations. */
596 /* This routine is heavily used, so it's worth coding it tightly. */
597 my_fdct_ptr fdct = (my_fdct_ptr)cinfo->fdct;
598 FAST_FLOAT *divisors = fdct->float_divisors[compptr->quant_tbl_no];
599 FAST_FLOAT *workspace;
603 /* Make sure the compiler doesn't look up these every pass */
604 float_DCT_method_ptr do_dct = fdct->float_dct;
605 float_convsamp_method_ptr do_convsamp = fdct->float_convsamp;
606 float_quantize_method_ptr do_quantize = fdct->float_quantize;
607 workspace = fdct->float_workspace;
609 sample_data += start_row; /* fold in the vertical offset once */
611 for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) {
612 /* Load data into workspace, applying unsigned->signed conversion */
613 (*do_convsamp) (sample_data, start_col, workspace);
615 /* Perform the DCT */
616 (*do_dct) (workspace);
618 /* Quantize/descale the coefficients, and store into coef_blocks[] */
619 (*do_quantize) (coef_blocks[bi], divisors, workspace);
623 #endif /* DCT_FLOAT_SUPPORTED */
627 * Initialize FDCT manager.
631 _jinit_forward_dct(j_compress_ptr cinfo)
636 if (cinfo->data_precision != BITS_IN_JSAMPLE)
637 ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
640 (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
641 sizeof(my_fdct_controller));
642 cinfo->fdct = (struct jpeg_forward_dct *)fdct;
643 fdct->pub.start_pass = start_pass_fdctmgr;
645 /* First determine the DCT... */
646 switch (cinfo->dct_method) {
647 #ifdef DCT_ISLOW_SUPPORTED
649 fdct->pub._forward_DCT = forward_DCT;
651 if (jsimd_can_fdct_islow())
652 fdct->dct = jsimd_fdct_islow;
655 fdct->dct = _jpeg_fdct_islow;
658 #ifdef DCT_IFAST_SUPPORTED
660 fdct->pub._forward_DCT = forward_DCT;
662 if (jsimd_can_fdct_ifast())
663 fdct->dct = jsimd_fdct_ifast;
666 fdct->dct = _jpeg_fdct_ifast;
669 #ifdef DCT_FLOAT_SUPPORTED
671 fdct->pub._forward_DCT = forward_DCT_float;
673 if (jsimd_can_fdct_float())
674 fdct->float_dct = jsimd_fdct_float;
677 fdct->float_dct = jpeg_fdct_float;
681 ERREXIT(cinfo, JERR_NOT_COMPILED);
685 /* ...then the supporting stages. */
686 switch (cinfo->dct_method) {
687 #ifdef DCT_ISLOW_SUPPORTED
690 #ifdef DCT_IFAST_SUPPORTED
693 #if defined(DCT_ISLOW_SUPPORTED) || defined(DCT_IFAST_SUPPORTED)
695 if (jsimd_can_convsamp())
696 fdct->convsamp = jsimd_convsamp;
699 fdct->convsamp = convsamp;
701 if (jsimd_can_quantize())
702 fdct->quantize = jsimd_quantize;
705 fdct->quantize = quantize;
708 #ifdef DCT_FLOAT_SUPPORTED
711 if (jsimd_can_convsamp_float())
712 fdct->float_convsamp = jsimd_convsamp_float;
715 fdct->float_convsamp = convsamp_float;
717 if (jsimd_can_quantize_float())
718 fdct->float_quantize = jsimd_quantize_float;
721 fdct->float_quantize = quantize_float;
725 ERREXIT(cinfo, JERR_NOT_COMPILED);
729 /* Allocate workspace memory */
730 #ifdef DCT_FLOAT_SUPPORTED
731 if (cinfo->dct_method == JDCT_FLOAT)
732 fdct->float_workspace = (FAST_FLOAT *)
733 (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
734 sizeof(FAST_FLOAT) * DCTSIZE2);
737 fdct->workspace = (DCTELEM *)
738 (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
739 sizeof(DCTELEM) * DCTSIZE2);
741 /* Mark divisor tables unallocated */
742 for (i = 0; i < NUM_QUANT_TBLS; i++) {
743 fdct->divisors[i] = NULL;
744 #ifdef DCT_FLOAT_SUPPORTED
745 fdct->float_divisors[i] = NULL;