4 * Copyright (C) 1994-1996, Thomas G. Lane.
5 * Copyright (C) 1999-2006, MIYASAKA Masaru.
6 * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
7 * Copyright (C) 2011 D. R. Commander
8 * This file is part of the Independent JPEG Group's software.
9 * For conditions of distribution and use, see the accompanying README file.
11 * This file contains the forward-DCT management logic.
12 * This code selects a particular DCT implementation to be used,
13 * and it performs related housekeeping chores including coefficient
17 #define JPEG_INTERNALS
20 #include "jdct.h" /* Private declarations for DCT subsystem */
24 /* Private subobject for this module */
26 typedef JMETHOD(void, forward_DCT_method_ptr, (DCTELEM * data));
27 typedef JMETHOD(void, float_DCT_method_ptr, (FAST_FLOAT * data));
29 typedef JMETHOD(void, convsamp_method_ptr,
30 (JSAMPARRAY sample_data, JDIMENSION start_col,
31 DCTELEM * workspace));
32 typedef JMETHOD(void, float_convsamp_method_ptr,
33 (JSAMPARRAY sample_data, JDIMENSION start_col,
34 FAST_FLOAT *workspace));
36 typedef JMETHOD(void, quantize_method_ptr,
37 (JCOEFPTR coef_block, DCTELEM * divisors,
38 DCTELEM * workspace));
39 typedef JMETHOD(void, float_quantize_method_ptr,
40 (JCOEFPTR coef_block, FAST_FLOAT * divisors,
41 FAST_FLOAT * workspace));
43 METHODDEF(void) quantize (JCOEFPTR, DCTELEM *, DCTELEM *);
46 struct jpeg_forward_dct pub; /* public fields */
48 /* Pointer to the DCT routine actually in use */
49 forward_DCT_method_ptr dct;
50 convsamp_method_ptr convsamp;
51 quantize_method_ptr quantize;
53 /* The actual post-DCT divisors --- not identical to the quant table
54 * entries, because of scaling (especially for an unnormalized DCT).
55 * Each table is given in normal array order.
57 DCTELEM * divisors[NUM_QUANT_TBLS];
59 /* work area for FDCT subroutine */
62 #ifdef DCT_FLOAT_SUPPORTED
63 /* Same as above for the floating-point case. */
64 float_DCT_method_ptr float_dct;
65 float_convsamp_method_ptr float_convsamp;
66 float_quantize_method_ptr float_quantize;
67 FAST_FLOAT * float_divisors[NUM_QUANT_TBLS];
68 FAST_FLOAT * float_workspace;
72 typedef my_fdct_controller * my_fdct_ptr;
76 * Find the highest bit in an integer through binary search.
88 if (!(val & 0xff00)) {
92 if (!(val & 0xf000)) {
96 if (!(val & 0xc000)) {
100 if (!(val & 0x8000)) {
109 * Compute values to do a division using reciprocal.
111 * This implementation is based on an algorithm described in
112 * "How to optimize for the Pentium family of microprocessors"
113 * (http://www.agner.org/assem/).
114 * More information about the basic algorithm can be found in
115 * the paper "Integer Division Using Reciprocals" by Robert Alverson.
117 * The basic idea is to replace x/d by x * d^-1. In order to store
118 * d^-1 with enough precision we shift it left a few places. It turns
119 * out that this algoright gives just enough precision, and also fits
122 * b = (the number of significant bits in divisor) - 1
123 * r = (word size) + b
126 * f will not be an integer for most cases, so we need to compensate
127 * for the rounding error introduced:
129 * no fractional part:
131 * result = input >> r
133 * fractional part of f < 0.5:
135 * round f down to nearest integer
136 * result = ((input + 1) * f) >> r
138 * fractional part of f > 0.5:
140 * round f up to nearest integer
141 * result = (input * f) >> r
143 * This is the original algorithm that gives truncated results. But we
144 * want properly rounded results, so we replace "input" with
145 * "input + divisor/2".
147 * In order to allow SIMD implementations we also tweak the values to
148 * allow the same calculation to be made at all times:
150 * dctbl[0] = f rounded to nearest integer
151 * dctbl[1] = divisor / 2 (+ 1 if fractional part of f < 0.5)
152 * dctbl[2] = 1 << ((word size) * 2 - r)
153 * dctbl[3] = r - (word size)
155 * dctbl[2] is for stupid instruction sets where the shift operation
156 * isn't member wise (e.g. MMX).
158 * The reason dctbl[2] and dctbl[3] reduce the shift with (word size)
159 * is that most SIMD implementations have a "multiply and store top
162 * Lastly, we store each of the values in their own table instead
163 * of in a consecutive manner, yet again in order to allow SIMD
167 compute_reciprocal (UINT16 divisor, DCTELEM * dtbl)
173 b = flss(divisor) - 1;
174 r = sizeof(DCTELEM) * 8 + b;
176 fq = ((UDCTELEM2)1 << r) / divisor;
177 fr = ((UDCTELEM2)1 << r) % divisor;
179 c = divisor / 2; /* for rounding */
181 if (fr == 0) { /* divisor is power of two */
182 /* fq will be one bit too large to fit in DCTELEM, so adjust */
185 } else if (fr <= (divisor / 2U)) { /* fractional part is < 0.5 */
187 } else { /* fractional part is > 0.5 */
191 dtbl[DCTSIZE2 * 0] = (DCTELEM) fq; /* reciprocal */
192 dtbl[DCTSIZE2 * 1] = (DCTELEM) c; /* correction + roundfactor */
193 dtbl[DCTSIZE2 * 2] = (DCTELEM) (1 << (sizeof(DCTELEM)*8*2 - r)); /* scale */
194 dtbl[DCTSIZE2 * 3] = (DCTELEM) r - sizeof(DCTELEM)*8; /* shift */
196 if(r <= 16) return 0;
201 * Initialize for a processing pass.
202 * Verify that all referenced Q-tables are present, and set up
203 * the divisor table for each one.
204 * In the current implementation, DCT of all components is done during
205 * the first pass, even if only some components will be output in the
206 * first scan. Hence all components should be examined here.
210 start_pass_fdctmgr (j_compress_ptr cinfo)
212 my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
214 jpeg_component_info *compptr;
218 for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
220 qtblno = compptr->quant_tbl_no;
221 /* Make sure specified quantization table is present */
222 if (qtblno < 0 || qtblno >= NUM_QUANT_TBLS ||
223 cinfo->quant_tbl_ptrs[qtblno] == NULL)
224 ERREXIT1(cinfo, JERR_NO_QUANT_TABLE, qtblno);
225 qtbl = cinfo->quant_tbl_ptrs[qtblno];
226 /* Compute divisors for this quant table */
227 /* We may do this more than once for same table, but it's not a big deal */
228 switch (cinfo->dct_method) {
229 #ifdef DCT_ISLOW_SUPPORTED
231 /* For LL&M IDCT method, divisors are equal to raw quantization
232 * coefficients multiplied by 8 (to counteract scaling).
234 if (fdct->divisors[qtblno] == NULL) {
235 fdct->divisors[qtblno] = (DCTELEM *)
236 (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
237 (DCTSIZE2 * 4) * SIZEOF(DCTELEM));
239 dtbl = fdct->divisors[qtblno];
240 for (i = 0; i < DCTSIZE2; i++) {
241 if(!compute_reciprocal(qtbl->quantval[i] << 3, &dtbl[i])
242 && fdct->quantize == jsimd_quantize)
243 fdct->quantize = quantize;
247 #ifdef DCT_IFAST_SUPPORTED
250 /* For AA&N IDCT method, divisors are equal to quantization
251 * coefficients scaled by scalefactor[row]*scalefactor[col], where
253 * scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7
254 * We apply a further scale factor of 8.
256 #define CONST_BITS 14
257 static const INT16 aanscales[DCTSIZE2] = {
258 /* precomputed values scaled up by 14 bits */
259 16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520,
260 22725, 31521, 29692, 26722, 22725, 17855, 12299, 6270,
261 21407, 29692, 27969, 25172, 21407, 16819, 11585, 5906,
262 19266, 26722, 25172, 22654, 19266, 15137, 10426, 5315,
263 16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520,
264 12873, 17855, 16819, 15137, 12873, 10114, 6967, 3552,
265 8867, 12299, 11585, 10426, 8867, 6967, 4799, 2446,
266 4520, 6270, 5906, 5315, 4520, 3552, 2446, 1247
270 if (fdct->divisors[qtblno] == NULL) {
271 fdct->divisors[qtblno] = (DCTELEM *)
272 (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
273 (DCTSIZE2 * 4) * SIZEOF(DCTELEM));
275 dtbl = fdct->divisors[qtblno];
276 for (i = 0; i < DCTSIZE2; i++) {
277 if(!compute_reciprocal(
278 DESCALE(MULTIPLY16V16((INT32) qtbl->quantval[i],
279 (INT32) aanscales[i]),
280 CONST_BITS-3), &dtbl[i])
281 && fdct->quantize == jsimd_quantize)
282 fdct->quantize = quantize;
287 #ifdef DCT_FLOAT_SUPPORTED
290 /* For float AA&N IDCT method, divisors are equal to quantization
291 * coefficients scaled by scalefactor[row]*scalefactor[col], where
293 * scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7
294 * We apply a further scale factor of 8.
295 * What's actually stored is 1/divisor so that the inner loop can
296 * use a multiplication rather than a division.
300 static const double aanscalefactor[DCTSIZE] = {
301 1.0, 1.387039845, 1.306562965, 1.175875602,
302 1.0, 0.785694958, 0.541196100, 0.275899379
305 if (fdct->float_divisors[qtblno] == NULL) {
306 fdct->float_divisors[qtblno] = (FAST_FLOAT *)
307 (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
308 DCTSIZE2 * SIZEOF(FAST_FLOAT));
310 fdtbl = fdct->float_divisors[qtblno];
312 for (row = 0; row < DCTSIZE; row++) {
313 for (col = 0; col < DCTSIZE; col++) {
314 fdtbl[i] = (FAST_FLOAT)
315 (1.0 / (((double) qtbl->quantval[i] *
316 aanscalefactor[row] * aanscalefactor[col] * 8.0)));
324 ERREXIT(cinfo, JERR_NOT_COMPILED);
332 * Load data into workspace, applying unsigned->signed conversion.
336 convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace)
338 register DCTELEM *workspaceptr;
339 register JSAMPROW elemptr;
342 workspaceptr = workspace;
343 for (elemr = 0; elemr < DCTSIZE; elemr++) {
344 elemptr = sample_data[elemr] + start_col;
346 #if DCTSIZE == 8 /* unroll the inner loop */
347 *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
348 *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
349 *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
350 *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
351 *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
352 *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
353 *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
354 *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
358 for (elemc = DCTSIZE; elemc > 0; elemc--)
359 *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
367 * Quantize/descale the coefficients, and store into coef_blocks[].
371 quantize (JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace)
375 UDCTELEM recip, corr, shift;
377 JCOEFPTR output_ptr = coef_block;
379 for (i = 0; i < DCTSIZE2; i++) {
381 recip = divisors[i + DCTSIZE2 * 0];
382 corr = divisors[i + DCTSIZE2 * 1];
383 shift = divisors[i + DCTSIZE2 * 3];
387 product = (UDCTELEM2)(temp + corr) * recip;
388 product >>= shift + sizeof(DCTELEM)*8;
392 product = (UDCTELEM2)(temp + corr) * recip;
393 product >>= shift + sizeof(DCTELEM)*8;
397 output_ptr[i] = (JCOEF) temp;
403 * Perform forward DCT on one or more blocks of a component.
405 * The input samples are taken from the sample_data[] array starting at
406 * position start_row/start_col, and moving to the right for any additional
407 * blocks. The quantized coefficients are returned in coef_blocks[].
411 forward_DCT (j_compress_ptr cinfo, jpeg_component_info * compptr,
412 JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
413 JDIMENSION start_row, JDIMENSION start_col,
414 JDIMENSION num_blocks)
415 /* This version is used for integer DCT implementations. */
417 /* This routine is heavily used, so it's worth coding it tightly. */
418 my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
419 DCTELEM * divisors = fdct->divisors[compptr->quant_tbl_no];
423 /* Make sure the compiler doesn't look up these every pass */
424 forward_DCT_method_ptr do_dct = fdct->dct;
425 convsamp_method_ptr do_convsamp = fdct->convsamp;
426 quantize_method_ptr do_quantize = fdct->quantize;
427 workspace = fdct->workspace;
429 sample_data += start_row; /* fold in the vertical offset once */
431 for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) {
432 /* Load data into workspace, applying unsigned->signed conversion */
433 (*do_convsamp) (sample_data, start_col, workspace);
435 /* Perform the DCT */
436 (*do_dct) (workspace);
438 /* Quantize/descale the coefficients, and store into coef_blocks[] */
439 (*do_quantize) (coef_blocks[bi], divisors, workspace);
444 #ifdef DCT_FLOAT_SUPPORTED
448 convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT * workspace)
450 register FAST_FLOAT *workspaceptr;
451 register JSAMPROW elemptr;
454 workspaceptr = workspace;
455 for (elemr = 0; elemr < DCTSIZE; elemr++) {
456 elemptr = sample_data[elemr] + start_col;
457 #if DCTSIZE == 8 /* unroll the inner loop */
458 *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
459 *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
460 *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
461 *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
462 *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
463 *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
464 *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
465 *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
469 for (elemc = DCTSIZE; elemc > 0; elemc--)
470 *workspaceptr++ = (FAST_FLOAT)
471 (GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
479 quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors, FAST_FLOAT * workspace)
481 register FAST_FLOAT temp;
483 register JCOEFPTR output_ptr = coef_block;
485 for (i = 0; i < DCTSIZE2; i++) {
486 /* Apply the quantization and scaling factor */
487 temp = workspace[i] * divisors[i];
489 /* Round to nearest integer.
490 * Since C does not specify the direction of rounding for negative
491 * quotients, we have to force the dividend positive for portability.
492 * The maximum coefficient size is +-16K (for 12-bit data), so this
493 * code should work for either 16-bit or 32-bit ints.
495 output_ptr[i] = (JCOEF) ((int) (temp + (FAST_FLOAT) 16384.5) - 16384);
501 forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info * compptr,
502 JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
503 JDIMENSION start_row, JDIMENSION start_col,
504 JDIMENSION num_blocks)
505 /* This version is used for floating-point DCT implementations. */
507 /* This routine is heavily used, so it's worth coding it tightly. */
508 my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
509 FAST_FLOAT * divisors = fdct->float_divisors[compptr->quant_tbl_no];
510 FAST_FLOAT * workspace;
514 /* Make sure the compiler doesn't look up these every pass */
515 float_DCT_method_ptr do_dct = fdct->float_dct;
516 float_convsamp_method_ptr do_convsamp = fdct->float_convsamp;
517 float_quantize_method_ptr do_quantize = fdct->float_quantize;
518 workspace = fdct->float_workspace;
520 sample_data += start_row; /* fold in the vertical offset once */
522 for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) {
523 /* Load data into workspace, applying unsigned->signed conversion */
524 (*do_convsamp) (sample_data, start_col, workspace);
526 /* Perform the DCT */
527 (*do_dct) (workspace);
529 /* Quantize/descale the coefficients, and store into coef_blocks[] */
530 (*do_quantize) (coef_blocks[bi], divisors, workspace);
534 #endif /* DCT_FLOAT_SUPPORTED */
538 * Initialize FDCT manager.
542 jinit_forward_dct (j_compress_ptr cinfo)
548 (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
549 SIZEOF(my_fdct_controller));
550 cinfo->fdct = (struct jpeg_forward_dct *) fdct;
551 fdct->pub.start_pass = start_pass_fdctmgr;
553 /* First determine the DCT... */
554 switch (cinfo->dct_method) {
555 #ifdef DCT_ISLOW_SUPPORTED
557 fdct->pub.forward_DCT = forward_DCT;
558 if (jsimd_can_fdct_islow())
559 fdct->dct = jsimd_fdct_islow;
561 fdct->dct = jpeg_fdct_islow;
564 #ifdef DCT_IFAST_SUPPORTED
566 fdct->pub.forward_DCT = forward_DCT;
567 if (jsimd_can_fdct_ifast())
568 fdct->dct = jsimd_fdct_ifast;
570 fdct->dct = jpeg_fdct_ifast;
573 #ifdef DCT_FLOAT_SUPPORTED
575 fdct->pub.forward_DCT = forward_DCT_float;
576 if (jsimd_can_fdct_float())
577 fdct->float_dct = jsimd_fdct_float;
579 fdct->float_dct = jpeg_fdct_float;
583 ERREXIT(cinfo, JERR_NOT_COMPILED);
587 /* ...then the supporting stages. */
588 switch (cinfo->dct_method) {
589 #ifdef DCT_ISLOW_SUPPORTED
592 #ifdef DCT_IFAST_SUPPORTED
595 #if defined(DCT_ISLOW_SUPPORTED) || defined(DCT_IFAST_SUPPORTED)
596 if (jsimd_can_convsamp())
597 fdct->convsamp = jsimd_convsamp;
599 fdct->convsamp = convsamp;
600 if (jsimd_can_quantize())
601 fdct->quantize = jsimd_quantize;
603 fdct->quantize = quantize;
606 #ifdef DCT_FLOAT_SUPPORTED
608 if (jsimd_can_convsamp_float())
609 fdct->float_convsamp = jsimd_convsamp_float;
611 fdct->float_convsamp = convsamp_float;
612 if (jsimd_can_quantize_float())
613 fdct->float_quantize = jsimd_quantize_float;
615 fdct->float_quantize = quantize_float;
619 ERREXIT(cinfo, JERR_NOT_COMPILED);
623 /* Allocate workspace memory */
624 #ifdef DCT_FLOAT_SUPPORTED
625 if (cinfo->dct_method == JDCT_FLOAT)
626 fdct->float_workspace = (FAST_FLOAT *)
627 (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
628 SIZEOF(FAST_FLOAT) * DCTSIZE2);
631 fdct->workspace = (DCTELEM *)
632 (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
633 SIZEOF(DCTELEM) * DCTSIZE2);
635 /* Mark divisor tables unallocated */
636 for (i = 0; i < NUM_QUANT_TBLS; i++) {
637 fdct->divisors[i] = NULL;
638 #ifdef DCT_FLOAT_SUPPORTED
639 fdct->float_divisors[i] = NULL;