1 /* xdelta 3 - delta compression tools and library
2 * Copyright (C) 2002, 2006, 2007. Joshua P. MacDonald
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 /* TODO: This code needs a thorough round of commenting. There is
20 * some slop in the declaration of arrays, which are maybe one element
21 * larger than they need to be and comments would help clear it up. */
23 #ifndef _XDELTA3_DJW_H_
24 #define _XDELTA3_DJW_H_
26 /* The following people deserve much credit for the algorithms and
27 * techniques contained in this file:
30 Bzip2 sources, implementation of the multi-table Huffman technique.
32 Jean-loup Gailly and Mark Adler and L. Peter Deutsch
33 Zlib source code, RFC 1951
35 Daniel S. Hirschberg and Debra A. LeLewer
36 "Efficient Decoding of Prefix Codes"
37 Communications of the ACM, April 1990 33(4).
40 Program bred3.c, bexp3 and accompanying documents bred3.ps, huff.ps.
41 This contains the idea behind the multi-table Huffman and 1-2 coding
43 ftp://ftp.cl.cam.ac.uk/users/djw3/
47 /* OPT: during the multi-table iteration, pick the worst-overall
48 * performing table and replace it with exactly the frequencies of the
49 * worst-overall performing sector or N-worst performing sectors. */
51 /* REF: See xdfs-0.222 and xdfs-0.226 for some old experiments with
52 * the Bzip prefix coding strategy. xdfs-0.256 contains the last of
53 * the other-format tests, including RFC1950 and the RFC1950+MTF
56 #define DJW_MAX_CODELEN 20U /* Maximum length of an alphabet code. */
58 /* Code lengths are themselves code-length encoded, so the total number of
59 * codes is: [RUN_0, RUN_1, 1-DJW_MAX_CODELEN] */
60 #define DJW_TOTAL_CODES (DJW_MAX_CODELEN+2)
62 #define RUN_0 0U /* Symbols used in MTF+1/2 coding. */
65 /* Number of code lengths always encoded (djw_encode_basic array) */
66 #define DJW_BASIC_CODES 5U
67 #define DJW_RUN_CODES 2U /* Number of run codes */
69 /* Offset of extra codes */
70 #define DJW_EXTRA_12OFFSET (DJW_BASIC_CODES + DJW_RUN_CODES)
72 /* Number of optionally encoded code lengths (djw_encode_extra array) */
73 #define DJW_EXTRA_CODES 15U
75 /* Number of bits to code [0-DJW_EXTRA_CODES] */
76 #define DJW_EXTRA_CODE_BITS 4U
78 #define DJW_MAX_GROUPS 8U /* Max number of group coding tables */
79 #define DJW_GROUP_BITS 3U /* Number of bits to code [1-DJW_MAX_GROUPS] */
81 #define DJW_SECTORSZ_MULT 5U /* Multiplier for encoded sectorsz */
82 #define DJW_SECTORSZ_BITS 5U /* Number of bits to code group size */
83 #define DJW_SECTORSZ_MAX ((1U << DJW_SECTORSZ_BITS) * DJW_SECTORSZ_MULT)
85 /* Maximum number of iterations to find group tables. */
86 #define DJW_MAX_ITER 6U
87 /* Minimum number of bits an iteration must reduce coding by. */
88 #define DJW_MIN_IMPROVEMENT 20U
90 /* Maximum code length of a prefix code length */
91 #define DJW_MAX_CLCLEN 15U
93 /* Number of bits to code [0-DJW_MAX_CLCLEN] */
94 #define DJW_CLCLEN_BITS 4U
96 #define DJW_MAX_GBCLEN 7U /* Maximum code length of a group selector */
98 /* Number of bits to code [0-DJW_MAX_GBCLEN]
99 * TODO: Actually, should never have zero code lengths here, or else a group
100 * went unused. Write a test for this: if a group goes unused, eliminate
102 #define DJW_GBCLEN_BITS 3U
104 /* It has to save at least this many bits... */
105 #define EFFICIENCY_BITS 16U
107 typedef struct _djw_stream djw_stream;
108 typedef struct _djw_heapen djw_heapen;
109 typedef struct _djw_prefix djw_prefix;
110 typedef uint32_t djw_weight;
133 /* Each Huffman table consists of 256 "code length" (CLEN) codes,
134 * which are themselves Huffman coded after eliminating repeats and
135 * move-to-front coding. The prefix consists of all the CLEN codes in
136 * djw_encode_basic plus a 4-bit value stating how many of the
137 * djw_encode_extra codes are actually coded (the rest are presumed
138 * zero, or unused CLEN codes).
140 * The values of these two arrays were arrived at by studying the
141 * distribution of min and max clen over a collection of DATA, INST,
142 * and ADDR inputs. The goal is to specify the order of
143 * djw_extra_codes that is most likely to minimize the number of extra
144 * codes that must be encoded.
146 * Results: 158896 sections were counted by compressing files (window
147 * size 512K) listed with: `find / -type f ( -user jmacd -o -perm +444
150 * The distribution of CLEN codes for each efficient invocation of the
151 * secondary compressor (taking the best number of groups/sector size)
152 * was recorded. Then we look at the distribution of min and max clen
153 * values, counting the number of times the value C_low is less than
154 * the min and C_high is greater than the max. Values >= C_high and
155 * <= C_low will not have their lengths coded. The results are sorted
156 * and the least likely 15 are placed into the djw_encode_extra[]
157 * array in order. These values are used as the initial MTF ordering.
188 static const uint8_t djw_encode_12extra[DJW_EXTRA_CODES] =
190 9, 10, 3, 11, 2, 12, 13, 1, 14, 15, 16, 17, 18, 19, 20,
193 static const uint8_t djw_encode_12basic[DJW_BASIC_CODES] =
198 /*********************************************************************/
200 /*********************************************************************/
202 static djw_stream* djw_alloc (xd3_stream *stream);
203 static int djw_init (xd3_stream *stream,
206 static void djw_destroy (xd3_stream *stream,
210 static int xd3_encode_huff (xd3_stream *stream,
211 djw_stream *sec_stream,
217 static int xd3_decode_huff (xd3_stream *stream,
218 djw_stream *sec_stream,
219 const uint8_t **input,
220 const uint8_t *const input_end,
222 const uint8_t *const output_end);
224 /*********************************************************************/
226 /*********************************************************************/
/* Allocate the secondary-compressor state object from the stream's
 * allocator.  Returns NULL on allocation failure (caller checks).
 * NOTE(review): intermediate source lines are missing from this extract;
 * code left byte-identical. */
229 djw_alloc (xd3_stream *stream)
231 return (djw_stream*) xd3_alloc (stream, sizeof (djw_stream), 1);
/* Initialize a djw_stream.  Intentionally a no-op: all fields are
 * (re)initialized immediately before each use. */
235 djw_init (xd3_stream *stream, djw_stream *h, int is_encode)
237 /* Fields are initialized prior to use. */
/* Release the djw_stream allocated by djw_alloc. */
242 djw_destroy (xd3_stream *stream,
245 xd3_free (stream, h);
249 /*********************************************************************/
251 /*********************************************************************/
/* Heap ordering predicate: order first by frequency, breaking ties by
 * tree depth (shallower subtree wins), which keeps code lengths short. */
254 heap_less (const djw_heapen *a, const djw_heapen *b)
256 return a->freq < b->freq ||
257 (a->freq == b->freq &&
258 a->depth < b->depth);
/* Sift-up insertion of entry index E at heap slot P.  The loop terminates
 * at heap[0], the sentinel 0th entry (always less than everything).
 * NOTE(review): the loop body is not visible in this extract. */
262 heap_insert (usize_t *heap, const djw_heapen *ents, usize_t p, const usize_t e)
264 /* Insert ents[e] into next slot heap[p] */
265 usize_t pp = p/2; /* P's parent */
267 while (heap_less (& ents[e], & ents[heap[pp]]))
/* Remove and return the minimum-weight heap entry (heap[1]), then
 * restore the heap property by sifting the replacement element down. */
277 static inline djw_heapen*
278 heap_extract (usize_t *heap, const djw_heapen *ents, usize_t heap_last)
280 usize_t smallest = heap[1];
283 /* Caller decrements heap_last, so heap_last+1 is the replacement elt. */
284 heap[1] = heap[heap_last+1];
/* Sift-down: pc is the (smaller) child of p on each iteration. */
287 for (p = 1; ; p = pc)
291 /* Reached bottom of heap */
292 if (pc > heap_last) { break; }
294 /* See if second child is smaller. */
295 if (pc < heap_last && heap_less (& ents[heap[pc+1]], & ents[heap[pc]]))
300 /* If pc is not smaller than p, heap property re-established. */
301 if (! heap_less (& ents[heap[pc]], & ents[heap[p]])) { break; }
308 return (djw_heapen*) & ents[smallest];
/* Debug-only invariant check: verify the heap property over all live
 * entries (used under IF_DEBUG in callers). */
313 heap_check (usize_t *heap, djw_heapen *ents, usize_t heap_last)
316 for (i = 1; i <= heap_last; i += 1)
318 /* Heap property: child not less than parent */
319 XD3_ASSERT (! heap_less (& ents[heap[i]], & ents[heap[i/2]]));
321 IF_DEBUG2 (DP(RINT "heap[%d] = %u\n", i, ents[heap[i]].freq));
326 /*********************************************************************/
328 /*********************************************************************/
/* Move-to-front update: take the symbol at position mtf_i, shift the
 * preceding entries up one slot, and (presumably) place the symbol at
 * the front and return it -- the tail of the function is not visible
 * in this extract; code left byte-identical. */
330 static inline usize_t
331 djw_update_mtf (uint8_t *mtf, usize_t mtf_i)
334 usize_t sym = mtf[mtf_i];
336 for (k = mtf_i; k != 0; k -= 1) { mtf[k] = mtf[k-1]; }
/* Emit the pending zero-run as a sequence of RUN_0/RUN_1 symbols
 * (the bzip-style 1-2 run-length encoding), appending each emitted
 * symbol to mtfsym[] and updating its frequency count. */
343 djw_update_1_2 (int *mtf_run, usize_t *mtf_i,
344 uint8_t *mtfsym, djw_weight *freq)
350 /* Offset by 1, since any number of RUN_ symbols implies run>0... */
/* The low bit of the remaining run selects which run symbol to emit. */
353 code = (*mtf_run & 1) ? RUN_1 : RUN_0;
355 mtfsym[(*mtf_i)++] = code;
359 while (*mtf_run >= 1);
/* Initialize the MTF ordering for code-length symbols: the always-coded
 * basic codes first, then the optional extra codes, per the empirically
 * chosen djw_encode_12basic / djw_encode_12extra orderings. */
365 djw_init_clen_mtf_1_2 (uint8_t *clmtf)
370 for (i = 0; i < DJW_BASIC_CODES; i += 1)
372 clmtf[cl_i++] = djw_encode_12basic[i];
374 for (i = 0; i < DJW_EXTRA_CODES; i += 1)
376 clmtf[cl_i++] = djw_encode_12extra[i];
380 /*********************************************************************/
382 /*********************************************************************/
/* Build Huffman code lengths for ASIZE symbols with frequencies FREQ,
 * writing lengths into CLEN (0-origin).  Lengths are capped at MAXLEN:
 * on overflow the frequencies are halved (bzip2-style) and construction
 * restarts.  NOTE(review): several intermediate source lines are missing
 * from this extract; code left byte-identical. */
385 djw_build_prefix (const djw_weight *freq, uint8_t *clen, usize_t asize, usize_t maxlen)
387 /* Heap with 0th entry unused, prefix tree with up to ALPHABET_SIZE-1
388 * internal nodes, never more than ALPHABET_SIZE entries actually in the
389 * heap (minimum weight subtrees during prefix construction). First
390 * ALPHABET_SIZE entries are the actual symbols, next ALPHABET_SIZE-1 are
392 djw_heapen ents[ALPHABET_SIZE * 2];
393 usize_t heap[ALPHABET_SIZE + 1];
395 usize_t heap_last; /* Index of the last _valid_ heap entry. */
396 usize_t ents_size; /* Number of entries, including 0th fake entry */
397 usize_t overflow; /* Number of code lengths that overflow */
401 IF_DEBUG (uint32_t first_bits = 0);
403 /* Insert real symbol frequences. */
404 for (i = 0; i < asize; i += 1)
406 ents[i+1].freq = freq[i];
407 IF_DEBUG2 (DP(RINT "ents[%d] = freq[%d] = %d\n",
413 /* The loop is re-entered each time an overflow occurs. Re-initialize... */
419 /* 0th entry terminates the while loop in heap_insert (it's the parent of
420 * the smallest element, always less-than) */
/* Seed the heap with every symbol of non-zero frequency. */
426 for (i = 0; i < asize; i += 1, ents_size += 1)
428 ents[ents_size].depth = 0;
429 ents[ents_size].parent = 0;
431 if (ents[ents_size].freq != 0)
433 heap_insert (heap, ents, ++heap_last, ents_size);
437 IF_DEBUG (heap_check (heap, ents, heap_last));
439 /* Must be at least one symbol, or else we can't get here. */
440 XD3_ASSERT (heap_last != 0);
442 /* If there is only one symbol, fake a second to prevent zero-length
446 /* Pick either the first or last symbol. */
447 int s = freq[0] ? asize-1 : 0;
452 /* Build prefix tree. */
/* Classic Huffman construction: repeatedly merge the two lightest
 * subtrees into a new internal node. */
453 while (heap_last > 1)
455 djw_heapen *h1 = heap_extract (heap, ents, --heap_last);
456 djw_heapen *h2 = heap_extract (heap, ents, --heap_last);
458 ents[ents_size].freq = h1->freq + h2->freq;
459 ents[ents_size].depth = 1 + max (h1->depth, h2->depth);
460 ents[ents_size].parent = 0;
462 h1->parent = h2->parent = ents_size;
464 heap_insert (heap, ents, ++heap_last, ents_size++);
467 IF_DEBUG (heap_check (heap, ents, heap_last));
469 /* Now compute prefix code lengths, counting parents. */
470 for (i = 1; i < asize+1; i += 1)
474 if (ents[i].freq != 0)
/* Walk the parent chain to the root; b is the symbol's depth. */
478 while ((p = ents[p].parent) != 0) { b += 1; }
480 if (b > maxlen) { overflow = 1; }
482 total_bits += b * freq[i-1];
485 /* clen is 0-origin, unlike ents. */
486 IF_DEBUG2 (DP(RINT "clen[%d] = %d\n", i-1, b));
490 IF_DEBUG (if (first_bits == 0) first_bits = total_bits);
494 IF_DEBUG2 (if (first_bits != total_bits)
496 DP(RINT "code length overflow changed %u bits\n",
497 (usize_t)(total_bits - first_bits));
502 /* OPT: There is a non-looping way to fix overflow shown in zlib, but this
503 * is easier (for now), as done in bzip2. */
504 for (i = 1; i < asize+1; i += 1)
506 ents[i].freq = ents[i].freq / 2 + 1;
/* Assign canonical prefix code values from the code lengths CLEN:
 * codes are handed out in increasing length order, which matches the
 * decoder's base/limit construction in djw_build_decoder. */
513 djw_build_codes (usize_t *codes, const uint8_t *clen, usize_t asize, usize_t abs_max)
516 usize_t min_clen = DJW_MAX_CODELEN;
517 usize_t max_clen = 0;
520 /* Find the min and max code length */
521 for (i = 0; i < asize; i += 1)
523 if (clen[i] > 0 && clen[i] < min_clen)
528 max_clen = max (max_clen, (usize_t) clen[i]);
531 XD3_ASSERT (max_clen <= abs_max);
533 /* Generate a code for each symbol with the appropriate length. */
534 for (l = min_clen; l <= max_clen; l += 1)
536 for (i = 0; i < asize; i += 1)
548 for (i = 0; i < asize; i += 1)
550 DP(RINT "code[%d] = %u\n", i, codes[i]);
555 /*********************************************************************/
557 /*********************************************************************/
/* Move-to-front transform prefix->symbol[] (scount symbols) using the
 * given MTF table, run-length encoding zero ranks with RUN_0/RUN_1,
 * writing the transformed stream to prefix->mtfsym[] and accumulating
 * symbol frequencies into freq_out.  Sets prefix->mcount to the
 * transformed length. */
559 djw_compute_mtf_1_2 (djw_prefix *prefix,
561 djw_weight *freq_out,
566 usize_t size = prefix->scount;
570 /* This +2 is for the RUN_0, RUN_1 codes */
571 memset (freq_out, 0, sizeof (freq_out[0]) * (nsym+2));
573 for (i = 0; i < size; )
575 /* OPT: Bzip optimizes this algorithm a little by effectively checking
576 * j==0 before the MTF update. */
577 sym = prefix->symbol[i++];
/* Linear search for the symbol's current MTF rank j. */
579 for (j = 0; mtf[j] != sym; j += 1) { }
581 XD3_ASSERT (j <= nsym);
/* Move sym to the front of the MTF table. */
583 for (k = j; k >= 1; k -= 1) { mtf[k] = mtf[k-1]; }
/* Flush any pending zero-run before emitting a non-zero rank. */
595 djw_update_1_2 (& mtf_run, & mtf_i, prefix->mtfsym, freq_out);
598 /* Non-zero symbols are offset by RUN_1 */
599 prefix->mtfsym[mtf_i++] = (uint8_t)(j+RUN_1);
600 freq_out[j+RUN_1] += 1;
/* Flush any trailing zero-run at end of input. */
605 djw_update_1_2 (& mtf_run, & mtf_i, prefix->mtfsym, freq_out);
608 prefix->mcount = mtf_i;
611 /* Counts character frequencies of the input buffer, returns the size. */
613 djw_count_freqs (djw_weight *freq, xd3_output *input)
618 memset (freq, 0, sizeof (freq[0]) * ALPHABET_SIZE);
/* Walk the linked list of output pages; each page holds in->next bytes. */
620 for (in = input; in; in = in->next_page)
622 const uint8_t *p = in->base;
623 const uint8_t *p_max = p + in->next;
/* Debug dump of the frequency table. */
636 for (i = 0; i < ALPHABET_SIZE; i += 1)
638 DP(RINT "%u ", freq[i]);
/* Concatenate the per-group CLEN arrays into prefix->symbol[] for joint
 * encoding.  Group 0 is copied wholesale; for later groups, zero code
 * lengths appear to be handled specially (the branch body is not visible
 * in this extract) before non-skipped lengths are appended. */
646 djw_compute_multi_prefix (usize_t groups,
647 uint8_t clen[DJW_MAX_GROUPS][ALPHABET_SIZE],
652 prefix->scount = ALPHABET_SIZE;
653 memcpy (prefix->symbol, clen[0], ALPHABET_SIZE);
655 for (gp = 1; gp < groups; gp += 1)
657 for (i = 0; i < ALPHABET_SIZE; i += 1)
659 if (clen[gp][i] == 0)
664 prefix->symbol[prefix->scount++] = clen[gp][i];
/* MTF+1/2 transform the prefix's code-length symbols using the standard
 * initial CLEN ordering; frequencies accumulate into freq. */
670 djw_compute_prefix_1_2 (djw_prefix *prefix, djw_weight *freq)
672 /* This +1 is for the 0 code-length. */
673 uint8_t clmtf[DJW_MAX_CODELEN+1];
675 djw_init_clen_mtf_1_2 (clmtf);
677 djw_compute_mtf_1_2 (prefix, clmtf, freq, DJW_MAX_CODELEN);
/* Encode the prefix (code-length) table itself: build a Huffman code
 * over the MTF+RLE-transformed code lengths, then emit (1) the count of
 * extra codes, (2) the code-length-code lengths in DJW_CLCLEN_BITS each,
 * and (3) the transformed CLEN symbol stream. */
681 djw_encode_prefix (xd3_stream *stream,
688 usize_t num_to_encode;
689 djw_weight clfreq[DJW_TOTAL_CODES];
690 uint8_t clclen[DJW_TOTAL_CODES];
691 usize_t clcode[DJW_TOTAL_CODES];
693 /* Move-to-front encode prefix symbols, count frequencies */
694 djw_compute_prefix_1_2 (prefix, clfreq);
697 djw_build_prefix (clfreq, clclen, DJW_TOTAL_CODES, DJW_MAX_CLCLEN);
698 djw_build_codes (clcode, clclen, DJW_TOTAL_CODES, DJW_MAX_CLCLEN);
700 /* Compute number of extra codes beyond basic ones for this template. */
701 num_to_encode = DJW_TOTAL_CODES;
/* Trailing zero-length extra codes are implicit and not transmitted. */
702 while (num_to_encode > DJW_EXTRA_12OFFSET && clclen[num_to_encode-1] == 0)
706 XD3_ASSERT (num_to_encode - DJW_EXTRA_12OFFSET < (1 << DJW_EXTRA_CODE_BITS));
708 /* Encode: # of extra codes */
709 if ((ret = xd3_encode_bits (stream, output, bstate, DJW_EXTRA_CODE_BITS,
710 num_to_encode - DJW_EXTRA_12OFFSET)))
715 /* Encode: MTF code lengths */
716 for (i = 0; i < num_to_encode; i += 1)
718 if ((ret = xd3_encode_bits (stream, output, bstate,
719 DJW_CLCLEN_BITS, clclen[i])))
725 /* Encode: CLEN code lengths */
726 for (i = 0; i < prefix->mcount; i += 1)
728 usize_t mtf_sym = prefix->mtfsym[i];
729 usize_t bits = clclen[mtf_sym];
730 usize_t code = clcode[mtf_sym];
732 if ((ret = xd3_encode_bits (stream, output, bstate, bits, code)))
/* MTF+1/2 transform the per-sector group selectors.  The initial MTF
 * ordering is the identity permutation over the group indices. */
742 djw_compute_selector_1_2 (djw_prefix *prefix,
744 djw_weight *gbest_freq)
746 uint8_t grmtf[DJW_MAX_GROUPS];
749 for (i = 0; i < groups; i += 1) { grmtf[i] = i; }
751 djw_compute_mtf_1_2 (prefix, grmtf, gbest_freq, groups);
/* Choose the number of Huffman groups and the sector size, either from
 * explicit (validated) configuration or from empirically tuned defaults
 * keyed on data type and input size.  Outputs via ret_groups /
 * ret_sector_size; returns non-zero with stream->msg set on invalid
 * configuration. */
755 xd3_encode_howmany_groups (xd3_stream *stream,
759 usize_t *ret_sector_size)
761 usize_t cfg_groups = 0;
762 usize_t cfg_sector_size = 0;
763 usize_t sugg_groups = 0;
764 usize_t sugg_sector_size = 0;
/* Validate any explicitly configured group count. */
766 if (cfg->ngroups != 0)
768 if (cfg->ngroups > DJW_MAX_GROUPS)
770 stream->msg = "invalid secondary encoder group number";
774 cfg_groups = cfg->ngroups;
/* Validate any explicitly configured sector size: must be a multiple of
 * DJW_SECTORSZ_MULT within [DJW_SECTORSZ_MULT, DJW_SECTORSZ_MAX]. */
777 if (cfg->sector_size != 0)
779 if (cfg->sector_size < DJW_SECTORSZ_MULT ||
780 cfg->sector_size > DJW_SECTORSZ_MAX ||
781 (cfg->sector_size % DJW_SECTORSZ_MULT) != 0)
783 stream->msg = "invalid secondary encoder sector size";
787 cfg_sector_size = cfg->sector_size;
790 if (cfg_groups == 0 || cfg_sector_size == 0)
792 /* These values were found empirically using xdelta3-tune around version
794 switch (cfg->data_type)
797 if (input_size < 1000) { sugg_groups = 1; sugg_sector_size = 0; }
798 else if (input_size < 4000) { sugg_groups = 2; sugg_sector_size = 10; }
799 else if (input_size < 7000) { sugg_groups = 3; sugg_sector_size = 10; }
800 else if (input_size < 10000) { sugg_groups = 4; sugg_sector_size = 10; }
801 else if (input_size < 25000) { sugg_groups = 5; sugg_sector_size = 10; }
802 else if (input_size < 50000) { sugg_groups = 7; sugg_sector_size = 20; }
803 else if (input_size < 100000) { sugg_groups = 8; sugg_sector_size = 30; }
804 else { sugg_groups = 8; sugg_sector_size = 70; }
807 if (input_size < 7000) { sugg_groups = 1; sugg_sector_size = 0; }
808 else if (input_size < 10000) { sugg_groups = 2; sugg_sector_size = 50; }
809 else if (input_size < 25000) { sugg_groups = 3; sugg_sector_size = 50; }
810 else if (input_size < 50000) { sugg_groups = 6; sugg_sector_size = 40; }
811 else if (input_size < 100000) { sugg_groups = 8; sugg_sector_size = 40; }
812 else { sugg_groups = 8; sugg_sector_size = 40; }
815 if (input_size < 9000) { sugg_groups = 1; sugg_sector_size = 0; }
816 else if (input_size < 25000) { sugg_groups = 2; sugg_sector_size = 130; }
817 else if (input_size < 50000) { sugg_groups = 3; sugg_sector_size = 130; }
818 else if (input_size < 100000) { sugg_groups = 5; sugg_sector_size = 130; }
819 else { sugg_groups = 7; sugg_sector_size = 130; }
825 cfg_groups = sugg_groups;
828 if (cfg_sector_size == 0)
830 cfg_sector_size = sugg_sector_size;
/* A configured group count with no sector size: fall back to a per-type
 * default sector size. */
834 if (cfg_groups != 1 && cfg_sector_size == 0)
836 switch (cfg->data_type)
839 cfg_sector_size = 20;
842 cfg_sector_size = 50;
845 cfg_sector_size = 130;
850 (*ret_groups) = cfg_groups;
851 (*ret_sector_size) = cfg_sector_size;
853 XD3_ASSERT (cfg_groups > 0 && cfg_groups <= DJW_MAX_GROUPS);
854 XD3_ASSERT (cfg_groups == 1 ||
855 (cfg_sector_size >= DJW_SECTORSZ_MULT &&
856 cfg_sector_size <= DJW_SECTORSZ_MAX));
/* Main secondary-compression entry point (encoder).  Counts symbol
 * frequencies, picks a group count/sector size, then either encodes with
 * a single Huffman table or iteratively evolves multiple group tables
 * (bzip2-style): assign each sector to its cheapest group, recompute
 * per-group code lengths, repeat up to DJW_MAX_ITER passes or until the
 * improvement drops below DJW_MIN_IMPROVEMENT bits.  Bails out with
 * "inefficient" whenever the projected output exceeds the input minus
 * EFFICIENCY_BITS (unless cfg->inefficient forces completion).
 * NOTE(review): many intermediate source lines are missing from this
 * extract; code left byte-identical. */
862 xd3_encode_huff (xd3_stream *stream,
869 usize_t groups, sector_size;
870 bit_state bstate = BIT_STATE_ENCODE_INIT;
875 usize_t initial_offset = output->next;
876 djw_weight real_freq[ALPHABET_SIZE];
877 uint8_t *gbest = NULL;
878 uint8_t *gbest_mtf = NULL;
880 input_bytes = djw_count_freqs (real_freq, input);
881 input_bits = input_bytes * 8;
883 XD3_ASSERT (input_bytes > 0);
885 if ((ret = xd3_encode_howmany_groups (stream, cfg, input_bytes,
886 & groups, & sector_size)))
894 /* Sometimes we dynamically decide there are too many groups. Arrive
/* Restart point: rewind output and bit state before re-encoding. */
896 output->next = initial_offset;
897 xd3_bit_state_encode_init (& bstate);
900 /* Encode: # of groups (3 bits) */
901 if ((ret = xd3_encode_bits (stream, & output, & bstate,
902 DJW_GROUP_BITS, groups-1))) { goto failure; }
906 /* Single Huffman group. */
907 usize_t code[ALPHABET_SIZE]; /* Codes */
908 uint8_t clen[ALPHABET_SIZE];
909 uint8_t prefix_mtfsym[ALPHABET_SIZE];
913 djw_build_prefix (real_freq, clen, ALPHABET_SIZE, DJW_MAX_CODELEN);
914 djw_build_codes (code, clen, ALPHABET_SIZE, DJW_MAX_CODELEN);
/* Early efficiency check before writing the prefix table. */
916 if (output_bits + EFFICIENCY_BITS >= input_bits && ! cfg->inefficient)
922 prefix.mtfsym = prefix_mtfsym;
923 prefix.symbol = clen;
924 prefix.scount = ALPHABET_SIZE;
926 if ((ret = djw_encode_prefix (stream, & output, & bstate, & prefix)))
/* Second efficiency check, now accounting for the emitted header. */
931 if (output_bits + (8 * output->next) + EFFICIENCY_BITS >=
932 input_bits && ! cfg->inefficient)
/* Emit the input symbols with the single code table. */
938 for (in = input; in; in = in->next_page)
940 const uint8_t *p = in->base;
941 const uint8_t *p_max = p + in->next;
946 usize_t bits = clen[sym];
948 IF_DEBUG (output_bits -= bits);
950 if ((ret = xd3_encode_bits (stream, & output,
951 & bstate, bits, code[sym])))
959 XD3_ASSERT (output_bits == 0);
/* Multi-group path. */
964 djw_weight evolve_freq[DJW_MAX_GROUPS][ALPHABET_SIZE];
965 uint8_t evolve_clen[DJW_MAX_GROUPS][ALPHABET_SIZE];
966 djw_weight left = input_bytes;
970 usize_t sym1 = 0, sym2 = 0, s;
971 usize_t gcost[DJW_MAX_GROUPS];
972 usize_t gbest_code[DJW_MAX_GROUPS+2];
973 uint8_t gbest_clen[DJW_MAX_GROUPS+2];
974 usize_t gbest_max = 1 + (input_bytes - 1) / sector_size;
975 usize_t best_bits = 0;
979 IF_DEBUG2 (usize_t gcount[DJW_MAX_GROUPS]);
981 /* Encode: sector size (5 bits) */
982 if ((ret = xd3_encode_bits (stream, & output, & bstate,
984 (sector_size/DJW_SECTORSZ_MULT)-1)))
989 /* Dynamic allocation. */
992 if ((gbest = (uint8_t*) xd3_alloc (stream, gbest_max, 1)) == NULL)
999 if (gbest_mtf == NULL)
1001 if ((gbest_mtf = (uint8_t*) xd3_alloc (stream, gbest_max, 1)) == NULL)
1008 /* OPT: Some of the inner loops can be optimized, as shown in bzip2 */
1010 /* Generate initial code length tables. */
1011 for (gp = 0; gp < groups; gp += 1)
1014 djw_weight goal = left / (groups - gp);
1016 IF_DEBUG2 (usize_t nz = 0);
1018 /* Due to the single-code granularity of this distribution, it may
1019 * be that we can't generate a distribution for each group. In that
1020 * case subtract one group and try again. If (inefficient), we're
1021 * testing group behavior, so don't mess things up. */
1022 if (goal == 0 && !cfg->inefficient)
1024 IF_DEBUG2 (DP(RINT "too many groups (%u), dropping one\n",
1030 /* Sum == goal is possible when (cfg->inefficient)... */
1033 XD3_ASSERT (sym2 < ALPHABET_SIZE);
1034 IF_DEBUG2 (nz += real_freq[sym2] != 0);
1035 sum += real_freq[sym2++];
1038 IF_DEBUG2(DP(RINT "group %u has symbols %u..%u (%u non-zero) "
1040 gp, sym1, sym2, nz, sum,
1041 input_bytes, sum / (double)input_bytes););
/* Seed each group: short codes for its symbol range, long otherwise. */
1043 for (s = 0; s < ALPHABET_SIZE; s += 1)
1045 evolve_clen[gp][s] = (s >= sym1 && s <= sym2) ? 1 : 16;
/* Start of one evolution pass: reset per-group frequency counts. */
1056 memset (evolve_freq, 0, sizeof (evolve_freq[0]) * groups);
1057 IF_DEBUG2 (memset (gcount, 0, sizeof (gcount[0]) * groups));
1059 /* For each input page (loop is irregular to allow non-pow2-size group
1064 /* For each group-size sector. */
1067 const uint8_t *p0 = p;
1068 xd3_output *in0 = in;
1072 /* Select best group for each sector, update evolve_freq. */
1073 memset (gcost, 0, sizeof (gcost[0]) * groups);
1075 /* For each byte in sector. */
1076 for (gpcnt = 0; gpcnt < sector_size; gpcnt += 1)
1078 /* For each group. */
1079 for (gp = 0; gp < groups; gp += 1)
1081 gcost[gp] += evolve_clen[gp][*p];
1084 /* Check end-of-input-page. */
1085 # define GP_PAGE() \
1086 if ((usize_t)(++p - in->base) == in->next) \
1088 in = in->next_page; \
1089 if (in == NULL) { break; } \
1096 /* Find min cost group for this sector */
1098 for (gp = 0; gp < groups; gp += 1)
1100 if (gcost[gp] < best)
1107 XD3_ASSERT(gbest_no < gbest_max);
1108 gbest[gbest_no++] = winner;
1109 IF_DEBUG2 (gcount[winner] += 1);
1114 /* Update group frequencies. */
1115 for (gpcnt = 0; gpcnt < sector_size; gpcnt += 1)
1117 evolve_freq[winner][*p] += 1;
1124 XD3_ASSERT (gbest_no == gbest_max);
1126 /* Recompute code lengths. */
1128 for (gp = 0; gp < groups; gp += 1)
1131 uint8_t evolve_zero[ALPHABET_SIZE];
1134 memset (evolve_zero, 0, sizeof (evolve_zero));
1136 /* Cannot allow a zero clen when the real frequency is non-zero.
1137 * Note: this means we are going to encode a fairly long code for
1138 * these unused entries. An improvement would be to implement a
1139 * NOTUSED code for when these are actually zero, but this requires
1140 * another data structure (evolve_zero) since we don't know when
1141 * evolve_freq[i] == 0... Briefly tested, looked worse. */
1142 for (i = 0; i < ALPHABET_SIZE; i += 1)
1144 if (evolve_freq[gp][i] == 0 && real_freq[i] != 0)
1146 evolve_freq[gp][i] = 1;
1152 output_bits += djw_build_prefix (evolve_freq[gp], evolve_clen[gp],
1153 ALPHABET_SIZE, DJW_MAX_CODELEN);
1155 /* The above faking of frequencies does not matter for the last
1156 * iteration, but we don't know when that is yet. However, it also
1157 * breaks the output_bits computation. Necessary for accuracy, and
1158 * for the (output_bits==0) assert after all bits are output. */
1161 IF_DEBUG2 (usize_t save_total = output_bits);
1163 for (i = 0; i < ALPHABET_SIZE; i += 1)
1165 if (evolve_zero[i]) { output_bits -= evolve_clen[gp][i]; }
1168 IF_DEBUG2 (DP(RINT "evolve_zero reduced %u bits in group %u\n",
1169 save_total - output_bits, gp));
1174 DP(RINT "pass %u total bits: %u group uses: ", niter, output_bits);
1175 for (gp = 0; gp < groups; gp += 1) { DP(RINT "%u ", gcount[gp]); }
1179 /* End iteration. */
1181 IF_DEBUG2 (if (niter > 1 && best_bits < output_bits) {
1182 DP(RINT "iteration lost %u bits\n", output_bits - best_bits); });
/* Continue iterating only while the improvement is worthwhile. */
1184 if (niter == 1 || (niter < DJW_MAX_ITER &&
1185 (best_bits - output_bits) >= DJW_MIN_IMPROVEMENT))
1187 best_bits = output_bits;
1191 /* Efficiency check. */
1192 if (output_bits + EFFICIENCY_BITS >= input_bits && ! cfg->inefficient)
1197 IF_DEBUG2 (DP(RINT "djw compression: %u -> %0.3f\n",
1198 input_bytes, output_bits / 8.0));
1200 /* Encode: prefix */
1202 uint8_t prefix_symbol[DJW_MAX_GROUPS * ALPHABET_SIZE];
1203 uint8_t prefix_mtfsym[DJW_MAX_GROUPS * ALPHABET_SIZE];
1204 uint8_t prefix_repcnt[DJW_MAX_GROUPS * ALPHABET_SIZE];
1207 prefix.symbol = prefix_symbol;
1208 prefix.mtfsym = prefix_mtfsym;
1209 prefix.repcnt = prefix_repcnt;
1211 djw_compute_multi_prefix (groups, evolve_clen, & prefix);
1212 if ((ret = djw_encode_prefix (stream, & output, & bstate, & prefix)))
1218 /* Encode: selector frequencies */
1220 /* DJW_MAX_GROUPS +2 is for RUN_0, RUN_1 symbols. */
1221 djw_weight gbest_freq[DJW_MAX_GROUPS+2];
1222 djw_prefix gbest_prefix;
1225 gbest_prefix.scount = gbest_no;
1226 gbest_prefix.symbol = gbest;
1227 gbest_prefix.mtfsym = gbest_mtf;
1229 djw_compute_selector_1_2 (& gbest_prefix, groups, gbest_freq);
/* groups+1 symbols: the group indices plus one run code slot. */
1232 djw_build_prefix (gbest_freq, gbest_clen, groups+1, DJW_MAX_GBCLEN);
1233 djw_build_codes (gbest_code, gbest_clen, groups+1, DJW_MAX_GBCLEN);
1235 for (i = 0; i < groups+1; i += 1)
1237 if ((ret = xd3_encode_bits (stream, & output, & bstate,
1238 DJW_GBCLEN_BITS, gbest_clen[i])))
/* Emit the MTF-transformed selector stream. */
1244 for (i = 0; i < gbest_prefix.mcount; i += 1)
1246 usize_t gp_mtf = gbest_mtf[i];
1247 usize_t gp_sel_bits = gbest_clen[gp_mtf];
1248 usize_t gp_sel_code = gbest_code[gp_mtf];
1250 XD3_ASSERT (gp_mtf < groups+1);
1252 if ((ret = xd3_encode_bits (stream, & output, & bstate,
1253 gp_sel_bits, gp_sel_code)))
1258 IF_DEBUG (select_bits -= gp_sel_bits);
1261 XD3_ASSERT (select_bits == 0);
1264 /* Efficiency check. */
1265 if (output_bits + select_bits + (8 * output->next) +
1266 EFFICIENCY_BITS >= input_bits && ! cfg->inefficient)
1273 usize_t evolve_code[DJW_MAX_GROUPS][ALPHABET_SIZE];
1276 /* Build code tables for each group. */
1277 for (gp = 0; gp < groups; gp += 1)
1279 djw_build_codes (evolve_code[gp], evolve_clen[gp],
1280 ALPHABET_SIZE, DJW_MAX_CODELEN);
1283 /* Now loop over the input. */
1289 /* For each sector. */
1290 usize_t gp_best = gbest[sector];
1291 usize_t *gp_codes = evolve_code[gp_best];
1292 uint8_t *gp_clens = evolve_clen[gp_best];
1294 XD3_ASSERT (sector < gbest_no);
1298 /* Encode the sector data. */
1299 for (gpcnt = 0; gpcnt < sector_size; gpcnt += 1)
1302 usize_t bits = gp_clens[sym];
1303 usize_t code = gp_codes[sym];
1305 IF_DEBUG (output_bits -= bits);
1307 if ((ret = xd3_encode_bits (stream, & output, & bstate,
1318 XD3_ASSERT (select_bits == 0);
1319 XD3_ASSERT (output_bits == 0);
/* Flush any partially filled output byte. */
1323 ret = xd3_flush_bits (stream, & output, & bstate);
1328 stream->msg = "secondary compression was inefficient";
/* Common cleanup: free the sector-selector buffers. */
1334 xd3_free (stream, gbest);
1335 xd3_free (stream, gbest_mtf);
1338 #endif /* XD3_ENCODER */
1340 /*********************************************************************/
1342 /*********************************************************************/
/* Build the canonical-prefix decoding tables (Hirschberg/LeLewer style):
 * per-length BASE and LIMIT values, plus INORDER, the symbols in
 * canonical code order.  Outputs min/max code length via min_clenp /
 * max_clenp.  Cannot fail; out-of-range inputs are asserted. */
1345 djw_build_decoder (xd3_stream *stream,
1348 const uint8_t *clen,
1357 usize_t nr_clen [DJW_TOTAL_CODES];
1358 usize_t tmp_base[DJW_TOTAL_CODES];
1362 /* Assumption: the two temporary arrays are large enough to hold abs_max. */
1363 XD3_ASSERT (abs_max <= DJW_MAX_CODELEN);
1365 /* This looks something like the start of zlib's inftrees.c */
1366 memset (nr_clen, 0, sizeof (nr_clen[0]) * (abs_max+1));
1368 /* Count number of each code length */
1373 /* Caller _must_ check that values are in-range. Most of the time the
1374 * caller decodes a specific number of bits, which imply the max value,
1375 * and the other time the caller decodes a huffman value, which must be
1376 * in-range. Therefore, it's an assertion and this function cannot
1377 * otherwise fail. */
1378 XD3_ASSERT (*ci <= abs_max);
1384 /* Compute min, max. */
1385 for (i = 1; i <= abs_max; i += 1) { if (nr_clen[i]) { break; } }
1387 for (i = abs_max; i != 0; i -= 1) { if (nr_clen[i]) { break; } }
1390 /* Fill the BASE, LIMIT table. */
1391 tmp_base[min_clen] = 0;
1393 limit[min_clen] = nr_clen[min_clen] - 1;
1394 for (i = min_clen + 1; i <= max_clen; i += 1)
1396 usize_t last_limit = ((limit[i-1] + 1) << 1);
1397 tmp_base[i] = tmp_base[i-1] + nr_clen[i-1];
1398 limit[i] = last_limit + nr_clen[i] - 1;
1399 base[i] = last_limit - tmp_base[i];
1402 /* Fill the inorder array, canonically ordered codes. */
1404 for (i = 0; i < asize; i += 1)
1406 if ((l = *ci++) != 0)
1408 inorder[tmp_base[l]++] = i;
1412 *min_clenp = min_clen;
1413 *max_clenp = max_clen;
/* Decode one prefix-coded symbol bit-by-bit using the base/limit/inorder
 * tables.  Returns XD3_INTERNAL with stream->msg set on truncated input
 * or a code value outside the table. */
1417 djw_decode_symbol (xd3_stream *stream,
1419 const uint8_t **input,
1420 const uint8_t *input_end,
1421 const uint8_t *inorder,
1422 const usize_t *base,
1423 const usize_t *limit,
1432 /* OPT: Supposedly a small lookup table improves speed here... */
1434 /* Code outline is similar to xd3_decode_bits... */
1435 if (bstate->cur_mask == 0x100) { goto next_byte; }
/* A code longer than max_clen cannot exist: corrupt input. */
1441 if (bits == max_clen) { goto corrupt; }
/* Shift in the next bit, LSB-first within the current byte. */
1446 if (bstate->cur_byte & bstate->cur_mask) { code |= 1; }
1448 bstate->cur_mask <<= 1;
/* A complete code is recognized once code <= limit[bits]. */
1450 if (bits >= min_clen && code <= limit[bits]) { goto done; }
1452 while (bstate->cur_mask != 0x100);
/* next_byte: refill the bit buffer from the input stream. */
1456 if (*input == input_end)
1458 stream->msg = "secondary decoder end of input";
1459 return XD3_INTERNAL;
1462 bstate->cur_byte = *(*input)++;
1463 bstate->cur_mask = 1;
/* done: map the canonical code value to its symbol via inorder[]. */
1468 if (base[bits] <= code)
1470 usize_t offset = code - base[bits];
1472 if (offset <= max_sym)
1474 IF_DEBUG2 (DP(RINT "(j) %u ", code));
1475 *sym = inorder[offset];
1481 stream->msg = "secondary decoder invalid code";
1482 return XD3_INTERNAL;
/* Read the code-length-code table from the input (count of extra codes,
 * then DJW_CLCLEN_BITS per length), build the corresponding decoder
 * tables, and initialize the CLEN MTF state -- the decoder-side mirror
 * of djw_encode_prefix's header. */
1486 djw_decode_clclen (xd3_stream *stream,
1488 const uint8_t **input,
1489 const uint8_t *input_end,
1490 uint8_t *cl_inorder,
1498 uint8_t cl_clen[DJW_TOTAL_CODES];
1499 usize_t num_codes, value;
1502 /* How many extra code lengths to encode. */
1503 if ((ret = xd3_decode_bits (stream, bstate, input,
1504 input_end, DJW_EXTRA_CODE_BITS, & num_codes)))
1509 num_codes += DJW_EXTRA_12OFFSET;
1511 /* Read num_codes. */
1512 for (i = 0; i < num_codes; i += 1)
1514 if ((ret = xd3_decode_bits (stream, bstate, input,
1515 input_end, DJW_CLCLEN_BITS, & value)))
1523 /* Set the rest to zero. */
1524 for (; i < DJW_TOTAL_CODES; i += 1) { cl_clen[i] = 0; }
1526 /* No need to check for in-range clen values, because: */
1527 XD3_ASSERT (1 << DJW_CLCLEN_BITS == DJW_MAX_CLCLEN + 1);
1529 /* Build the code-length decoder. */
1530 djw_build_decoder (stream, DJW_TOTAL_CODES, DJW_MAX_CLCLEN,
1531 cl_clen, cl_inorder, cl_base,
1532 cl_limit, cl_minlen, cl_maxlen);
1534 /* Initialize the MTF state. */
1535 djw_init_clen_mtf_1_2 (cl_mtf);
/* Decode an MTF+1/2-transformed symbol stream into values[]: RUN_0/RUN_1
 * codes accumulate a repeat count of the front-of-MTF symbol; other
 * codes select an MTF rank (offset by RUN_1).  Returns XD3_INTERNAL if
 * the run decoding overshoots the expected count. */
1541 djw_decode_1_2 (xd3_stream *stream,
1543 const uint8_t **input,
1544 const uint8_t *input_end,
1545 const uint8_t *inorder,
1546 const usize_t *base,
1547 const usize_t *limit,
1548 const usize_t *minlen,
1549 const usize_t *maxlen,
1552 usize_t skip_offset,
1555 usize_t n = 0, rep = 0, mtf = 0, s = 0;
1560 /* Special case inside generic code: CLEN only: If not the first group,
1561 * we already know the zero frequencies. */
1562 if (skip_offset != 0 && n >= skip_offset && values[n-skip_offset] == 0)
1568 /* Repeat last symbol. */
1571 values[n++] = mtfvals[0];
1576 /* Symbol following last repeat code. */
1579 usize_t sym = djw_update_mtf (mtfvals, mtf);
1585 /* Decode next symbol/repeat code. */
1586 if ((ret = djw_decode_symbol (stream, bstate, input, input_end,
1587 inorder, base, limit, *minlen, *maxlen,
1588 & mtf, DJW_TOTAL_CODES))) { return ret; }
/* RUN codes extend the repeat count bitwise (1-2 coding): s tracks the
 * current bit position. */
1593 rep = ((mtf + 1) << s);
1599 /* Remove the RUN_1 MTF offset. */
1605 /* If (rep != 0) there were too many codes received. */
1608 stream->msg = "secondary decoder invalid repeat code";
1609 return XD3_INTERNAL;
/* djw_decode_prefix: decodes the concatenated code-length (clen) arrays for
 * all `groups` symbol tables in one call, using the code-length decoder
 * tables produced by djw_decode_clclen.  Thin wrapper over djw_decode_1_2:
 * it asks for ALPHABET_SIZE * groups values into the flat clen[] storage,
 * passing skip_offset = ALPHABET_SIZE so that symbols known to be zero in
 * the previous group are skipped (see djw_decode_1_2).  Returns the result
 * of djw_decode_1_2 (0 on success, nonzero xd3 error otherwise).
 * NOTE(review): the return-type line and trailing parameters are elided in
 * this excerpt. */
1616 djw_decode_prefix (xd3_stream *stream,
1618 const uint8_t **input,
1619 const uint8_t *input_end,
1620 const uint8_t *cl_inorder,
1621 const usize_t *cl_base,
1622 const usize_t *cl_limit,
1623 const usize_t *cl_minlen,
1624 const usize_t *cl_maxlen,
1629 return djw_decode_1_2 (stream, bstate, input, input_end,
1630 cl_inorder, cl_base, cl_limit,
1631 cl_minlen, cl_maxlen, cl_mtf,
1632 ALPHABET_SIZE * groups, ALPHABET_SIZE, clen);
/* xd3_decode_huff: top-level DJW secondary decoder.  Decompresses the region
 * [*input_pos, input_end) into [*output_pos, output_end), advancing both
 * positions on success.  Stream layout, as read below: group count, sector
 * size (when groups > 1), the code-length code (djw_decode_clclen), per-group
 * symbol code lengths (djw_decode_prefix), the per-sector group selectors
 * (MTF + 1-2 coded), then the symbol payload, one sector at a time using the
 * sector's selected group table.  Returns 0 on success or a nonzero xd3
 * error (XD3_INTERNAL for corrupt input, allocation failure for sel_group).
 * NOTE(review): closing brace / final return and several interior lines are
 * elided in this excerpt. */
1636 xd3_decode_huff (xd3_stream *stream,
1638 const uint8_t **input_pos,
1639 const uint8_t *const input_end,
1640 uint8_t **output_pos,
1641 const uint8_t *const output_end)
1643 const uint8_t *input = *input_pos;
1644 uint8_t *output = *output_pos;
1645 bit_state bstate = BIT_STATE_DECODE_INIT;
1646 uint8_t *sel_group = NULL;
1648 usize_t output_bytes = (usize_t)(output_end - output);
1649 usize_t sector_size;
1653 /* Invalid input. */
1654 if (output_bytes == 0)
1656 stream->msg = "secondary decoder invalid input";
1657 return XD3_INTERNAL;
1660 /* Decode: number of groups */
1661 if ((ret = xd3_decode_bits (stream, & bstate, & input,
1662 input_end, DJW_GROUP_BITS, & groups)))
1671 /* Decode: group size */
1672 if ((ret = xd3_decode_bits (stream, & bstate, & input,
1673 input_end, DJW_SECTORSZ_BITS,
1674 & sector_size))) { goto fail; }
/* Sector size is transmitted as a biased multiplier of DJW_SECTORSZ_MULT. */
1676 sector_size = (sector_size + 1) * DJW_SECTORSZ_MULT;
1680 /* Default for groups == 1 */
/* With a single group there is no per-sector selection, so the whole output
 * is treated as one sector. */
1681 sector_size = output_bytes;
/* Ceiling division; output_bytes > 0 was checked above. */
1684 sectors = 1 + (output_bytes - 1) / sector_size;
1686 /* TODO: In the case of groups==1, lots of extra stack space gets used here.
1687 * Could dynamically allocate this memory, which would help with excess
1688 * parameter passing, too. Passing too many parameters in this file,
1691 /* Outer scope: per-group symbol decoder tables. */
1693 uint8_t inorder[DJW_MAX_GROUPS][ALPHABET_SIZE];
1694 usize_t base [DJW_MAX_GROUPS][DJW_TOTAL_CODES];
1695 usize_t limit [DJW_MAX_GROUPS][DJW_TOTAL_CODES];
1696 usize_t minlen [DJW_MAX_GROUPS];
1697 usize_t maxlen [DJW_MAX_GROUPS];
1699 /* Nested scope: code length decoder tables. */
1701 uint8_t clen [DJW_MAX_GROUPS][ALPHABET_SIZE];
1702 uint8_t cl_inorder[DJW_TOTAL_CODES];
1703 usize_t cl_base [DJW_MAX_CLCLEN+2];
1704 usize_t cl_limit [DJW_MAX_CLCLEN+2];
1705 uint8_t cl_mtf [DJW_TOTAL_CODES];
1709 /* Compute the code length decoder. */
1710 if ((ret = djw_decode_clclen (stream, & bstate, & input, input_end,
1711 cl_inorder, cl_base, cl_limit, & cl_minlen,
1712 & cl_maxlen, cl_mtf))) { goto fail; }
1714 /* Now decode each group decoder. */
1715 if ((ret = djw_decode_prefix (stream, & bstate, & input, input_end,
1716 cl_inorder, cl_base, cl_limit,
1717 & cl_minlen, & cl_maxlen, cl_mtf,
1718 groups, clen[0]))) { goto fail; }
1720 /* Prepare the actual decoding tables. */
1721 for (gp = 0; gp < groups; gp += 1)
1723 djw_build_decoder (stream, ALPHABET_SIZE, DJW_MAX_CODELEN,
1724 clen[gp], inorder[gp], base[gp], limit[gp],
1725 & minlen[gp], & maxlen[gp]);
1729 /* Decode: selector clens. */
1731 uint8_t sel_inorder[DJW_MAX_GROUPS+2];
1732 usize_t sel_base [DJW_MAX_GBCLEN+2];
1733 usize_t sel_limit [DJW_MAX_GBCLEN+2];
1734 uint8_t sel_mtf [DJW_MAX_GROUPS+2];
1738 /* Setup group selection. */
/* groups+1 selector symbols: the extra symbol is the run/escape code used by
 * the 1-2 coding of the selector sequence. */
1741 uint8_t sel_clen[DJW_MAX_GROUPS+1];
1743 for (gp = 0; gp < groups+1; gp += 1)
1747 if ((ret = xd3_decode_bits (stream, & bstate, & input,
1748 input_end, DJW_GBCLEN_BITS,
1749 & value))) { goto fail; }
1751 sel_clen[gp] = value;
/* One selector byte per sector; freed near the end of the function. */
1755 if ((sel_group = (uint8_t*) xd3_alloc (stream, sectors, 1)) == NULL)
1761 djw_build_decoder (stream, groups+1, DJW_MAX_GBCLEN, sel_clen,
1762 sel_inorder, sel_base, sel_limit,
1763 & sel_minlen, & sel_maxlen);
/* skip_offset == 0: selectors have no known-zero prediction. */
1765 if ((ret = djw_decode_1_2 (stream, & bstate, & input, input_end,
1766 sel_inorder, sel_base,
1767 sel_limit, & sel_minlen,
1768 & sel_maxlen, sel_mtf,
1769 sectors, 0, sel_group))) { goto fail; }
1772 /* Now decode each sector. */
1774 /* Initialize for (groups==1) case. */
1775 uint8_t *gp_inorder = inorder[0];
1776 usize_t *gp_base = base[0];
1777 usize_t *gp_limit = limit[0];
1778 usize_t gp_minlen = minlen[0];
1779 usize_t gp_maxlen = maxlen[0];
1782 for (c = 0; c < sectors; c += 1)
/* When groups > 1, gp is the decoded selector for this sector and the
 * per-group tables are re-pointed here (elided lines include the gp_base
 * reassignment). */
1790 XD3_ASSERT (gp < groups);
1792 gp_inorder = inorder[gp];
1794 gp_limit = limit[gp];
1795 gp_minlen = minlen[gp];
1796 gp_maxlen = maxlen[gp];
1799 XD3_ASSERT (output_end - output > 0);
1801 /* Decode next sector. */
/* The final sector may be shorter than sector_size. */
1802 n = min (sector_size, (usize_t) (output_end - output));
1808 if ((ret = djw_decode_symbol (stream, & bstate,
1810 gp_inorder, gp_base,
1811 gp_limit, gp_minlen, gp_maxlen,
1812 & sym, ALPHABET_SIZE)))
/* Regression builds verify the bit stream was consumed to a clean byte
 * boundary before finishing. */
1825 IF_REGRESSION (if ((ret = xd3_test_clean_bits (stream, & bstate)))
1827 XD3_ASSERT (ret == 0);
/* Shared success/fail exit path: release the selector array (xd3_free on
 * NULL is safe when allocation never happened) and publish positions. */
1830 xd3_free (stream, sel_group);
1832 (*input_pos) = input;
1833 (*output_pos) = output;