2 * Copyright (c) 2007, Novell Inc.
4 * This program is licensed under the BSD license, read LICENSE.BSD
5 * for further information
15 #define BLOCK_SIZE (65536*1)
16 #if BLOCK_SIZE <= 65536
17 typedef __uint16_t Ref;
19 typedef __uint32_t Ref;
23 The format is tailored for fast decompression (i.e. only byte based),
24 and skewed to ASCII content (highest bit often not set):
27 - self-describing ASCII character hex L
28 b 100lllll <l+1 bytes>
29 - literal run of length l+1
31 - back ref of length l+2, at offset -(o+1) (o < 1 << 10)
33 - back ref of length l+2+8, at offset -(o+1) (o < 1 << 8)
35 - back ref of length l+3, at offset -(o+1) (o < 1 << 16)
36 f1 1111llll <8l> <8o> <8o>
37 - back ref, length l+19 (l < 1<<12), offset -(o+1) (o < 1<<16)
38 f2 11110lll <8l> <8o> <8o>
39 - back ref, length l+19 (l < 1<<11), offset -(o+1) (o < 1<<16)
40 g 11111lll <8l> <8o> <8o> <8o>
41 - back ref, length l+5 (l < 1<<11), offset -(o+1) (o < 1<<24)
43 Generally for a literal of length L we need L+1 bytes, hence it is
44 better to encode also very short backrefs (2 chars) as backrefs if
45 their offset is small, as that only needs two bytes. Except if we
46 already have a literal run, in that case it's better to append there,
47 instead of breaking it for a backref. So given a potential backref
48 at offset O, length L the strategy is as follows:
50 L < 2 : encode as 1-literal
51 L == 2, O > 1024 : encode as 1-literal
52 L == 2, have already literals: encode as 1-literal
54 L >= 2, L <= 9, O < 1024 : encode as c
55 L >= 10, L <= 41, O < 256 : encode as d
56 else we have either O >= 1024, or L >= 42:
57 L < 3 : encode as 1-literal
58 L >= 3, L <= 18, O < 65536 : encode as e
59 L >= 19, L <= 4095+18, O < 65536 : encode as f
60 else we have either L >= 4096+18 or O >= 65536.
61 O >= 65536: encode as 1-literal, too bad
62 (with the current block size this can't happen)
63 L >= 4096+18, so reduce to 4095+18 : encode as f
68 compress_buf (const unsigned char *in, unsigned int in_len,
69 unsigned char *out, unsigned int out_len)
71 unsigned int oo = 0; //out-offset
72 unsigned int io = 0; //in-offset
75 Ref hnext[BLOCK_SIZE];
76 memset (htab, -1, sizeof (htab));
77 memset (hnext, -1, sizeof (hnext));
78 unsigned int litofs = 0;
79 while (io + 2 < in_len)
81 /* Search for a match of the string starting at IN, we have at
82 least three characters. */
83 unsigned int hval = in[io] | in[io + 1] << 8 | in[io + 2] << 16;
84 unsigned int try, mlen, mofs, tries;
85 hval = (hval ^ (hval << 5) ^ (hval >> 5)) - hval * 5;
86 hval = hval & (HS - 1);
88 hnext[io] = htab[hval];
93 for (tries = 0; try != -1 && tries < 12; tries++)
96 && in[try] == in[io] && in[try + 1] == in[io + 1])
99 mofs = (io - try) - 1;
104 for (; try != -1 && tries < 12; tries++)
106 //assert (mlen >= 2);
107 //assert (io + mlen < in_len);
108 /* Try a match starting from [io] with the strings at [try].
109 That's only sensible if TRY actually is before IO (can happen
110 with uninit hash table). If we have a previous match already
111 we're only going to take the new one if it's longer, hence
112 check the potentially last character. */
113 if (try < io && in[try + mlen] == in[io + mlen])
115 unsigned int this_len, this_ofs;
116 if (memcmp (in + try, in + io, mlen))
119 /* Now try extending the match by more characters. */
121 io + this_len < in_len
122 && in[try + this_len] == in[io + this_len]; this_len++)
126 for (testi = 0; testi < this_len; testi++)
127 assert (in[try + testi] == in[io + testi]);
129 this_ofs = (io - try) - 1;
130 /*if (this_ofs > 65535)
133 assert (this_len >= 2);
134 assert (this_len >= mlen);
135 assert (this_len > mlen || (this_len == mlen && this_ofs > mofs));
137 mlen = this_len, mofs = this_ofs;
138 /* If our match extends up to the end of input, no next
139 match can become better. This is not just an
140 optimization, it establishes a loop invariant
141 (io + mlen < in_len). */
142 if (io + mlen >= in_len)
147 /*if (io - try - 1 >= 65536)
154 //fprintf (stderr, "%d %d\n", mlen, mofs);
155 if (mlen == 2 && (litofs || mofs >= 1024))
157 /*else if (mofs >= 65536)
159 else if (mofs >= 65536)
161 if (mlen >= 2048 + 5)
168 /*else if (mlen >= 4096 + 19)
170 else if (mlen >= 2048 + 19)
172 /* Skip this match if the next character would deliver a better one,
173 but only do this if we have the chance to really extend the
174 length (i.e. our current length isn't yet the (conservative)
176 if (mlen && mlen < (2048 + 5) && io + 3 < in_len)
179 in[io + 1] | in[io + 2] << 8 | in[io + 3] << 16;
181 hval = (hval ^ (hval << 5) ^ (hval >> 5)) - hval * 5;
182 hval = hval & (HS - 1);
185 && in[try] == in[io + 1] && in[try + 1] == in[io + 2])
187 unsigned int this_len;
190 io + 1 + this_len < in_len
191 && in[try + this_len] == in[io + 1 + this_len];
194 if (this_len >= mlen)
210 unsigned litlen = io - litofs;
211 //fprintf (stderr, "lit: %d\n", litlen);
214 unsigned int easy_sz;
215 /* Emit everything we can as self-describers. As soon as
216 we hit a byte we can't emit as such we're going to emit
217 a length descriptor anyway, so we can as well include
218 bytes < 0x80 which might follow afterwards in that run. */
220 easy_sz < litlen && in[litofs + easy_sz] < 0x80;
225 if (oo + easy_sz >= out_len)
227 memcpy (out + oo, in + litofs, easy_sz);
236 if (oo + 1 + litlen >= out_len)
238 out[oo++] = 0x80 | (litlen - 1);
240 out[oo++] = in[litofs++];
245 /* Literal length > 32, so chunk it. */
246 if (oo + 1 + 32 >= out_len)
248 out[oo++] = 0x80 | 31;
249 memcpy (out + oo, in + litofs, 32);
258 //fprintf (stderr, "ref: %d @ %d\n", mlen, mofs);
260 if (mlen >= 2 && mlen <= 9 && mofs < 1024)
262 if (oo + 2 >= out_len)
264 out[oo++] = 0xa0 | ((mofs & 0x300) >> 5) | (mlen - 2);
265 out[oo++] = mofs & 0xff;
267 else if (mlen >= 10 && mlen <= 41 && mofs < 256)
269 if (oo + 2 >= out_len)
271 out[oo++] = 0xc0 | (mlen - 10);
274 else if (mofs >= 65536)
276 assert (mlen >= 5 && mlen < 2048 + 5);
277 if (oo + 5 >= out_len)
279 out[oo++] = 0xf8 | ((mlen - 5) >> 8);
280 out[oo++] = (mlen - 5) & 0xff;
281 out[oo++] = mofs & 0xff;
282 out[oo++] = (mofs >> 8) & 0xff;
283 out[oo++] = mofs >> 16;
285 else if (mlen >= 3 && mlen <= 18)
287 assert (mofs < 65536);
288 if (oo + 3 >= out_len)
290 out[oo++] = 0xe0 | (mlen - 3);
291 out[oo++] = mofs & 0xff;
292 out[oo++] = mofs >> 8;
296 assert (mlen >= 19 && mlen <= 4095 + 19 && mofs < 65536);
297 if (oo + 4 >= out_len)
299 out[oo++] = 0xf0 | ((mlen - 19) >> 8);
300 out[oo++] = (mlen - 19) & 0xff;
301 out[oo++] = mofs & 0xff;
302 out[oo++] = mofs >> 8;
304 /* Insert the hashes for the compressed run [io..io+mlen-1].
305 For [io] we have it already done at the start of the loop.
306 So it's from [io+1..io+mlen-1], and we need three chars per
307 hash, so the accessed characters will be [io+1..io+mlen-1+2],
308 ergo io+mlen+1 < in_len. */
316 in[io] | in[io + 1] << 8 | in[io + 2] << 16;
317 hval = (hval ^ (hval << 5) ^ (hval >> 5)) - hval * 5;
318 hval = hval & (HS - 1);
319 hnext[io] = htab[hval];
326 /* We might have some characters left. */
327 if (io < in_len && !litofs)
333 unsigned litlen = io - litofs;
334 //fprintf (stderr, "lit: %d\n", litlen);
337 unsigned int easy_sz;
338 /* Emit everything we can as self-describers. As soon as we hit a
339 byte we can't emit as such we're going to emit a length
340 descriptor anyway, so we can as well include bytes < 0x80 which
341 might follow afterwards in that run. */
342 for (easy_sz = 0; easy_sz < litlen && in[litofs + easy_sz] < 0x80;
347 if (oo + easy_sz >= out_len)
349 memcpy (out + oo, in + litofs, easy_sz);
358 if (oo + 1 + litlen >= out_len)
360 out[oo++] = 0x80 | (litlen - 1);
362 out[oo++] = in[litofs++];
367 /* Literal length > 32, so chunk it. */
368 if (oo + 1 + 32 >= out_len)
370 out[oo++] = 0x80 | 31;
371 memcpy (out + oo, in + litofs, 32);
383 unchecked_decompress_buf (const unsigned char *in, unsigned int in_len,
385 unsigned int out_len __attribute__((unused)))
387 unsigned char *orig_out = out;
388 const unsigned char *in_end = in + in_len;
391 unsigned int first = *in++;
396 /* This default case can't happen, but GCCs VRP is not strong
397 enough to see this, so make this explicitely not fall to
398 the end of the switch, so that we don't have to initialize
406 //fprintf (stderr, "lit: 1\n");
410 //b 100lllll <l+1 bytes>
412 unsigned int l = first & 31;
413 //fprintf (stderr, "lit: %d\n", l);
422 o = first & (3 << 3);
423 o = (o << 5) | *in++;
424 first = (first & 7) + 2;
431 first = (first & 31) + 10;
435 // e 1110llll <8o> <8o>
437 o = in[0] | (in[1] << 8);
444 //f1 1111llll <8o> <8o> <8l>
445 //f2 11110lll <8o> <8o> <8l>
446 // g 11111lll <8o> <8o> <8o> <8l>
451 first = (((first - 8) << 8) | in[0]) + 5;
452 o = in[1] | (in[2] << 8) | (in[3] << 16);
457 first = ((first << 8) | in[0]) + 19;
458 o = in[1] | (in[2] << 8);
464 //fprintf (stderr, "ref: %d @ %d\n", first, o);
468 /* We know that first will not be zero, and this loop structure is
469 better optimizable. */
479 case 18: *out = *(out + o); out++;
480 case 17: *out = *(out + o); out++;
481 case 16: *out = *(out + o); out++;
482 case 15: *out = *(out + o); out++;
483 case 14: *out = *(out + o); out++;
484 case 13: *out = *(out + o); out++;
485 case 12: *out = *(out + o); out++;
486 case 11: *out = *(out + o); out++;
487 case 10: *out = *(out + o); out++;
488 case 9: *out = *(out + o); out++;
489 case 8: *out = *(out + o); out++;
490 case 7: *out = *(out + o); out++;
491 case 6: *out = *(out + o); out++;
492 case 5: *out = *(out + o); out++;
493 case 4: *out = *(out + o); out++;
494 case 3: *out = *(out + o); out++;
495 case 2: *out = *(out + o); out++;
496 case 1: *out = *(out + o); out++;
504 case 0: *out = *(out + o); out++;
505 case 15: *out = *(out + o); out++;
506 case 14: *out = *(out + o); out++;
507 case 13: *out = *(out + o); out++;
508 case 12: *out = *(out + o); out++;
509 case 11: *out = *(out + o); out++;
510 case 10: *out = *(out + o); out++;
511 case 9: *out = *(out + o); out++;
512 case 8: *out = *(out + o); out++;
513 case 7: *out = *(out + o); out++;
514 case 6: *out = *(out + o); out++;
515 case 5: *out = *(out + o); out++;
516 case 4: *out = *(out + o); out++;
517 case 3: *out = *(out + o); out++;
518 case 2: *out = *(out + o); out++;
519 case 1: *out = *(out + o); out++;
521 while ((int)(first -= 16) > 0);
527 return out - orig_out;
533 transfer_file (FILE * from, FILE * to, int compress)
535 unsigned char inb[BLOCK_SIZE];
536 unsigned char outb[BLOCK_SIZE];
537 while (!feof (from) && !ferror (from))
539 unsigned int in_len, out_len;
542 in_len = fread (inb, 1, BLOCK_SIZE, from);
545 unsigned char *b = outb;
546 out_len = compress_buf (inb, in_len, outb, sizeof (outb));
548 b = inb, out_len = in_len;
549 if (fwrite (&out_len, sizeof (out_len), 1, to) != 1)
551 perror ("write size");
554 if (fwrite (b, out_len, 1, to) != 1)
556 perror ("write data");
563 if (fread (&in_len, sizeof (in_len), 1, from) != 1)
567 perror ("can't read size");
570 if (fread (inb, in_len, 1, from) != 1)
572 perror ("can't read data");
576 unchecked_decompress_buf (inb, in_len, outb, sizeof (outb));
577 if (fwrite (outb, out_len, 1, to) != 1)
579 perror ("can't write output");
586 /* Just for benchmarking purposes. */
588 dumb_memcpy (void *dest, const void *src, unsigned int len)
597 benchmark (FILE * from)
599 unsigned char inb[BLOCK_SIZE];
600 unsigned char outb[BLOCK_SIZE];
601 unsigned int in_len = fread (inb, 1, BLOCK_SIZE, from);
602 unsigned int out_len;
605 perror ("can't read from input");
609 unsigned int calib_loop;
610 unsigned int per_loop;
619 while ((clock () - start) < CLOCKS_PER_SEC / 4)
622 for (i = 0; i < calib_loop; i++)
623 dumb_memcpy (outb, inb, in_len);
624 per_loop += calib_loop;
627 fprintf (stderr, "memcpy:\nCalibrated to %d iterations per loop\n",
631 for (i = 0; i < 10; i++)
632 for (j = 0; j < per_loop; j++)
633 dumb_memcpy (outb, inb, in_len);
635 seconds = (end - start) / (float) CLOCKS_PER_SEC;
636 fprintf (stderr, "%.2f seconds == %.2f MB/s\n", seconds,
637 ((long long) in_len * per_loop * 10) / (1024 * 1024 * seconds));
643 while ((clock () - start) < CLOCKS_PER_SEC / 4)
646 for (i = 0; i < calib_loop; i++)
647 compress_buf (inb, in_len, outb, sizeof (outb));
648 per_loop += calib_loop;
651 fprintf (stderr, "compression:\nCalibrated to %d iterations per loop\n",
655 for (i = 0; i < 10; i++)
656 for (j = 0; j < per_loop; j++)
657 compress_buf (inb, in_len, outb, sizeof (outb));
659 seconds = (end - start) / (float) CLOCKS_PER_SEC;
660 fprintf (stderr, "%.2f seconds == %.2f MB/s\n", seconds,
661 ((long long) in_len * per_loop * 10) / (1024 * 1024 * seconds));
663 out_len = compress_buf (inb, in_len, outb, sizeof (outb));
668 while ((clock () - start) < CLOCKS_PER_SEC / 4)
671 for (i = 0; i < calib_loop; i++)
672 unchecked_decompress_buf (outb, out_len, inb, sizeof (inb));
673 per_loop += calib_loop;
676 fprintf (stderr, "decompression:\nCalibrated to %d iterations per loop\n",
680 for (i = 0; i < 10; i++)
681 for (j = 0; j < per_loop; j++)
682 unchecked_decompress_buf (outb, out_len, inb, sizeof (inb));
684 seconds = (end - start) / (float) CLOCKS_PER_SEC;
685 fprintf (stderr, "%.2f seconds == %.2f MB/s\n", seconds,
686 ((long long) in_len * per_loop * 10) / (1024 * 1024 * seconds));
690 main (int argc, char *argv[])
693 if (argc > 1 && !strcmp (argv[1], "-d"))
695 if (argc > 1 && !strcmp (argv[1], "-b"))
698 transfer_file (stdin, stdout, compress);