Add my old micro-bunzip library. Needs some cleanup...
authorRob Landley <rob@landley.net>
Sat, 25 Nov 2006 21:50:00 +0000 (16:50 -0500)
committerRob Landley <rob@landley.net>
Sat, 25 Nov 2006 21:50:00 +0000 (16:50 -0500)
lib/bunzip.c [new file with mode: 0644]
toys.h

diff --git a/lib/bunzip.c b/lib/bunzip.c
new file mode 100644 (file)
index 0000000..c6b61ec
--- /dev/null
@@ -0,0 +1,502 @@
+/* vi: set sw=4 ts=4: */
+/* micro-bunzip, a small, simple bzip2 decompression implementation.
+
+   Copyright 2003, 2006 by Rob Landley (rob@landley.net).
+
+   Based on bzip2 decompression code by Julian R Seward (jseward@acm.org),
+   which also acknowledges contributions by Mike Burrows, David Wheeler, Peter
+   Fenwick, Alistair Moffat, Radford Neal, Ian H. Witten, Robert Sedgewick, and
+   Jon L. Bentley.  (Didn't actually use any of their code, though.)
+*/
+
+#include "toys.h"
+
+/* Constants for huffman coding */
+#define MAX_GROUPS                     6
+#define GROUP_SIZE             50              /* 64 would have been more efficient */
+#define MAX_HUFCODE_BITS       20              /* Longest huffman code allowed */
+#define MAX_SYMBOLS            258             /* 256 literals + RUNA + RUNB */
+#define SYMBOL_RUNA                    0
+#define SYMBOL_RUNB                    1
+
+/* Status return values */
+#define RETVAL_OK                                              0
+#define RETVAL_LAST_BLOCK                              (-1)
+#define RETVAL_NOT_BZIP_DATA                   (-2)
+#define RETVAL_UNEXPECTED_INPUT_EOF            (-3)
+#define RETVAL_UNEXPECTED_OUTPUT_EOF   (-4)
+#define RETVAL_DATA_ERROR                              (-5)
+#define RETVAL_OUT_OF_MEMORY                   (-6)
+#define RETVAL_OBSOLETE_INPUT                  (-7)
+
+/* Other housekeeping constants */
+#define IOBUF_SIZE                     4096
+
+char *bunzip_errors[]={NULL,"Bad file checksum","Not bzip data",
+               "Unexpected input EOF","Unexpected output EOF","Data error",
+                "Out of memory","Obsolete (pre 0.9.5) bzip format not supported."};
+
+/* This is what we know about each huffman coding group */
+struct group_data {
+       int limit[MAX_HUFCODE_BITS],base[MAX_HUFCODE_BITS],permute[MAX_SYMBOLS];
+       char minLen, maxLen;
+};
+
+/* Structure holding all the housekeeping data, including IO buffers and
+   memory that persists between calls to bunzip */
+typedef struct {
+       /* For I/O error handling */
+       jmp_buf jmpbuf;
+       /* Input stream, input buffer, input bit buffer */
+       int in_fd,inbufCount,inbufPos;
+       unsigned char *inbuf;
+       unsigned int inbufBitCount, inbufBits;
+       /* Output buffer */
+       char outbuf[IOBUF_SIZE];
+       int outbufPos;
+       /* The CRC values stored in the block header and calculated from the data */
+       unsigned int crc32Table[256],headerCRC, dataCRC, totalCRC;
+       /* Intermediate buffer and its size (in bytes) */
+       unsigned int *dbuf, dbufSize;
+       /* State for interrupting output loop */
+       int writePos,writeRun,writeCount,writeCurrent;
+
+       /* These things are a bit too big to go on the stack */
+       unsigned char selectors[32768];                 /* nSelectors=15 bits */
+       struct group_data groups[MAX_GROUPS];   /* huffman coding tables */
+} bunzip_data;
+
+/* Return the next nnn bits of input.  All reads from the compressed input
+   are done through this function.  All reads are big endian */
+static unsigned int get_bits(bunzip_data *bd, char bits_wanted)
+{
+       unsigned int bits=0;
+
+       /* If we need to get more data from the byte buffer, do so.  (Loop getting
+          one byte at a time to enforce endianness and avoid unaligned access.) */
+       while (bd->inbufBitCount<bits_wanted) {
+               /* If we need to read more data from file into byte buffer, do so */
+               if(bd->inbufPos==bd->inbufCount) {
+                       if(!(bd->inbufCount = read(bd->in_fd, bd->inbuf, IOBUF_SIZE)))
+                               longjmp(bd->jmpbuf,RETVAL_UNEXPECTED_INPUT_EOF);
+                       bd->inbufPos=0;
+               }
+               /* Avoid 32-bit overflow (dump bit buffer to top of output) */
+               if(bd->inbufBitCount>=24) {
+                       bits=bd->inbufBits&((1<<bd->inbufBitCount)-1);
+                       bits_wanted-=bd->inbufBitCount;
+                       bits<<=bits_wanted;
+                       bd->inbufBitCount=0;
+               }
+               /* Grab next 8 bits of input from buffer. */
+               bd->inbufBits=(bd->inbufBits<<8)|bd->inbuf[bd->inbufPos++];
+               bd->inbufBitCount+=8;
+       }
+       /* Calculate result */
+       bd->inbufBitCount-=bits_wanted;
+       bits|=(bd->inbufBits>>bd->inbufBitCount)&((1<<bits_wanted)-1);
+
+       return bits;
+}
+
+/* Decompress a block of text to into intermediate buffer */
+
+extern int read_bunzip_data(bunzip_data *bd)
+{
+       struct group_data *hufGroup;
+       int dbufCount,nextSym,dbufSize,origPtr,groupCount,*base,*limit,selector,
+               i,j,k,t,runPos,symCount,symTotal,nSelectors,byteCount[256];
+       unsigned char uc, symToByte[256], mtfSymbol[256], *selectors;
+       unsigned int *dbuf;
+
+       /* Read in header signature (borrowing mtfSymbol for temp space). */
+       for(i=0;i<6;i++) mtfSymbol[i]=get_bits(bd,8);
+       mtfSymbol[6]=0;
+       /* Read CRC (which is stored big endian). */
+       bd->headerCRC=get_bits(bd,32);
+       /* Is this the last block (with CRC for file)? */
+       if(!strcmp(mtfSymbol,"\x17\x72\x45\x38\x50\x90"))
+               return RETVAL_LAST_BLOCK;
+       /* If it's not a valid data block, barf. */
+       if(strcmp(mtfSymbol,"\x31\x41\x59\x26\x53\x59"))
+               return RETVAL_NOT_BZIP_DATA;
+
+       dbuf=bd->dbuf;
+       dbufSize=bd->dbufSize;
+       selectors=bd->selectors;
+       /* We can add support for blockRandomised if anybody complains.  There was
+          some code for this in busybox 1.0.0-pre3, but nobody ever noticed that
+          it didn't actually work. */
+       if(get_bits(bd,1)) return RETVAL_OBSOLETE_INPUT;
+       if((origPtr=get_bits(bd,24)) > dbufSize) return RETVAL_DATA_ERROR;
+       /* mapping table: if some byte values are never used (encoding things
+          like ascii text), the compression code removes the gaps to have fewer
+          symbols to deal with, and writes a sparse bitfield indicating which
+          values were present.  We make a translation table to convert the symbols
+          back to the corresponding bytes. */
+       t=get_bits(bd, 16);
+       memset(symToByte,0,256);
+       symTotal=0;
+       for (i=0;i<16;i++) {
+               if(t&(1<<(15-i))) {
+                       k=get_bits(bd,16);
+                       for(j=0;j<16;j++)
+                               if(k&(1<<(15-j))) symToByte[symTotal++]=(16*i)+j;
+               }
+       }
+       /* How many different huffman coding groups does this block use? */
+       groupCount=get_bits(bd,3);
+       if (groupCount<2 || groupCount>MAX_GROUPS) return RETVAL_DATA_ERROR;
+       /* nSelectors: Every GROUP_SIZE many symbols we select a new huffman coding
+          group.  Read in the group selector list, which is stored as MTF encoded
+          bit runs. */
+       if(!(nSelectors=get_bits(bd, 15))) return RETVAL_DATA_ERROR;
+       for(i=0; i<groupCount; i++) mtfSymbol[i] = i;
+       for(i=0; i<nSelectors; i++) {
+               /* Get next value */
+               for(j=0;get_bits(bd,1);j++) if (j>=groupCount) return RETVAL_DATA_ERROR;
+               /* Decode MTF to get the next selector */
+               uc = mtfSymbol[j];
+               memmove(mtfSymbol+1,mtfSymbol,j);
+               mtfSymbol[0]=selectors[i]=uc;
+       }
+       /* Read the huffman coding tables for each group, which code for symTotal
+          literal symbols, plus two run symbols (RUNA, RUNB) */
+       symCount=symTotal+2;
+       for (j=0; j<groupCount; j++) {
+               unsigned char length[MAX_SYMBOLS],temp[MAX_HUFCODE_BITS+1];
+               int     minLen, maxLen, pp;
+               /* Read lengths */
+               t=get_bits(bd, 5);
+               for (i = 0; i < symCount; i++) {
+                       for(;;) {
+                               if (t < 1 || t > MAX_HUFCODE_BITS) return RETVAL_DATA_ERROR;
+                                       if(!get_bits(bd, 1)) break;
+                                       if(!get_bits(bd, 1)) t++;
+                                       else t--;
+                       }
+                       length[i] = t;
+               }
+               /* Find largest and smallest lengths in this group */
+               minLen=maxLen=length[0];
+               for(i = 1; i < symCount; i++) {
+                       if(length[i] > maxLen) maxLen = length[i];
+                       else if(length[i] < minLen) minLen = length[i];
+               }
+               /* Calculate permute[], base[], and limit[] tables from length[].
+                *
+                * permute[] is the lookup table for converting huffman coded symbols
+                * into decoded symbols.  base[] is the amount to subtract from the
+                * value of a huffman symbol of a given length when using permute[].
+                *
+                * limit[] indicates the largest numerical value a symbol with a given
+                * number of bits can have.  It lets us know when to stop reading.
+                *
+                * To use these, keep reading bits until value<=limit[bitcount] or
+                * you've read over 20 bits (error).  Then the decoded symbol
+                * equals permute[hufcode_value-base[hufcode_bitcount]].
+                */
+               hufGroup=bd->groups+j;
+               hufGroup->minLen = minLen;
+               hufGroup->maxLen = maxLen;
+               /* Note that minLen can't be smaller than 1, so we adjust the base
+                  and limit array pointers so we're not always wasting the first
+                  entry.  We do this again when using them (during symbol decoding).*/
+               base=hufGroup->base-1;
+               limit=hufGroup->limit-1;
+               /* Calculate permute[] */
+               pp = 0;
+               for(i=minLen;i<=maxLen;i++) 
+                       for(t=0;t<symCount;t++) 
+                               if(length[t]==i) hufGroup->permute[pp++] = t;
+               /* Count cumulative symbols coded for at each bit length */
+               for (i=minLen;i<=maxLen;i++) temp[i]=limit[i]=0;
+               for (i=0;i<symCount;i++) temp[length[i]]++;
+               /* Calculate limit[] (the largest symbol-coding value at each bit
+                * length, which is (previous limit<<1)+symbols at this level), and
+                * base[] (number of symbols to ignore at each bit length, which is
+                * limit-cumulative count of symbols coded for already). */
+               pp=t=0;
+               for (i=minLen; i<maxLen; i++) {
+                       pp+=temp[i];
+                       limit[i]=pp-1;
+                       pp<<=1;
+                       base[i+1]=pp-(t+=temp[i]);
+               }
+               limit[maxLen]=pp+temp[maxLen]-1;
+               base[minLen]=0;
+       }
+       /* We've finished reading and digesting the block header.  Now read this
+          block's huffman coded symbols from the file and undo the huffman coding
+          and run length encoding, saving the result into dbuf[dbufCount++]=uc */
+
+       /* Initialize symbol occurrence counters and symbol mtf table */
+       memset(byteCount,0,256*sizeof(int));
+       for(i=0;i<256;i++) mtfSymbol[i]=(unsigned char)i;
+       /* Loop through compressed symbols */
+       runPos=dbufCount=symCount=selector=0;
+       for(;;) {
+               /* Determine which huffman coding group to use. */
+               if(!(symCount--)) {
+                       symCount=GROUP_SIZE-1;
+                       if(selector>=nSelectors) return RETVAL_DATA_ERROR;
+                       hufGroup=bd->groups+selectors[selector++];
+                       base=hufGroup->base-1;
+                       limit=hufGroup->limit-1;
+               }
+               /* Read next huffman-coded symbol */
+               i = hufGroup->minLen;
+               j=get_bits(bd, i);
+               for(;;) {
+                       if (i > hufGroup->maxLen) return RETVAL_DATA_ERROR;
+                       if (j <= limit[i]) break;
+                       i++;
+
+                       j = (j << 1) | get_bits(bd,1);
+               }
+               /* Huffman decode nextSym (with bounds checking) */
+               j-=base[i];
+               if (j < 0 || j >= MAX_SYMBOLS) return RETVAL_DATA_ERROR;
+               nextSym = hufGroup->permute[j];
+               /* If this is a repeated run, loop collecting data */
+               if (nextSym == SYMBOL_RUNA || nextSym == SYMBOL_RUNB) {
+                       /* If this is the start of a new run, zero out counter */
+                       if(!runPos) {
+                               runPos = 1;
+                               t = 0;
+                       }
+                       /* Neat trick that saves 1 symbol: instead of or-ing 0 or 1 at
+                          each bit position, add 1 or 2 instead.  For example,
+                          1011 is 1<<0 + 1<<1 + 2<<2.  1010 is 2<<0 + 2<<1 + 1<<2.
+                          You can make any bit pattern that way using 1 less symbol than
+                          the basic or 0/1 method (except all bits 0, which would use no
+                          symbols, but a run of length 0 doesn't mean anything in this
+                          context).  Thus space is saved. */
+                       if (nextSym == SYMBOL_RUNA) t += runPos;
+                       else t += 2*runPos;
+                       runPos <<= 1;
+                       continue;
+               }
+               /* When we hit the first non-run symbol after a run, we now know
+                  how many times to repeat the last literal, so append that many
+                  copies to our buffer of decoded symbols (dbuf) now.  (The last
+                  literal used is the one at the head of the mtfSymbol array.) */
+               if(runPos) {
+                       runPos=0;
+                       if(dbufCount+t>=dbufSize) return RETVAL_DATA_ERROR;
+
+                       uc = symToByte[mtfSymbol[0]];
+                       byteCount[uc] += t;
+                       while(t--) dbuf[dbufCount++]=uc;
+               }
+               /* Is this the terminating symbol? */
+               if(nextSym>symTotal) break;
+               /* At this point, the symbol we just decoded indicates a new literal
+                  character.  Subtract one to get the position in the MTF array
+                  at which this literal is currently to be found.  (Note that the
+                  result can't be -1 or 0, because 0 and 1 are RUNA and RUNB.
+                  Another instance of the first symbol in the mtf array, position 0,
+                  would have been handled as part of a run.) */
+               if(dbufCount>=dbufSize) return RETVAL_DATA_ERROR;
+               i = nextSym - 1;
+               uc = mtfSymbol[i];
+               memmove(mtfSymbol+1,mtfSymbol,i);
+               mtfSymbol[0] = uc;
+               uc=symToByte[uc];
+               /* We have our literal byte.  Save it into dbuf. */
+               byteCount[uc]++;
+               dbuf[dbufCount++] = (unsigned int)uc;
+       }
+       /* At this point, we've finished reading huffman-coded symbols and
+          compressed runs from the input stream.  There are dbufCount many of
+          them in dbuf[].  Now undo the Burrows-Wheeler transform on dbuf.
+          See http://dogma.net/markn/articles/bwt/bwt.htm
+        */
+
+       /* Now we know what dbufCount is, do a better sanity check on origPtr.  */
+       if (origPtr<0 || origPtr>=dbufCount) return RETVAL_DATA_ERROR;
+       /* Turn byteCount into cumulative occurrence counts of 0 to n-1. */
+       j=0;
+       for(i=0;i<256;i++) {
+               k=j+byteCount[i];
+               byteCount[i] = j;
+               j=k;
+       }
+       /* Figure out what order dbuf would be in if we sorted it. */
+       for (i=0;i<dbufCount;i++) {
+               uc = (unsigned char)(dbuf[i] & 0xff);
+               dbuf[byteCount[uc]] |= (i << 8);
+               byteCount[uc]++;
+       }
+       /* blockRandomised support would go here. */
+
+       /* Using i as position, j as previous character, t as current character,
+          and uc as run count */
+       bd->dataCRC = 0xffffffffL;
+       /* Decode first byte by hand to initialize "previous" byte.  Note that it
+          doesn't get output, and if the first three characters are identical
+          it doesn't qualify as a run (hence uc=255, which will either wrap
+          to 1 or get reset). */
+       if(dbufCount) {
+               bd->writePos=dbuf[origPtr];
+           bd->writeCurrent=(unsigned char)(bd->writePos&0xff);
+               bd->writePos>>=8;
+               bd->writeRun=-1;
+       }
+       bd->writeCount=dbufCount;
+
+       return RETVAL_OK;
+}
+
+/* Flush output buffer to disk */
+extern void flush_bunzip_outbuf(bunzip_data *bd, int out_fd)
+{
+       if(bd->outbufPos) {
+               if(write(out_fd, bd->outbuf, bd->outbufPos) != bd->outbufPos)
+                       longjmp(bd->jmpbuf,RETVAL_UNEXPECTED_OUTPUT_EOF);
+               bd->outbufPos=0;
+       }
+}
+
+
+/* Undo burrows-wheeler transform on intermediate buffer to produce output.
+   If !len, write up to len bytes of data to buf.  Otherwise write to out_fd.
+   Returns len ? bytes written : RETVAL_OK.  Notice all errors negative #'s. */
+extern int write_bunzip_data(bunzip_data *bd, int out_fd, char *outbuf, int len)
+{
+       unsigned int *dbuf=bd->dbuf;
+       int count,pos,current, run,copies,outbyte,previous,gotcount=0;
+
+       for(;;) {
+               /* If last read was short due to end of file, return last block now */
+               if(bd->writeCount<0) return bd->writeCount;
+               /* If we need to refill dbuf, do it. */
+               if(!bd->writeCount) {
+                       int i=read_bunzip_data(bd);
+                       if(i) {
+                               if(i==RETVAL_LAST_BLOCK) {
+                                       bd->writeCount=i;
+                                       return gotcount;
+                               } else return i;
+                       }
+               }
+               /* Loop generating output */
+               count=bd->writeCount;
+               pos=bd->writePos;
+               current=bd->writeCurrent;
+               run=bd->writeRun;
+               while(count) {
+                       /* If somebody (like busybox tar) wants a certain number of bytes of
+                       data from memory instead of written to a file, humor them */
+                       if(len && bd->outbufPos>=len) goto dataus_interruptus;
+                       count--;
+                       /* Follow sequence vector to undo Burrows-Wheeler transform */
+                       previous=current;
+                       pos=dbuf[pos];
+                       current=pos&0xff;
+                       pos>>=8;
+                       /* Whenever we see 3 consecutive copies of the same byte,
+                          the 4th is a repeat count */
+                       if(run++==3) {
+                               copies=current;
+                               outbyte=previous;
+                               current=-1;
+                       } else {
+                               copies=1;
+                               outbyte=current;
+                       }
+                       /* Output bytes to buffer, flushing to file if necessary */
+                       while(copies--) {
+                               if(bd->outbufPos == IOBUF_SIZE) flush_bunzip_outbuf(bd,out_fd);
+                               bd->outbuf[bd->outbufPos++] = outbyte;
+                               bd->dataCRC = (bd->dataCRC << 8)
+                                                               ^ bd->crc32Table[(bd->dataCRC >> 24) ^ outbyte];
+                       }
+                       if(current!=previous) run=0;
+               }
+               /* Decompression of this block completed successfully */
+               bd->dataCRC=~(bd->dataCRC);
+               bd->totalCRC=((bd->totalCRC << 1) | (bd->totalCRC >> 31)) ^ bd->dataCRC;
+               /* If this block had a CRC error, force file level CRC error. */
+               if(bd->dataCRC!=bd->headerCRC) {
+                       bd->totalCRC=bd->headerCRC+1;
+                       return RETVAL_LAST_BLOCK;
+               }
+dataus_interruptus:
+               bd->writeCount=count;
+               if(len) {
+                       gotcount+=bd->outbufPos;
+                       memcpy(outbuf,bd->outbuf,len);
+                       /* If we got enough data, checkpoint loop state and return */
+                       if((len-=bd->outbufPos)<1) {
+                               bd->outbufPos-=len;
+                               if(bd->outbufPos)
+                                       memmove(bd->outbuf,bd->outbuf+len,bd->outbufPos);
+                               bd->writePos=pos;
+                               bd->writeCurrent=current;
+                               bd->writeRun=run;
+                               return gotcount;
+                       }
+               }
+       }
+}
+
+/* Allocate the structure, read file header.  If !len, src_fd contains
+   filehandle to read from.  Else inbuf contains data. */
+extern int start_bunzip(bunzip_data **bdp, int src_fd, char *inbuf, int len)
+{
+       bunzip_data *bd;
+       unsigned int i,j,c;
+
+       /* Figure out how much data to allocate */
+       i=sizeof(bunzip_data);
+       if(!len) i+=IOBUF_SIZE;
+       /* Allocate bunzip_data.  Most fields initialize to zero. */
+       if(!(bd=*bdp=malloc(i))) return RETVAL_OUT_OF_MEMORY;
+       memset(bd,0,sizeof(bunzip_data));
+       if(len) {
+               bd->inbuf=inbuf;
+               bd->inbufCount=len;
+               bd->in_fd=-1;
+       } else {
+               bd->inbuf=(char *)(bd+1);
+               bd->in_fd=src_fd;
+       }
+       /* Init the CRC32 table (big endian) */
+       for(i=0;i<256;i++) {
+               c=i<<24;
+               for(j=8;j;j--)
+                       c=c&0x80000000 ? (c<<1)^0x04c11db7 : (c<<1);
+               bd->crc32Table[i]=c;
+       }
+       /* Setup for I/O error handling via longjmp */
+       i=setjmp(bd->jmpbuf);
+       if(i) return i;
+       /* Ensure that file starts with "BZh" */
+    for(i=0;i<3;i++) if(get_bits(bd,8)!="BZh"[i]) return RETVAL_NOT_BZIP_DATA;
+       /* Next byte ascii '1'-'9', indicates block size in units of 100k of
+          uncompressed data.  Allocate intermediate buffer for block. */
+       i=get_bits(bd,8);
+       if (i<'1' || i>'9') return RETVAL_NOT_BZIP_DATA;
+       bd->dbufSize=100000*(i-'0');
+       if(!(bd->dbuf=malloc(bd->dbufSize * sizeof(int))))
+               return RETVAL_OUT_OF_MEMORY;
+       return RETVAL_OK;
+}
+
+/* Example usage: decompress src_fd to dst_fd.  (Stops at end of bzip data,
+   not end of file.) */
+extern char *uncompressStream(int src_fd, int dst_fd)
+{
+       bunzip_data *bd;
+       int i;
+
+       if(!(i=start_bunzip(&bd,src_fd,0,0))) {
+               i=write_bunzip_data(bd,dst_fd,0,0);
+               if(i==RETVAL_LAST_BLOCK && bd->headerCRC==bd->totalCRC) i=RETVAL_OK;
+       }
+       flush_bunzip_outbuf(bd,dst_fd);
+       if(bd->dbuf) free(bd->dbuf);
+       free(bd);
+       return bunzip_errors[-i];
+}
diff --git a/toys.h b/toys.h
index 31ce206..7a01c2a 100644 (file)
--- a/toys.h
+++ b/toys.h
@@ -11,6 +11,7 @@
 #include <fcntl.h>
 #include <inttypes.h>
 #include <limits.h>
+#include <setjmp.h>
 #include <stdarg.h>
 #include <stdint.h>
 #include <stdio.h>