Lib/gzip.py

   1 """Functions that read and write gzipped files.
   2
   3 The user of the file doesn't have to worry about the compression,
   4 but random access is not allowed."""
   5
   6 # based on Andrew Kuchling's minigzip.py distributed with the zlib module
   7
   8 import struct, sys, time, os
   9 import zlib
  10 import io
  11 import __builtin__
  12
# Names exported by "from gzip import *".
__all__ = ["GzipFile","open"]

# Bit masks for the flag byte of a gzip member header.  The reader tests
# FEXTRA, FNAME, FCOMMENT and FHCRC to know which optional header fields
# to skip; the writer sets FNAME when it stores a file name.
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

# Values stored in GzipFile.mode to record whether the stream was opened
# for reading or for writing.
READ, WRITE = 1, 2
  18
  19 def write32u(output, value):
  20     # The L format writes the bit pattern correctly whether signed
  21     # or unsigned.
  22     output.write(struct.pack("<L", value))
  23
  24 def read32(input):
  25     return struct.unpack("<I", input.read(4))[0]
  26
  27 def open(filename, mode="rb", compresslevel=9):
  28     """Shorthand for GzipFile(filename, mode, compresslevel).
  29
  30     The filename argument is required; mode defaults to 'rb'
  31     and compresslevel defaults to 9.
  32
  33     """
  34     return GzipFile(filename, mode, compresslevel)
  35
  36 class GzipFile(io.BufferedIOBase):
  37     """The GzipFile class simulates most of the methods of a file object with
  38     the exception of the readinto() and truncate() methods.
  39
  40     """
  41
  42     myfileobj = None
  43     max_read_chunk = 10 * 1024 * 1024   # 10Mb
  44
  45     def __init__(self, filename=None, mode=None,
  46                  compresslevel=9, fileobj=None, mtime=None):
  47         """Constructor for the GzipFile class.
  48
  49         At least one of fileobj and filename must be given a
  50         non-trivial value.
  51
  52         The new class instance is based on fileobj, which can be a regular
  53         file, a StringIO object, or any other object which simulates a file.
  54         It defaults to None, in which case filename is opened to provide
  55         a file object.
  56
  57         When fileobj is not None, the filename argument is only used to be
  58         included in the gzip file header, which may includes the original
  59         filename of the uncompressed file.  It defaults to the filename of
  60         fileobj, if discernible; otherwise, it defaults to the empty string,
  61         and in this case the original filename is not included in the header.
  62
  63         The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
  64         depending on whether the file will be read or written.  The default
  65         is the mode of fileobj if discernible; otherwise, the default is 'rb'.
  66         Be aware that only the 'rb', 'ab', and 'wb' values should be used
  67         for cross-platform portability.
  68
  69         The compresslevel argument is an integer from 1 to 9 controlling the
  70         level of compression; 1 is fastest and produces the least compression,
  71         and 9 is slowest and produces the most compression.  The default is 9.
  72
  73         The mtime argument is an optional numeric timestamp to be written
  74         to the stream when compressing.  All gzip compressed streams
  75         are required to contain a timestamp.  If omitted or None, the
  76         current time is used.  This module ignores the timestamp when
  77         decompressing; however, some programs, such as gunzip, make use
  78         of it.  The format of the timestamp is the same as that of the
  79         return value of time.time() and of the st_mtime member of the
  80         object returned by os.stat().
  81
  82         """
  83
  84         # guarantee the file is opened in binary mode on platforms
  85         # that care about that sort of thing
  86         if mode and 'b' not in mode:
  87             mode += 'b'
  88         if fileobj is None:
  89             fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
  90         if filename is None:
  91             # Issue #13781: os.fdopen() creates a fileobj with a bogus name
  92             # attribute. Avoid saving this in the gzip header's filename field.
  93             if hasattr(fileobj, 'name') and fileobj.name != '<fdopen>':
  94                 filename = fileobj.name
  95             else:
  96                 filename = ''
  97         if mode is None:
  98             if hasattr(fileobj, 'mode'): mode = fileobj.mode
  99             else: mode = 'rb'
 100
 101         if mode[0:1] == 'r':
 102             self.mode = READ
 103             # Set flag indicating start of a new member
 104             self._new_member = True
 105             # Buffer data read from gzip file. extrastart is offset in
 106             # stream where buffer starts. extrasize is number of
 107             # bytes remaining in buffer from current stream position.
 108             self.extrabuf = ""
 109             self.extrasize = 0
 110             self.extrastart = 0
 111             self.name = filename
 112             # Starts small, scales exponentially
 113             self.min_readsize = 100
 114
 115         elif mode[0:1] == 'w' or mode[0:1] == 'a':
 116             self.mode = WRITE
 117             self._init_write(filename)
 118             self.compress = zlib.compressobj(compresslevel,
 119                                              zlib.DEFLATED,
 120                                              -zlib.MAX_WBITS,
 121                                              zlib.DEF_MEM_LEVEL,
 122                                              0)
 123         else:
 124             raise IOError, "Mode " + mode + " not supported"
 125
 126         self.fileobj = fileobj
 127         self.offset = 0
 128         self.mtime = mtime
 129
 130         if self.mode == WRITE:
 131             self._write_gzip_header()
 132
 133     @property
 134     def filename(self):
 135         import warnings
 136         warnings.warn("use the name attribute", DeprecationWarning, 2)
 137         if self.mode == WRITE and self.name[-3:] != ".gz":
 138             return self.name + ".gz"
 139         return self.name
 140
 141     def __repr__(self):
 142         s = repr(self.fileobj)
 143         return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'
 144
 145     def _check_closed(self):
 146         """Raises a ValueError if the underlying file object has been closed.
 147
 148         """
 149         if self.closed:
 150             raise ValueError('I/O operation on closed file.')
 151
 152     def _init_write(self, filename):
 153         self.name = filename
 154         self.crc = zlib.crc32("") & 0xffffffffL
 155         self.size = 0
 156         self.writebuf = []
 157         self.bufsize = 0
 158
 159     def _write_gzip_header(self):
 160         self.fileobj.write('\037\213')             # magic header
 161         self.fileobj.write('\010')                 # compression method
 162         fname = os.path.basename(self.name)
 163         if fname.endswith(".gz"):
 164             fname = fname[:-3]
 165         flags = 0
 166         if fname:
 167             flags = FNAME
 168         self.fileobj.write(chr(flags))
 169         mtime = self.mtime
 170         if mtime is None:
 171             mtime = time.time()
 172         write32u(self.fileobj, long(mtime))
 173         self.fileobj.write('\002')
 174         self.fileobj.write('\377')
 175         if fname:
 176             self.fileobj.write(fname + '\000')
 177
 178     def _init_read(self):
 179         self.crc = zlib.crc32("") & 0xffffffffL
 180         self.size = 0
 181
 182     def _read_gzip_header(self):
 183         magic = self.fileobj.read(2)
 184         if magic != '\037\213':
 185             raise IOError, 'Not a gzipped file'
 186         method = ord( self.fileobj.read(1) )
 187         if method != 8:
 188             raise IOError, 'Unknown compression method'
 189         flag = ord( self.fileobj.read(1) )
 190         self.mtime = read32(self.fileobj)
 191         # extraflag = self.fileobj.read(1)
 192         # os = self.fileobj.read(1)
 193         self.fileobj.read(2)
 194
 195         if flag & FEXTRA:
 196             # Read & discard the extra field, if present
 197             xlen = ord(self.fileobj.read(1))
 198             xlen = xlen + 256*ord(self.fileobj.read(1))
 199             self.fileobj.read(xlen)
 200         if flag & FNAME:
 201             # Read and discard a null-terminated string containing the filename
 202             while True:
 203                 s = self.fileobj.read(1)
 204                 if not s or s=='\000':
 205                     break
 206         if flag & FCOMMENT:
 207             # Read and discard a null-terminated string containing a comment
 208             while True:
 209                 s = self.fileobj.read(1)
 210                 if not s or s=='\000':
 211                     break
 212         if flag & FHCRC:
 213             self.fileobj.read(2)     # Read & discard the 16-bit header CRC
 214
 215     def write(self,data):
 216         self._check_closed()
 217         if self.mode != WRITE:
 218             import errno
 219             raise IOError(errno.EBADF, "write() on read-only GzipFile object")
 220
 221         if self.fileobj is None:
 222             raise ValueError, "write() on closed GzipFile object"
 223
 224         # Convert data type if called by io.BufferedWriter.
 225         if isinstance(data, memoryview):
 226             data = data.tobytes()
 227
 228         if len(data) > 0:
 229             self.size = self.size + len(data)
 230             self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
 231             self.fileobj.write( self.compress.compress(data) )
 232             self.offset += len(data)
 233
 234         return len(data)
 235
 236     def read(self, size=-1):
 237         self._check_closed()
 238         if self.mode != READ:
 239             import errno
 240             raise IOError(errno.EBADF, "read() on write-only GzipFile object")
 241
 242         if self.extrasize <= 0 and self.fileobj is None:
 243             return ''
 244
 245         readsize = 1024
 246         if size < 0:        # get the whole thing
 247             try:
 248                 while True:
 249                     self._read(readsize)
 250                     readsize = min(self.max_read_chunk, readsize * 2)
 251             except EOFError:
 252                 size = self.extrasize
 253         else:               # just get some more of it
 254             try:
 255                 while size > self.extrasize:
 256                     self._read(readsize)
 257                     readsize = min(self.max_read_chunk, readsize * 2)
 258             except EOFError:
 259                 if size > self.extrasize:
 260                     size = self.extrasize
 261
 262         offset = self.offset - self.extrastart
 263         chunk = self.extrabuf[offset: offset + size]
 264         self.extrasize = self.extrasize - size
 265
 266         self.offset += size
 267         return chunk
 268
 269     def _unread(self, buf):
 270         self.extrasize = len(buf) + self.extrasize
 271         self.offset -= len(buf)
 272
    def _read(self, size=1024):
        """Read up to *size* compressed bytes and buffer their decompressed form.

        At the start of a member the gzip header is parsed first and a new
        decompression object is created.  The decompressed data is handed to
        _add_read_data().  When a member ends inside the chunk just read,
        its trailer is verified with _read_eof() and the next call starts a
        new member.

        Raises EOFError when the file object is gone, when no further member
        follows, or when the underlying file returns no more data.
        """
        if self.fileobj is None:
            raise EOFError, "Reached EOF"

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            #
            # First, check if we're at the end of the file;
            # if so, it's time to stop; no more members to read.
            pos = self.fileobj.tell()   # Save current position
            self.fileobj.seek(0, 2)     # Seek to end of file
            if pos == self.fileobj.tell():
                raise EOFError, "Reached EOF"
            else:
                self.fileobj.seek( pos ) # Return to original position

            self._init_read()
            self._read_gzip_header()
            # Negative window bits: the data following the header is a raw
            # deflate stream (see the zlib module documentation).
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == "":
            uncompress = self.decompress.flush()
            # Verify the trailer before the flushed data is added to the
            # buffer, so the running CRC and size checked by _read_eof()
            # do not include it.
            self._read_eof()
            self._add_read_data( uncompress )
            raise EOFError, 'Reached EOF'

        uncompress = self.decompress.decompress(buf)
        self._add_read_data( uncompress )

        if self.decompress.unused_data != "":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # (The number of bytes to seek back is the length of the unused
            # data, minus 8 because _read_eof() will rewind a further 8 bytes)
            self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)

            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True
 322
 323     def _add_read_data(self, data):
 324         self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
 325         offset = self.offset - self.extrastart
 326         self.extrabuf = self.extrabuf[offset:] + data
 327         self.extrasize = self.extrasize + len(data)
 328         self.extrastart = self.offset
 329         self.size = self.size + len(data)
 330
 331     def _read_eof(self):
 332         # We've read to the end of the file, so we have to rewind in order
 333         # to reread the 8 bytes containing the CRC and the file size.
 334         # We check the that the computed CRC and size of the
 335         # uncompressed data matches the stored values.  Note that the size
 336         # stored is the true file size mod 2**32.
 337         self.fileobj.seek(-8, 1)
 338         crc32 = read32(self.fileobj)
 339         isize = read32(self.fileobj)  # may exceed 2GB
 340         if crc32 != self.crc:
 341             raise IOError("CRC check failed %s != %s" % (hex(crc32),
 342                                                          hex(self.crc)))
 343         elif isize != (self.size & 0xffffffffL):
 344             raise IOError, "Incorrect length of data produced"
 345
 346         # Gzip files can be padded with zeroes and still have archives.
 347         # Consume all zero bytes and set the file position to the first
 348         # non-zero byte. See http://www.gzip.org/#faq8
 349         c = "\x00"
 350         while c == "\x00":
 351             c = self.fileobj.read(1)
 352         if c:
 353             self.fileobj.seek(-1, 1)
 354
 355     @property
 356     def closed(self):
 357         return self.fileobj is None
 358
 359     def close(self):
 360         if self.fileobj is None:
 361             return
 362         if self.mode == WRITE:
 363             self.fileobj.write(self.compress.flush())
 364             write32u(self.fileobj, self.crc)
 365             # self.size may exceed 2GB, or even 4GB
 366             write32u(self.fileobj, self.size & 0xffffffffL)
 367             self.fileobj = None
 368         elif self.mode == READ:
 369             self.fileobj = None
 370         if self.myfileobj:
 371             self.myfileobj.close()
 372             self.myfileobj = None
 373
 374     def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
 375         self._check_closed()
 376         if self.mode == WRITE:
 377             # Ensure the compressor's buffer is flushed
 378             self.fileobj.write(self.compress.flush(zlib_mode))
 379             self.fileobj.flush()
 380
 381     def fileno(self):
 382         """Invoke the underlying file object's fileno() method.
 383
 384         This will raise AttributeError if the underlying file object
 385         doesn't support fileno().
 386         """
 387         return self.fileobj.fileno()
 388
 389     def rewind(self):
 390         '''Return the uncompressed stream file position indicator to the
 391         beginning of the file'''
 392         if self.mode != READ:
 393             raise IOError("Can't rewind in write mode")
 394         self.fileobj.seek(0)
 395         self._new_member = True
 396         self.extrabuf = ""
 397         self.extrasize = 0
 398         self.extrastart = 0
 399         self.offset = 0
 400
 401     def readable(self):
 402         return self.mode == READ
 403
 404     def writable(self):
 405         return self.mode == WRITE
 406
 407     def seekable(self):
 408         return True
 409
 410     def seek(self, offset, whence=0):
 411         if whence:
 412             if whence == 1:
 413                 offset = self.offset + offset
 414             else:
 415                 raise ValueError('Seek from end not supported')
 416         if self.mode == WRITE:
 417             if offset < self.offset:
 418                 raise IOError('Negative seek in write mode')
 419             count = offset - self.offset
 420             for i in range(count // 1024):
 421                 self.write(1024 * '\0')
 422             self.write((count % 1024) * '\0')
 423         elif self.mode == READ:
 424             if offset < self.offset:
 425                 # for negative seek, rewind and do positive seek
 426                 self.rewind()
 427             count = offset - self.offset
 428             for i in range(count // 1024):
 429                 self.read(1024)
 430             self.read(count % 1024)
 431
 432         return self.offset
 433
    def readline(self, size=-1):
        """Read and return one line of uncompressed data.

        The line includes its trailing newline when one is found.  With a
        non-negative *size* at most that many bytes are returned, so the
        result may be an incomplete line.  An empty string is returned at
        the end of the file.
        """
        if size < 0:
            # Shortcut common case - newline found in buffer.
            offset = self.offset - self.extrastart
            i = self.extrabuf.find('\n', offset) + 1
            if i > 0:
                self.extrasize -= i - offset
                self.offset += i - offset
                return self.extrabuf[offset: i]

            # No limit requested: use the largest int as "unlimited" and
            # start with the adaptive read size.
            size = sys.maxint
            readsize = self.min_readsize
        else:
            readsize = size
        bufs = []
        while size != 0:
            c = self.read(readsize)
            i = c.find('\n')

            # We set i=size to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if (size <= i) or (i == -1 and len(c) > size):
                i = size - 1

            # Either the line ends inside this chunk (i >= 0) or the end of
            # the file was reached (empty chunk).
            if i >= 0 or c == '':
                bufs.append(c[:i + 1])    # Add portion of last chunk
                self._unread(c[i + 1:])   # Push back rest of chunk
                break

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)
        # Remember that this line needed bigger reads: grow the starting
        # read size for later calls, but never beyond 512 bytes.
        if readsize > self.min_readsize:
            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
        return ''.join(bufs) # Return resulting line
 472
 473
 474 def _test():
 475     # Act like gzip; with -d, act like gunzip.
 476     # The input file is not deleted, however, nor are any other gzip
 477     # options or features supported.
 478     args = sys.argv[1:]
 479     decompress = args and args[0] == "-d"
 480     if decompress:
 481         args = args[1:]
 482     if not args:
 483         args = ["-"]
 484     for arg in args:
 485         if decompress:
 486             if arg == "-":
 487                 f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
 488                 g = sys.stdout
 489             else:
 490                 if arg[-3:] != ".gz":
 491                     print "filename doesn't end in .gz:", repr(arg)
 492                     continue
 493                 f = open(arg, "rb")
 494                 g = __builtin__.open(arg[:-3], "wb")
 495         else:
 496             if arg == "-":
 497                 f = sys.stdin
 498                 g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
 499             else:
 500                 f = __builtin__.open(arg, "rb")
 501                 g = open(arg + ".gz", "wb")
 502         while True:
 503             chunk = f.read(1024)
 504             if not chunk:
 505                 break
 506             g.write(chunk)
 507         if g is not sys.stdout:
 508             g.close()
 509         if f is not sys.stdin:
 510             f.close()
 511
# When run as a script, act as a minimal gzip/gunzip command-line tool.
if __name__ == '__main__':
    _test()