import os
import stat
+import bz2
# Compression types this module supports, spelled as file-name extensions
# without the leading dot.
# NOTE(review): presumably matched against the end of the file path - confirm
# at the place where this tuple is used.
SUPPORTED_COMPRESSION_TYPES = ('bz2', 'gz', 'tar.gz', 'tgz', 'tar.bz2')
+def _fake_seek_forward(file_obj, cur_pos, offset, whence = os.SEEK_SET):
+ """ Seek to a specified offset. We only support seeking forward and
+ only relative to the beginning of the file and to the current
+ position. The arguments are:
+ 1. 'file_obj' - file-like object to emulate 'seek()' for
+ 2. 'cur_pos' - current file position of 'file_ojb', which supposedly
+ also does not support 'tell()'
+ 3. 'offset' and 'whence' are the standard 'seek()' arguments
+
+ Returns the new 'file_obj' position. """
+
+ if whence == os.SEEK_SET:
+ new_pos = offset
+ elif whence == os.SEEK_CUR:
+ new_pos = cur_pos + offset
+ else:
+ raise Error("_Bzip2Read's 'seek()' method requires 'whence' " \
+ "argument to be %d or %d, but %d was passed" \
+ % (os.SEEK_SET, os.SEEK_CUR, whence))
+
+ if new_pos < cur_pos:
+ raise Error("_Bzip2Read' seek() method supports only seeking " \
+ "forward, seeking from %d to %d is not allowed" \
+ % (cur_pos, new_pos))
+
+ length = new_pos - cur_pos
+ to_read = length
+ while to_read > 0:
+ buf = file_obj.read(to_read)
+ if not buf:
+ break
+ to_read -= len(buf)
+
+ cur_pos = cur_pos + (length - to_read)
+
+ if to_read < 0:
+ raise Error("seeked too far: %d instead of %d" % (cur_pos, new_pos))
+
+ return cur_pos
+
+class _Bzip2Read:
+ """ This class implements transparent reading from a bzip2-compressed
+ file-like object and decompressing the contents on-the-fly. The only reason
+ this class exists is that the standard python 2 bz2.Bzip2File() class does
+ not accept file-like objects and requires a file name.
+
+ To read a bzip2-compressed file-like object, create an instance of this
+ class and use its 'read()' method. In other words, the instances of this
+ class are "read-only" file-like objects. 'seek()' is supported, but only
+ forward.
+
+ Note, this class is very simple and does not implement many things, e.g.,
+ there is no locking. """
+
+ def __init__(self, file_obj):
+ """ Class constructor. The 'file_ojb' argument is the bzip2-compressed
+ file-like object to read from. """
+
+ self._pos = 0
+ self._file_obj = file_obj
+ self._decompressor = bz2.BZ2Decompressor()
+ self._buffer = ''
+ self._buffer_pos = 0
+ self._eof = False
+
+ def _read_from_buffer(self, length):
+ """ Read from the internal buffer which contains the extra data we read
+ last time. """
+
+ buffer_len = len(self._buffer)
+ if buffer_len - self._buffer_pos > length:
+ data = self._buffer[self._buffer_pos:self._buffer_pos + length]
+ self._buffer_pos += length
+ else:
+ data = self._buffer[self._buffer_pos:]
+ self._buffer = ''
+ self._buffer_pos = 0
+
+ return data
+
+ def read(self, size):
+ """ Read the bzip2-compressed file, uncompress the data on-the-fly, and
+ return 'size' bytes of the uncompressed data. """
+
+ assert self._pos >= 0
+ assert self._buffer_pos >= 0
+ assert self._buffer_pos <= len(self._buffer)
+
+ if self._eof:
+ return ''
+
+ # Fetch the data from the buffers first
+ data = self._read_from_buffer(size)
+ size -= len(data)
+
+ # If the buffers did not contain all the requested data, read them,
+ # decompress, and buffer.
+ chunk_size = max(size, 128 * 1024)
+ while size > 0:
+ buf = self._file_obj.read(chunk_size)
+ if not buf:
+ self._eof = True
+ break
+
+ buf = self._decompressor.decompress(buf)
+ if not buf:
+ continue
+
+ assert len(self._buffer) == 0
+ assert self._buffer_pos == 0
+
+ if len(buf) >= size:
+ self._buffer = buf
+ data += self._read_from_buffer(size)
+ else:
+ data += buf
+
+ size -= len(buf)
+
+ self._pos += len(data)
+ return data
+
+ def seek(self, offset, whence = os.SEEK_SET):
+ """ Fake 'seek()' implementation limited to seeking forward. """
+
+ _fake_seek_forward(self, self._pos, offset, whence)
+
+ def tell(self):
+ """ Return current position. """
+
+ return self._pos
+
+ def close(self):
+ """ Close the file-like object. """
+ pass
+
class Error(Exception):
""" A class for exceptions generated by this module. We currently support
only one type of exceptions, and we basically throw human-readable problem
or self.filepath.endswith('.tgz'):
import tarfile
- tar = tarfile.open(self.filepath, 'r')
+ tar = tarfile.open(fileobj = self._file_obj, mode = 'r')
# The tarball is supposed to contain only one single member
members = tar.getmembers()
if len(members) > 1:
elif self.filepath.endswith('.gz'):
import gzip
- self._transfile_obj = gzip.GzipFile(self.filepath, 'rb')
+ self._transfile_obj = gzip.GzipFile(fileobj = self._file_obj,
+ mode = 'rb')
elif self.filepath.endswith('.bz2'):
- import bz2
-
- self._transfile_obj = bz2.BZ2File(self.filepath, 'rb')
+ self._transfile_obj = _Bzip2Read(self._file_obj)
else:
self.is_compressed = False
self._transfile_obj = self._file_obj