From 0bf61426bbd3911b97912aa70c47001f5ccd66db Mon Sep 17 00:00:00 2001 From: JF Ding Date: Sun, 28 Aug 2011 15:48:58 +0800 Subject: [PATCH] Include 3rd-party urlgrabber modules python-urlgrabber-3.1.0 is the last stable pure-Python release. MIC bundles it to reduce its external dependencies. --- mic/urlgrabber/__init__.py | 53 ++ mic/urlgrabber/byterange.py | 463 +++++++++++++ mic/urlgrabber/grabber.py | 1477 ++++++++++++++++++++++++++++++++++++++++++ mic/urlgrabber/keepalive.py | 617 ++++++++++++++++++ mic/urlgrabber/mirror.py | 458 +++++++++++++ mic/urlgrabber/progress.py | 530 +++++++++++++++ mic/urlgrabber/sslfactory.py | 89 +++ setup.py | 1 + 8 files changed, 3688 insertions(+) create mode 100644 mic/urlgrabber/__init__.py create mode 100644 mic/urlgrabber/byterange.py create mode 100644 mic/urlgrabber/grabber.py create mode 100644 mic/urlgrabber/keepalive.py create mode 100644 mic/urlgrabber/mirror.py create mode 100644 mic/urlgrabber/progress.py create mode 100644 mic/urlgrabber/sslfactory.py diff --git a/mic/urlgrabber/__init__.py b/mic/urlgrabber/__init__.py new file mode 100644 index 0000000..7bcd9d5 --- /dev/null +++ b/mic/urlgrabber/__init__.py @@ -0,0 +1,53 @@ +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +# Copyright 2002-2006 Michael D. Stenner, Ryan Tomayko + +# $Id: __init__.py,v 1.20 2006/09/22 00:58:55 mstenner Exp $ + +"""A high-level cross-protocol url-grabber. + +Using urlgrabber, data can be fetched in three basic ways: + + urlgrab(url) copy the file to the local filesystem + urlopen(url) open the remote file and return a file object + (like urllib2.urlopen) + urlread(url) return the contents of the file as a string + +When using these functions (or methods), urlgrabber supports the +following features: + + * identical behavior for http://, ftp://, and file:// urls + * http keepalive - faster downloads of many files by using + only a single connection + * byte ranges - fetch only a portion of the file + * reget - for a urlgrab, resume a partial download + * progress meters - the ability to report download progress + automatically, even when using urlopen! + * throttling - restrict bandwidth usage + * retries - automatically retry a download if it fails. The + number of retries and failure types are configurable. + * authenticated server access for http and ftp + * proxy support - support for authenticated http and ftp proxies + * mirror groups - treat a list of mirrors as a single source, + automatically switching mirrors if there is a failure. +""" + +__version__ = '3.1.0' +__date__ = '2006/09/21' +__author__ = 'Michael D. 
Stenner, ' \ + 'Ryan Tomayko ' +__url__ = 'http://linux.duke.edu/projects/urlgrabber/' + +from grabber import urlgrab, urlopen, urlread diff --git a/mic/urlgrabber/byterange.py b/mic/urlgrabber/byterange.py new file mode 100644 index 0000000..001b4e3 --- /dev/null +++ b/mic/urlgrabber/byterange.py @@ -0,0 +1,463 @@ +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, +# Boston, MA 02111-1307 USA + +# This file is part of urlgrabber, a high-level cross-protocol url-grabber +# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko + +# $Id: byterange.py,v 1.12 2006/07/20 20:15:58 mstenner Exp $ + +import os +import stat +import urllib +import urllib2 +import rfc822 + +DEBUG = None + +try: + from cStringIO import StringIO +except ImportError, msg: + from StringIO import StringIO + +class RangeError(IOError): + """Error raised when an unsatisfiable range is requested.""" + pass + +class HTTPRangeHandler(urllib2.BaseHandler): + """Handler that enables HTTP Range headers. + + This was extremely simple. The Range header is an HTTP feature to + begin with, so all this class does is tell urllib2 that the + "206 Partial Content" response from the HTTP server is what we + expected. + + Example: + import urllib2 + import byterange + + range_handler = byterange.HTTPRangeHandler() + opener = urllib2.build_opener(range_handler) + + # install it + urllib2.install_opener(opener) + + # create Request and set Range header + req = urllib2.Request('http://www.python.org/') + req.add_header('Range', 'bytes=30-50') + f = urllib2.urlopen(req) + """ + + def http_error_206(self, req, fp, code, msg, hdrs): + # 206 Partial Content Response + r = urllib.addinfourl(fp, hdrs, req.get_full_url()) + r.code = code + r.msg = msg + return r + + def http_error_416(self, req, fp, code, msg, hdrs): + # HTTP's Range Not Satisfiable error + raise RangeError('Requested Range Not Satisfiable') + +class HTTPSRangeHandler(HTTPRangeHandler): + """ Range Header support for HTTPS. """ + + def https_error_206(self, req, fp, code, msg, hdrs): + return self.http_error_206(req, fp, code, msg, hdrs) + + def https_error_416(self, req, fp, code, msg, hdrs): + # delegate to the HTTP handler; calling https_error_416 here + # would recurse forever + return self.http_error_416(req, fp, code, msg, hdrs) + +class RangeableFileObject: + """File object wrapper to enable raw range handling. + This was implemented primarily for handling range + specifications for file:// urls. This object effectively makes + a file object look like it consists only of a range of bytes in + the stream. + + Examples: + # expose 10 bytes, starting at byte position 20, from + # /etc/passwd. + >>> fo = RangeableFileObject(file('/etc/passwd', 'r'), (20,30)) + # seek seeks within the range (to position 23 in this case) + >>> fo.seek(3) + # tell tells where you're at _within the range_ (position 3 in + # this case) + >>> fo.tell() + # read EOFs if an attempt is made to read past the last + # byte in the range. 
the following will return only 7 bytes. + >>> fo.read(30) + """ + + def __init__(self, fo, rangetup): + """Create a RangeableFileObject. + fo -- a file like object. only the read() method need be + supported but supporting an optimized seek() is + preferable. + rangetup -- a (firstbyte,lastbyte) tuple specifying the range + to work over. + The file object provided is assumed to be at byte offset 0. + """ + self.fo = fo + (self.firstbyte, self.lastbyte) = range_tuple_normalize(rangetup) + self.realpos = 0 + self._do_seek(self.firstbyte) + + def __getattr__(self, name): + """This effectively allows us to wrap at the instance level. + Any attribute not found in _this_ object will be searched for + in self.fo. This includes methods.""" + if hasattr(self.fo, name): + return getattr(self.fo, name) + raise AttributeError, name + + def tell(self): + """Return the position within the range. + This is different from fo.seek in that position 0 is the + first byte position of the range tuple. For example, if + this object was created with a range tuple of (500,899), + tell() will return 0 when at byte position 500 of the file. + """ + return (self.realpos - self.firstbyte) + + def seek(self,offset,whence=0): + """Seek within the byte range. + Positioning is identical to that described under tell(). + """ + assert whence in (0, 1, 2) + if whence == 0: # absolute seek + realoffset = self.firstbyte + offset + elif whence == 1: # relative seek + realoffset = self.realpos + offset + elif whence == 2: # absolute from end of file + # XXX: are we raising the right Error here? + raise IOError('seek from end of file not supported.') + + # do not allow seek past lastbyte in range + if self.lastbyte and (realoffset >= self.lastbyte): + realoffset = self.lastbyte + + self._do_seek(realoffset - self.realpos) + + def read(self, size=-1): + """Read within the range. + This method will limit the size read based on the range. + """ + size = self._calc_read_size(size) + rslt = self.fo.read(size) + self.realpos += len(rslt) + return rslt + + def readline(self, size=-1): + """Read lines within the range. + This method will limit the size read based on the range. + """ + size = self._calc_read_size(size) + rslt = self.fo.readline(size) + self.realpos += len(rslt) + return rslt + + def _calc_read_size(self, size): + """Handles calculating the amount of data to read based on + the range. + """ + if self.lastbyte: + if size > -1: + if ((self.realpos + size) >= self.lastbyte): + size = (self.lastbyte - self.realpos) + else: + size = (self.lastbyte - self.realpos) + return size + + def _do_seek(self,offset): + """Seek based on whether wrapped object supports seek(). + offset is relative to the current position (self.realpos). + """ + assert offset >= 0 + if not hasattr(self.fo, 'seek'): + self._poor_mans_seek(offset) + else: + self.fo.seek(self.realpos + offset) + self.realpos+= offset + + def _poor_mans_seek(self,offset): + """Seek by calling the wrapped file objects read() method. + This is used for file like objects that do not have native + seek support. The wrapped objects read() method is called + to manually seek to the desired position. + offset -- read this number of bytes from the wrapped + file object. + raise RangeError if we encounter EOF before reaching the + specified offset. 
+ """ + pos = 0 + bufsize = 1024 + while pos < offset: + if (pos + bufsize) > offset: + bufsize = offset - pos + buf = self.fo.read(bufsize) + if len(buf) != bufsize: + raise RangeError('Requested Range Not Satisfiable') + pos+= bufsize + +class FileRangeHandler(urllib2.FileHandler): + """FileHandler subclass that adds Range support. + This class handles Range headers exactly like an HTTP + server would. + """ + def open_local_file(self, req): + import mimetypes + import mimetools + host = req.get_host() + file = req.get_selector() + localfile = urllib.url2pathname(file) + stats = os.stat(localfile) + size = stats[stat.ST_SIZE] + modified = rfc822.formatdate(stats[stat.ST_MTIME]) + mtype = mimetypes.guess_type(file)[0] + if host: + host, port = urllib.splitport(host) + if port or socket.gethostbyname(host) not in self.get_names(): + raise urllib2.URLError('file not on local host') + fo = open(localfile,'rb') + brange = req.headers.get('Range',None) + brange = range_header_to_tuple(brange) + assert brange != () + if brange: + (fb,lb) = brange + if lb == '': lb = size + if fb < 0 or fb > size or lb > size: + raise RangeError('Requested Range Not Satisfiable') + size = (lb - fb) + fo = RangeableFileObject(fo, (fb,lb)) + headers = mimetools.Message(StringIO( + 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' % + (mtype or 'text/plain', size, modified))) + return urllib.addinfourl(fo, headers, 'file:'+file) + + +# FTP Range Support +# Unfortunately, a large amount of base FTP code had to be copied +# from urllib and urllib2 in order to insert the FTP REST command. +# Code modifications for range support have been commented as +# follows: +# -- range support modifications start/end here + +from urllib import splitport, splituser, splitpasswd, splitattr, \ + unquote, addclosehook, addinfourl +import ftplib +import socket +import sys +import ftplib +import mimetypes +import mimetools + +class FTPRangeHandler(urllib2.FTPHandler): + def ftp_open(self, req): + host = req.get_host() + if not host: + raise IOError, ('ftp error', 'no host given') + host, port = splitport(host) + if port is None: + port = ftplib.FTP_PORT + + # username/password handling + user, host = splituser(host) + if user: + user, passwd = splitpasswd(user) + else: + passwd = None + host = unquote(host) + user = unquote(user or '') + passwd = unquote(passwd or '') + + try: + host = socket.gethostbyname(host) + except socket.error, msg: + raise urllib2.URLError(msg) + path, attrs = splitattr(req.get_selector()) + dirs = path.split('/') + dirs = map(unquote, dirs) + dirs, file = dirs[:-1], dirs[-1] + if dirs and not dirs[0]: + dirs = dirs[1:] + try: + fw = self.connect_ftp(user, passwd, host, port, dirs) + type = file and 'I' or 'D' + for attr in attrs: + attr, value = splitattr(attr) + if attr.lower() == 'type' and \ + value in ('a', 'A', 'i', 'I', 'd', 'D'): + type = value.upper() + + # -- range support modifications start here + rest = None + range_tup = range_header_to_tuple(req.headers.get('Range',None)) + assert range_tup != () + if range_tup: + (fb,lb) = range_tup + if fb > 0: rest = fb + # -- range support modifications end here + + fp, retrlen = fw.retrfile(file, type, rest) + + # -- range support modifications start here + if range_tup: + (fb,lb) = range_tup + if lb == '': + if retrlen is None or retrlen == 0: + raise RangeError('Requested Range Not Satisfiable due to unobtainable file length.') + lb = retrlen + retrlen = lb - fb + if retrlen < 0: + # beginning of range is larger than file + raise 
RangeError('Requested Range Not Satisfiable') + else: + retrlen = lb - fb + fp = RangeableFileObject(fp, (0,retrlen)) + # -- range support modifications end here + + headers = "" + mtype = mimetypes.guess_type(req.get_full_url())[0] + if mtype: + headers += "Content-Type: %s\n" % mtype + if retrlen is not None and retrlen >= 0: + headers += "Content-Length: %d\n" % retrlen + sf = StringIO(headers) + headers = mimetools.Message(sf) + return addinfourl(fp, headers, req.get_full_url()) + except ftplib.all_errors, msg: + raise IOError, ('ftp error', msg), sys.exc_info()[2] + + def connect_ftp(self, user, passwd, host, port, dirs): + fw = ftpwrapper(user, passwd, host, port, dirs) + return fw + +class ftpwrapper(urllib.ftpwrapper): + # range support note: + # this ftpwrapper code is copied directly from + # urllib. The only enhancement is to add the rest + # argument and pass it on to ftp.ntransfercmd + def retrfile(self, file, type, rest=None): + self.endtransfer() + if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1 + else: cmd = 'TYPE ' + type; isdir = 0 + try: + self.ftp.voidcmd(cmd) + except ftplib.all_errors: + self.init() + self.ftp.voidcmd(cmd) + conn = None + if file and not isdir: + # Use nlst to see if the file exists at all + try: + self.ftp.nlst(file) + except ftplib.error_perm, reason: + raise IOError, ('ftp error', reason), sys.exc_info()[2] + # Restore the transfer mode! + self.ftp.voidcmd(cmd) + # Try to retrieve as a file + try: + cmd = 'RETR ' + file + conn = self.ftp.ntransfercmd(cmd, rest) + except ftplib.error_perm, reason: + if str(reason)[:3] == '501': + # workaround for REST not supported error + fp, retrlen = self.retrfile(file, type) + fp = RangeableFileObject(fp, (rest,'')) + return (fp, retrlen) + elif str(reason)[:3] != '550': + raise IOError, ('ftp error', reason), sys.exc_info()[2] + if not conn: + # Set transfer mode to ASCII! + self.ftp.voidcmd('TYPE A') + # Try a directory listing + if file: cmd = 'LIST ' + file + else: cmd = 'LIST' + conn = self.ftp.ntransfercmd(cmd) + self.busy = 1 + # Pass back both a suitably decorated object and a retrieval length + return (addclosehook(conn[0].makefile('rb'), + self.endtransfer), conn[1]) + + +#################################################################### +# Range Tuple Functions +# XXX: These range tuple functions might go better in a class. + +_rangere = None +def range_header_to_tuple(range_header): + """Get a (firstbyte,lastbyte) tuple from a Range header value. + + Range headers have the form "bytes=<firstbyte>-<lastbyte>". This + function pulls the firstbyte and lastbyte values and returns + a (firstbyte,lastbyte) tuple. If lastbyte is not specified in + the header value, it is returned as an empty string in the + tuple. + + Return None if range_header is None + Return () if range_header does not conform to the range spec + pattern. + + """ + global _rangere + if range_header is None: return None + if _rangere is None: + import re + _rangere = re.compile(r'^bytes=(\d{1,})-(\d*)') + match = _rangere.match(range_header) + if match: + tup = range_tuple_normalize(match.group(1,2)) + if tup and tup[1]: + tup = (tup[0],tup[1]+1) + return tup + return () + +def range_tuple_to_header(range_tup): + """Convert a range tuple to a Range header value. + Return a string of the form "bytes=<firstbyte>-<lastbyte>" or None + if no range is needed. 
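+ For example, with illustrative values: range_tuple_to_header((500, 900)) + returns 'bytes=500-899', since the tuple is python-style (end-exclusive) + while Range header values are inclusive.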
+ """ + if range_tup is None: return None + range_tup = range_tuple_normalize(range_tup) + if range_tup: + if range_tup[1]: + range_tup = (range_tup[0],range_tup[1] - 1) + return 'bytes=%s-%s' % range_tup + +def range_tuple_normalize(range_tup): + """Normalize a (first_byte,last_byte) range tuple. + Return a tuple whose first element is guaranteed to be an int + and whose second element will be '' (meaning: the last byte) or + an int. Finally, return None if the normalized tuple == (0,'') + as that is equivelant to retrieving the entire file. + """ + if range_tup is None: return None + # handle first byte + fb = range_tup[0] + if fb in (None,''): fb = 0 + else: fb = int(fb) + # handle last byte + try: lb = range_tup[1] + except IndexError: lb = '' + else: + if lb is None: lb = '' + elif lb != '': lb = int(lb) + # check if range is over the entire file + if (fb,lb) == (0,''): return None + # check that the range is valid + if lb < fb: raise RangeError('Invalid byte range: %s-%s' % (fb,lb)) + return (fb,lb) + diff --git a/mic/urlgrabber/grabber.py b/mic/urlgrabber/grabber.py new file mode 100644 index 0000000..20e7899 --- /dev/null +++ b/mic/urlgrabber/grabber.py @@ -0,0 +1,1477 @@ +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, +# Boston, MA 02111-1307 USA + +# This file is part of urlgrabber, a high-level cross-protocol url-grabber +# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko + +"""A high-level cross-protocol url-grabber. + +GENERAL ARGUMENTS (kwargs) + + Where possible, the module-level default is indicated, and legal + values are provided. + + copy_local = 0 [0|1] + + ignored except for file:// urls, in which case it specifies + whether urlgrab should still make a copy of the file, or simply + point to the existing copy. The module level default for this + option is 0. + + close_connection = 0 [0|1] + + tells URLGrabber to close the connection after a file has been + transfered. This is ignored unless the download happens with the + http keepalive handler (keepalive=1). Otherwise, the connection + is left open for further use. The module level default for this + option is 0 (keepalive connections will not be closed). + + keepalive = 1 [0|1] + + specifies whether keepalive should be used for HTTP/1.1 servers + that support it. The module level default for this option is 1 + (keepalive is enabled). + + progress_obj = None + + a class instance that supports the following methods: + po.start(filename, url, basename, length, text) + # length will be None if unknown + po.update(read) # read == bytes read so far + po.end() + + text = None + + specifies an alternativ text item in the beginning of the progress + bar line. If not given, the basename of the file is used. + + throttle = 1.0 + + a number - if it's an int, it's the bytes/second throttle limit. + If it's a float, it is first multiplied by bandwidth. 
If throttle + == 0, throttling is disabled. If None, the module-level default + (which can be set on default_grabber.throttle) is used. See + BANDWIDTH THROTTLING for more information. + + timeout = None + + a positive float expressing the number of seconds to wait for socket + operations. If the value is None or 0.0, socket operations will block + forever. Setting this option causes urlgrabber to call the settimeout + method on the Socket object used for the request. See the Python + documentation on settimeout for more information. + http://www.python.org/doc/current/lib/socket-objects.html + + bandwidth = 0 + + the nominal max bandwidth in bytes/second. If throttle is a float + and bandwidth == 0, throttling is disabled. If None, the + module-level default (which can be set on + default_grabber.bandwidth) is used. See BANDWIDTH THROTTLING for + more information. + + range = None + + a tuple of the form (first_byte, last_byte) describing a byte + range to retrieve. Either or both of the values may be set to + None. If first_byte is None, byte offset 0 is assumed. If + last_byte is None, the last byte available is assumed. Note that + the range specification is python-like in that (0,10) will yield + the first 10 bytes of the file. + + If set to None, no range will be used. + + reget = None [None|'simple'|'check_timestamp'] + + whether to attempt to reget a partially-downloaded file. Reget + only applies to .urlgrab and (obviously) only if there is a + partially downloaded file. Reget has two modes: + + 'simple' -- the local file will always be trusted. If there + are 100 bytes in the local file, then the download will always + begin 100 bytes into the requested file. + + 'check_timestamp' -- the timestamp of the server file will be + compared to the timestamp of the local file. ONLY if the + local file is newer than or the same age as the server file + will reget be used. If the server file is newer, or the + timestamp is not returned, the entire file will be fetched. + + NOTE: urlgrabber can do very little to verify that the partial + file on disk is identical to the beginning of the remote file. + You may want to either employ a custom "checkfunc" or simply avoid + using reget in situations where corruption is a concern. + + user_agent = 'urlgrabber/VERSION' + + a string, usually of the form 'AGENT/VERSION' that is provided to + HTTP servers in the User-agent header. The module level default + for this option is "urlgrabber/VERSION". + + http_headers = None + + a tuple of 2-tuples, each containing a header and value. These + will be used for http and https requests only. For example, you + can do + http_headers = (('Pragma', 'no-cache'),) + + ftp_headers = None + + this is just like http_headers, but will be used for ftp requests. + + proxies = None + + a dictionary that maps protocol schemes to proxy hosts. For + example, to use a proxy server on host "foo" port 3128 for http + and https URLs: + proxies={ 'http' : 'http://foo:3128', 'https' : 'http://foo:3128' } + note that proxy authentication information may be provided using + normal URL constructs: + proxies={ 'http' : 'http://user:password@foo:3128' } + Lastly, if proxies is None, the default environment settings will + be used. + + prefix = None + + a url prefix that will be prepended to all requested urls. 
For + example: + g = URLGrabber(prefix='http://foo.com/mirror/') + g.urlgrab('some/file.txt') + ## this will fetch 'http://foo.com/mirror/some/file.txt' + This option exists primarily to allow identical behavior to + MirrorGroup (and derived) instances. Note: a '/' will be inserted + if necessary, so you cannot specify a prefix that ends with a + partial file or directory name. + + opener = None + + Overrides the default urllib2.OpenerDirector provided to urllib2 + when making requests. This option exists so that the urllib2 + handler chain may be customized. Note that the range, reget, + proxy, and keepalive features require that custom handlers be + provided to urllib2 in order to function properly. If an opener + option is provided, no attempt is made by urlgrabber to ensure + chain integrity. You are responsible for ensuring that any + extension handlers are present if said features are required. + + data = None + + Only relevant for the HTTP family (and ignored for other + protocols), this allows HTTP POSTs. When the data kwarg is + present (and not None), an HTTP request will automatically become + a POST rather than GET. This is done by direct passthrough to + urllib2. If you use this, you may also want to set the + 'Content-length' and 'Content-type' headers with the http_headers + option. Note that python 2.2 handles the case of these + badly and if you do not use the proper case (shown here), your + values will be overridden with the defaults. + + +RETRY RELATED ARGUMENTS + + retry = None + + the number of times to retry the grab before bailing. If this is + zero, it will retry forever. This was intentional... really, it + was :). If this value is not supplied or is supplied but is None + retrying does not occur. + + retrycodes = [-1,2,4,5,6,7] + + a sequence of errorcodes (values of e.errno) for which it should + retry. See the doc on URLGrabError for more details on this. You + might consider modifying a copy of the default codes rather than + building yours from scratch so that if the list is extended in the + future (or one code is split into two) you can still enjoy the + benefits of the default list. You can do that with something like + this: + + retrycodes = urlgrabber.grabber.URLGrabberOptions().retrycodes + if 12 not in retrycodes: + retrycodes.append(12) + + checkfunc = None + + a function to do additional checks. This defaults to None, which + means no additional checking. The function should simply return + on a successful check. It should raise URLGrabError on an + unsuccessful check. Raising of any other exception will be + considered immediate failure and no retries will occur. + + If it raises URLGrabError, the error code will determine the retry + behavior. Negative error numbers are reserved for use by these + passed in functions, so you can use many negative numbers for + different types of failure. By default, -1 results in a retry, + but this can be customized with retrycodes. + + If you simply pass in a function, it will be given exactly one + argument: a CallbackObject instance with the .url attribute + defined and either .filename (for urlgrab) or .data (for urlread). + For urlgrab, .filename is the name of the local file. For + urlread, .data is the actual string data. 
If you need other + arguments passed to the callback (program state of some sort), you + can do so like this: + + checkfunc=(function, ('arg1', 2), {'kwarg': 3}) + + if the downloaded file has filename /tmp/stuff, then this will + result in this call (for urlgrab): + + function(obj, 'arg1', 2, kwarg=3) + # obj.filename = '/tmp/stuff' + # obj.url = 'http://foo.com/stuff' + + NOTE: both the "args" tuple and "kwargs" dict must be present if + you use this syntax, but either (or both) can be empty. + + failure_callback = None + + The callback that gets called during retries when an attempt to + fetch a file fails. The syntax for specifying the callback is + identical to checkfunc, except for the attributes defined in the + CallbackObject instance. The attributes for failure_callback are: + + exception = the raised exception + url = the url we're trying to fetch + tries = the number of tries so far (including this one) + retry = the value of the retry option + + The callback is present primarily to inform the calling program of + the failure, but if it raises an exception (including the one it's + passed) that exception will NOT be caught and will therefore cause + future retries to be aborted. + + The callback is called for EVERY failure, including the last one. + On the last try, the callback can raise an alternate exception, + but it cannot (without severe trickiness) prevent the exception + from being raised. + + interrupt_callback = None + + This callback is called if KeyboardInterrupt is received at any + point in the transfer. Basically, this callback can have three + impacts on the fetch process based on the way it exits: + + 1) raise no exception: the current fetch will be aborted, but + any further retries will still take place + + 2) raise a URLGrabError: if you're using a MirrorGroup, then + this will prompt a failover to the next mirror according to + the behavior of the MirrorGroup subclass. It is recommended + that you raise URLGrabError with code 15, 'user abort'. If + you are NOT using a MirrorGroup subclass, then this is the + same as (3). + + 3) raise some other exception (such as KeyboardInterrupt), which + will not be caught at either the grabber or mirror levels. + That is, it will be raised up all the way to the caller. + + This callback is very similar to failure_callback. They are + passed the same arguments, so you could use the same function for + both. + + urlparser = URLParser() + + The URLParser class handles pre-processing of URLs, including + auth-handling for user/pass encoded in http urls, file handling + (that is, filenames not sent as a URL), and URL quoting. If you + want to override any of this behavior, you can pass in a + replacement instance. See also the 'quote' option. + + quote = None + + Whether or not to quote the path portion of a url. + quote = 1 -> quote the URLs (they're not quoted yet) + quote = 0 -> do not quote them (they're already quoted) + quote = None -> guess what to do + + This option only affects proper urls like 'file:///etc/passwd'; it + does not affect 'raw' filenames like '/etc/passwd'. The latter + will always be quoted as they are converted to URLs. Also, only + the path part of a url is quoted. If you need more fine-grained + control, you should probably subclass URLParser and pass it in via + the 'urlparser' option. 
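+ + As a combined sketch of the retry arguments above (the URL, paths, + and the empty-file check here are hypothetical, for illustration + only): + + import os + from urlgrabber.grabber import URLGrabber, URLGrabError + + def my_check(obj): + # obj.filename is set for urlgrab calls + if os.path.getsize(obj.filename) == 0: + raise URLGrabError(-1, 'empty file, retrying') + + g = URLGrabber(retry=3, checkfunc=my_check) + g.urlgrab('http://example.com/some/file.txt', '/tmp/file.txt') 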
+ +BANDWIDTH THROTTLING + + urlgrabber supports throttling via two values: throttle and + bandwidth. Between the two, you can either specify an absolute + throttle threshold or specify a threshold as a fraction of maximum + available bandwidth. + + throttle is a number - if it's an int, it's the bytes/second + throttle limit. If it's a float, it is first multiplied by + bandwidth. If throttle == 0, throttling is disabled. If None, the + module-level default (which can be set with set_throttle) is used. + + bandwidth is the nominal max bandwidth in bytes/second. If throttle + is a float and bandwidth == 0, throttling is disabled. If None, the + module-level default (which can be set with set_bandwidth) is used. + + THROTTLING EXAMPLES: + + Let's say you have a 100 Mbps connection. This is (about) 10^8 bits + per second, or 12,500,000 Bytes per second. You have a number of + throttling options: + + *) set_bandwidth(12500000); set_throttle(0.5) # throttle is a float + + This will limit urlgrab to use half of your available bandwidth. + + *) set_throttle(6250000) # throttle is an int + + This will also limit urlgrab to use half of your available + bandwidth, regardless of what bandwidth is set to. + + *) set_bandwidth(6250000); set_throttle(1.0) # float + + Use half your bandwidth + + *) set_bandwidth(6250000); set_throttle(2.0) # float + + Use up to 12,500,000 Bytes per second (your nominal max bandwidth) + + *) set_bandwidth(6250000); set_throttle(0) # throttle = 0 + + Disable throttling - this is more efficient than a very large + throttle setting. + + *) set_bandwidth(0); set_throttle(1.0) # throttle is float, bandwidth = 0 + + Disable throttling - this is the default when the module is loaded. + + SUGGESTED AUTHOR IMPLEMENTATION (THROTTLING) + + While this is flexible, it's not extremely obvious to the user. I + suggest you implement a float throttle as a percent to make the + distinction between absolute and relative throttling very explicit. + + Also, you may want to convert the units to something more convenient + than bytes/second, such as kbps or kB/s, etc. + +""" + +# $Id: grabber.py,v 1.48 2006/09/22 00:58:05 mstenner Exp $ + +import os +import os.path +import sys +import urlparse +import rfc822 +import time +import string +import urllib +import urllib2 +from stat import * # S_* and ST_* + +######################################################################## +# MODULE INITIALIZATION +######################################################################## +try: + exec('from ' + (__name__.split('.'))[0] + ' import __version__') +except: + __version__ = '???' + +import sslfactory + +auth_handler = urllib2.HTTPBasicAuthHandler( \ + urllib2.HTTPPasswordMgrWithDefaultRealm()) + +try: + from i18n import _ +except ImportError, msg: + def _(st): return st + +try: + from httplib import HTTPException +except ImportError, msg: + HTTPException = None + +try: + # This is a convenient way to make keepalive optional. + # Just rename the module so it can't be imported. 
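+ # keepalive is a soft dependency: if this import fails, opener + # construction below simply falls back to urllib2's stock HTTPHandler.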
+ import keepalive + from keepalive import HTTPHandler, HTTPSHandler + have_keepalive = True +except ImportError, msg: + have_keepalive = False + +try: + # add in range support conditionally too + import byterange + from byterange import HTTPRangeHandler, HTTPSRangeHandler, \ + FileRangeHandler, FTPRangeHandler, range_tuple_normalize, \ + range_tuple_to_header, RangeError +except ImportError, msg: + range_handlers = () + RangeError = None + have_range = 0 +else: + range_handlers = (HTTPRangeHandler(), HTTPSRangeHandler(), + FileRangeHandler(), FTPRangeHandler()) + have_range = 1 + + +# check whether socket timeout support is available (Python >= 2.3) +import socket +try: + TimeoutError = socket.timeout + have_socket_timeout = True +except AttributeError: + TimeoutError = None + have_socket_timeout = False + +######################################################################## +# functions for debugging output. These functions are here because they +# are also part of the module initialization. +DEBUG = None +def set_logger(DBOBJ): + """Set the DEBUG object. This is called by _init_default_logger when + the environment variable URLGRABBER_DEBUG is set, but can also be + called by a calling program. Basically, if the calling program uses + the logging module and would like to incorporate urlgrabber logging, + then it can do so this way. It's probably not necessary as most + internal logging is only for debugging purposes. + + The passed-in object should be a logging.Logger instance. It will + be pushed into the keepalive and byterange modules if they're + being used. The mirror module pulls this object in on import, so + you will need to push it into that module manually. In fact, you + may find it tidier to simply push your logging object (or objects) + into each of these modules independently. + """ + + global DEBUG + DEBUG = DBOBJ + if have_keepalive and keepalive.DEBUG is None: + keepalive.DEBUG = DBOBJ + if have_range and byterange.DEBUG is None: + byterange.DEBUG = DBOBJ + if sslfactory.DEBUG is None: + sslfactory.DEBUG = DBOBJ + +def _init_default_logger(): + '''Examines the environment variable URLGRABBER_DEBUG and creates + a logging object (logging.logger) based on the contents. It takes + the form + + URLGRABBER_DEBUG=level,filename + + where "level" can be either an integer or a log level from the + logging module (DEBUG, INFO, etc). If the integer is zero or + less, logging will be disabled. Filename is the filename where + logs will be sent. If it is "-", then stdout will be used. If + the filename is empty or missing, stderr will be used. If the + variable cannot be processed or the logging module cannot be + imported (python < 2.3) then logging will be disabled. Here are + some examples: + + URLGRABBER_DEBUG=1,debug.txt # log everything to debug.txt + URLGRABBER_DEBUG=WARNING,- # log warning and higher to stdout + URLGRABBER_DEBUG=INFO # log info and higher to stderr + + This function is called during module initialization. It is not + intended to be called from outside. 
The only reason it is a + function at all is to keep the module-level namespace tidy and to + collect the code into a nice block.''' + + try: + dbinfo = os.environ['URLGRABBER_DEBUG'].split(',') + import logging + level = logging._levelNames.get(dbinfo[0], int(dbinfo[0])) + if level < 1: raise ValueError() + + formatter = logging.Formatter('%(asctime)s %(message)s') + if len(dbinfo) > 1: filename = dbinfo[1] + else: filename = '' + if filename == '': handler = logging.StreamHandler(sys.stderr) + elif filename == '-': handler = logging.StreamHandler(sys.stdout) + else: handler = logging.FileHandler(filename) + handler.setFormatter(formatter) + DBOBJ = logging.getLogger('urlgrabber') + DBOBJ.addHandler(handler) + DBOBJ.setLevel(level) + except (KeyError, ImportError, ValueError): + DBOBJ = None + set_logger(DBOBJ) + +_init_default_logger() +######################################################################## +# END MODULE INITIALIZATION +######################################################################## + + + +class URLGrabError(IOError): + """ + URLGrabError error codes: + + URLGrabber error codes (0 -- 255) + 0 - everything looks good (you should never see this) + 1 - malformed url + 2 - local file doesn't exist + 3 - request for non-file local file (dir, etc) + 4 - IOError on fetch + 5 - OSError on fetch + 6 - no content length header when we expected one + 7 - HTTPException + 8 - Exceeded read limit (for urlread) + 9 - Requested byte range not satisfiable. + 10 - Byte range requested, but range support unavailable + 11 - Illegal reget mode + 12 - Socket timeout + 13 - malformed proxy url + 14 - HTTPError (includes .code and .exception attributes) + 15 - user abort + + MirrorGroup error codes (256 -- 511) + 256 - No more mirrors left to try + + Custom (non-builtin) classes derived from MirrorGroup (512 -- 767) + [ this range reserved for application-specific error codes ] + + Retry codes (< 0) + -1 - retry the download, unknown reason + + Note: to test which group a code is in, you can simply do integer + division by 256: e.errno / 256 + + Negative codes are reserved for use by functions passed in to + retrygrab with checkfunc. The value -1 is built in as a generic + retry code and is already included in the retrycodes list. + Therefore, you can create a custom check function that simply + returns -1 and the fetch will be re-tried. For more customized + retries, you can use other negative numbers and include them in + retrycodes. This is nice for outputting useful messages about + what failed. + + You can use these error codes like so: + try: urlgrab(url) + except URLGrabError, e: + if e.errno == 3: ... + # or + print e.strerror + # or simply + print e #### print '[Errno %i] %s' % (e.errno, e.strerror) + """ + pass + +class CallbackObject: + """Container for returned callback data. + + This is currently a dummy class into which urlgrabber can stuff + information for passing to callbacks. This way, the prototype for + all callbacks is the same, regardless of the data that will be + passed back. Any function that accepts a callback function as an + argument SHOULD document what it will define in this object. + + It is possible that this class will have some greater + functionality in the future. + """ + def __init__(self, **kwargs): + self.__dict__.update(kwargs) + +def urlgrab(url, filename=None, **kwargs): + """grab the file at <url> and make a local copy at <filename> + If filename is None, the basename of the url is used. 
+ urlgrab returns the filename of the local file, which may be different + from the passed-in filename if the copy_local kwarg == 0. + + See module documentation for a description of possible kwargs. + """ + return default_grabber.urlgrab(url, filename, **kwargs) + +def urlopen(url, **kwargs): + """open the url and return a file object + If a progress object or throttle specifications exist, then + a special file object will be returned that supports them. + The file object can be treated like any other file object. + + See module documentation for a description of possible kwargs. + """ + return default_grabber.urlopen(url, **kwargs) + +def urlread(url, limit=None, **kwargs): + """read the url into a string, up to 'limit' bytes + If the limit is exceeded, an exception will be thrown. Note that urlread + is NOT intended to be used as a way of saying "I want the first N bytes" + but rather 'read the whole file into memory, but don't use too much' + + See module documentation for a description of possible kwargs. + """ + return default_grabber.urlread(url, limit, **kwargs) + + +class URLParser: + """Process the URLs before passing them to urllib2. + + This class does several things: + + * add any prefix + * translate a "raw" file to a proper file: url + * handle any http or https auth that's encoded within the url + * quote the url + + Only the "parse" method is called directly, and it calls sub-methods. + + An instance of this class is held in the options object, which + means that it's easy to change the behavior by sub-classing and + passing the replacement in. It need only have a method like: + + url, parts = urlparser.parse(url, opts) + """ + + def parse(self, url, opts): + """parse the url and return the (modified) url and its parts + + Note: a raw file WILL be quoted when it's converted to a URL. + However, other urls (ones which come with a proper scheme) may + or may not be quoted according to opts.quote + + opts.quote = 1 --> quote it + opts.quote = 0 --> do not quote it + opts.quote = None --> guess + """ + quote = opts.quote + + if opts.prefix: + url = self.add_prefix(url, opts.prefix) + + parts = urlparse.urlparse(url) + (scheme, host, path, parm, query, frag) = parts + + if not scheme or (len(scheme) == 1 and scheme in string.letters): + # if a scheme isn't specified, we guess that it's "file:" + if url[0] not in '/\\': url = os.path.abspath(url) + url = 'file:' + urllib.pathname2url(url) + parts = urlparse.urlparse(url) + quote = 0 # pathname2url quotes, so we won't do it again + + if scheme in ['http', 'https']: + parts = self.process_http(parts) + + if quote is None: + quote = self.guess_should_quote(parts) + if quote: + parts = self.quote(parts) + + url = urlparse.urlunparse(parts) + return url, parts + + def add_prefix(self, url, prefix): + if prefix[-1] == '/' or url[0] == '/': + url = prefix + url + else: + url = prefix + '/' + url + return url + + def process_http(self, parts): + (scheme, host, path, parm, query, frag) = parts + + if '@' in host and auth_handler: + try: + user_pass, host = host.split('@', 1) + if ':' in user_pass: + user, password = user_pass.split(':', 1) + else: + # no password given; default to an empty one so the + # references below cannot hit an unbound name + user, password = user_pass, '' + except ValueError, e: + raise URLGrabError(1, _('Bad URL: %s') % (host, )) + if DEBUG: DEBUG.info('adding HTTP auth: %s, %s', user, password) + auth_handler.add_password(None, host, user, password) + + return (scheme, host, path, parm, query, frag) + + def quote(self, parts): + """quote the URL + + This method quotes ONLY the path part. 
If you need to quote + other parts, you should override this and pass in your derived + class. The other alternative is to quote other parts before + passing into urlgrabber. + """ + (scheme, host, path, parm, query, frag) = parts + path = urllib.quote(path) + return (scheme, host, path, parm, query, frag) + + hexvals = '0123456789ABCDEF' + def guess_should_quote(self, parts): + """ + Guess whether we should quote a path. This amounts to + guessing whether it's already quoted. + + find ' ' -> 1 + find '%' -> 1 + find '%XX' -> 0 + else -> 1 + """ + (scheme, host, path, parm, query, frag) = parts + if ' ' in path: + return 1 + ind = string.find(path, '%') + if ind > -1: + while ind > -1: + if len(path) < ind+3: + return 1 + code = path[ind+1:ind+3].upper() + if code[0] not in self.hexvals or \ + code[1] not in self.hexvals: + return 1 + ind = string.find(path, '%', ind+1) + return 0 + return 1 + +class URLGrabberOptions: + """Class to ease kwargs handling.""" + + def __init__(self, delegate=None, **kwargs): + """Initialize URLGrabberOptions object. + Set default values for all options and then update options specified + in kwargs. + """ + self.delegate = delegate + if delegate is None: + self._set_defaults() + self._set_attributes(**kwargs) + + def __getattr__(self, name): + if self.delegate and hasattr(self.delegate, name): + return getattr(self.delegate, name) + raise AttributeError, name + + def raw_throttle(self): + """Calculate raw throttle value from throttle and bandwidth + values. + """ + if self.throttle <= 0: + return 0 + elif type(self.throttle) == type(0): + return float(self.throttle) + else: # throttle is a float + return self.bandwidth * self.throttle + + def derive(self, **kwargs): + """Create a derived URLGrabberOptions instance. + This method creates a new instance and overrides the + options specified in kwargs. + """ + return URLGrabberOptions(delegate=self, **kwargs) + + def _set_attributes(self, **kwargs): + """Update object attributes with those provided in kwargs.""" + self.__dict__.update(kwargs) + if have_range and kwargs.has_key('range'): + # normalize the supplied range value + self.range = range_tuple_normalize(self.range) + if not self.reget in [None, 'simple', 'check_timestamp']: + raise URLGrabError(11, _('Illegal reget mode: %s') \ + % (self.reget, )) + + def _set_defaults(self): + """Set all options to their default values. + When adding new options, make sure a default is + provided here. + """ + self.progress_obj = None + self.throttle = 1.0 + self.bandwidth = 0 + self.retry = None + self.retrycodes = [-1,2,4,5,6,7] + self.checkfunc = None + self.copy_local = 0 + self.close_connection = 0 + self.range = None + self.user_agent = 'urlgrabber/%s' % __version__ + self.keepalive = 1 + self.proxies = None + self.reget = None + self.failure_callback = None + self.interrupt_callback = None + self.prefix = None + self.opener = None + self.cache_openers = True + self.timeout = None + self.text = None + self.http_headers = None + self.ftp_headers = None + self.data = None + self.urlparser = URLParser() + self.quote = None + self.ssl_ca_cert = None + self.ssl_context = None + +class URLGrabber: + """Provides easy opening of URLs with a variety of options. + + All options are specified as kwargs. Options may be specified when + the class is created and may be overridden on a per request basis. + + New objects inherit default values from default_grabber. 
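+ + A brief sketch (the URL and path are hypothetical): + + g = URLGrabber(keepalive=0) + # per-request kwargs override the instance defaults set above + g.urlgrab('http://example.com/file.txt', '/tmp/file.txt', keepalive=1) 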
+ """ + + def __init__(self, **kwargs): + self.opts = URLGrabberOptions(**kwargs) + + def _retry(self, opts, func, *args): + tries = 0 + while 1: + # there are only two ways out of this loop. The second has + # several "sub-ways" + # 1) via the return in the "try" block + # 2) by some exception being raised + # a) an excepton is raised that we don't "except" + # b) a callback raises ANY exception + # c) we're not retry-ing or have run out of retries + # d) the URLGrabError code is not in retrycodes + # beware of infinite loops :) + tries = tries + 1 + exception = None + retrycode = None + callback = None + if DEBUG: DEBUG.info('attempt %i/%s: %s', + tries, opts.retry, args[0]) + try: + r = apply(func, (opts,) + args, {}) + if DEBUG: DEBUG.info('success') + return r + except URLGrabError, e: + exception = e + callback = opts.failure_callback + retrycode = e.errno + except KeyboardInterrupt, e: + exception = e + callback = opts.interrupt_callback + + if DEBUG: DEBUG.info('exception: %s', exception) + if callback: + if DEBUG: DEBUG.info('calling callback: %s', callback) + cb_func, cb_args, cb_kwargs = self._make_callback(callback) + obj = CallbackObject(exception=exception, url=args[0], + tries=tries, retry=opts.retry) + cb_func(obj, *cb_args, **cb_kwargs) + + if (opts.retry is None) or (tries == opts.retry): + if DEBUG: DEBUG.info('retries exceeded, re-raising') + raise + + if (retrycode is not None) and (retrycode not in opts.retrycodes): + if DEBUG: DEBUG.info('retrycode (%i) not in list %s, re-raising', + retrycode, opts.retrycodes) + raise + + def urlopen(self, url, **kwargs): + """open the url and return a file object + If a progress object or throttle value specified when this + object was created, then a special file object will be + returned that supports them. The file object can be treated + like any other file object. + """ + opts = self.opts.derive(**kwargs) + (url,parts) = opts.urlparser.parse(url, opts) + def retryfunc(opts, url): + return URLGrabberFileObject(url, filename=None, opts=opts) + return self._retry(opts, retryfunc, url) + + def urlgrab(self, url, filename=None, **kwargs): + """grab the file at and make a local copy at + If filename is none, the basename of the url is used. + urlgrab returns the filename of the local file, which may be + different from the passed-in filename if copy_local == 0. 
+ """ + opts = self.opts.derive(**kwargs) + (url,parts) = opts.urlparser.parse(url, opts) + (scheme, host, path, parm, query, frag) = parts + if filename is None: + filename = os.path.basename( urllib.unquote(path) ) + if scheme == 'file' and not opts.copy_local: + # just return the name of the local file - don't make a + # copy currently + path = urllib.url2pathname(path) + if host: + path = os.path.normpath('//' + host + path) + if not os.path.exists(path): + raise URLGrabError(2, + _('Local file does not exist: %s') % (path, )) + elif not os.path.isfile(path): + raise URLGrabError(3, + _('Not a normal file: %s') % (path, )) + elif not opts.range: + return path + + def retryfunc(opts, url, filename): + fo = URLGrabberFileObject(url, filename, opts) + try: + fo._do_grab() + if not opts.checkfunc is None: + cb_func, cb_args, cb_kwargs = \ + self._make_callback(opts.checkfunc) + obj = CallbackObject() + obj.filename = filename + obj.url = url + apply(cb_func, (obj, )+cb_args, cb_kwargs) + finally: + fo.close() + return filename + + return self._retry(opts, retryfunc, url, filename) + + def urlread(self, url, limit=None, **kwargs): + """read the url into a string, up to 'limit' bytes + If the limit is exceeded, an exception will be thrown. Note + that urlread is NOT intended to be used as a way of saying + "I want the first N bytes" but rather 'read the whole file + into memory, but don't use too much' + """ + opts = self.opts.derive(**kwargs) + (url,parts) = opts.urlparser.parse(url, opts) + if limit is not None: + limit = limit + 1 + + def retryfunc(opts, url, limit): + fo = URLGrabberFileObject(url, filename=None, opts=opts) + s = '' + try: + # this is an unfortunate thing. Some file-like objects + # have a default "limit" of None, while the built-in (real) + # file objects have -1. They each break the other, so for + # now, we just force the default if necessary. + if limit is None: s = fo.read() + else: s = fo.read(limit) + + if not opts.checkfunc is None: + cb_func, cb_args, cb_kwargs = \ + self._make_callback(opts.checkfunc) + obj = CallbackObject() + obj.data = s + obj.url = url + apply(cb_func, (obj, )+cb_args, cb_kwargs) + finally: + fo.close() + return s + + s = self._retry(opts, retryfunc, url, limit) + if limit and len(s) > limit: + raise URLGrabError(8, + _('Exceeded limit (%i): %s') % (limit, url)) + return s + + def _make_callback(self, callback_obj): + if callable(callback_obj): + return callback_obj, (), {} + else: + return callback_obj + +# create the default URLGrabber used by urlXXX functions. +# NOTE: actual defaults are set in URLGrabberOptions +default_grabber = URLGrabber() + +class URLGrabberFileObject: + """This is a file-object wrapper that supports progress objects + and throttling. + + This exists to solve the following problem: lets say you want to + drop-in replace a normal open with urlopen. You want to use a + progress meter and/or throttling, but how do you do that without + rewriting your code? Answer: urlopen will return a wrapped file + object that does the progress meter and-or throttling internally. + """ + + def __init__(self, url, filename, opts): + self.url = url + self.filename = filename + self.opts = opts + self.fo = None + self._rbuf = '' + self._rbufsize = 1024*8 + self._ttime = time.time() + self._tsize = 0 + self._amount_read = 0 + self._opener = None + self._do_open() + + def __getattr__(self, name): + """This effectively allows us to wrap at the instance level. + Any attribute not found in _this_ object will be searched for + in self.fo. 
This includes methods.""" + if hasattr(self.fo, name): + return getattr(self.fo, name) + raise AttributeError, name + + def _get_opener(self): + """Build a urllib2 OpenerDirector based on request options.""" + if self.opts.opener: + return self.opts.opener + elif self._opener is None: + handlers = [] + need_keepalive_handler = (have_keepalive and self.opts.keepalive) + need_range_handler = (range_handlers and \ + (self.opts.range or self.opts.reget)) + # if you specify a ProxyHandler when creating the opener + # it _must_ come before all other handlers in the list or urllib2 + # chokes. + if self.opts.proxies: + handlers.append( CachedProxyHandler(self.opts.proxies) ) + + # ------------------------------------------------------- + # OK, these next few lines are a serious kludge to get + # around what I think is a bug in python 2.2's + # urllib2. The basic idea is that default handlers + # get applied first. If you override one (like a + # proxy handler), then the default gets pulled, but + # the replacement goes on the end. In the case of + # proxies, this means the normal handler picks it up + # first and the proxy isn't used. Now, this probably + # only happened with ftp or non-keepalive http, so not + # many folks saw it. The simple approach to fixing it + # is just to make sure you override the other + # conflicting defaults as well. I would LOVE to see + # these go away or be dealt with more elegantly. The + # problem isn't there after 2.2. -MDS 2005/02/24 + if not need_keepalive_handler: + handlers.append( urllib2.HTTPHandler() ) + if not need_range_handler: + handlers.append( urllib2.FTPHandler() ) + # ------------------------------------------------------- + + ssl_factory = sslfactory.get_factory(self.opts.ssl_ca_cert, + self.opts.ssl_context) + + if need_keepalive_handler: + handlers.append(HTTPHandler()) + handlers.append(HTTPSHandler(ssl_factory)) + if need_range_handler: + handlers.extend( range_handlers ) + handlers.append( auth_handler ) + if self.opts.cache_openers: + self._opener = CachedOpenerDirector(ssl_factory, *handlers) + else: + self._opener = ssl_factory.create_opener(*handlers) + # OK, I don't like to do this, but otherwise, we end up with + # TWO user-agent headers. + self._opener.addheaders = [] + return self._opener + + def _do_open(self): + opener = self._get_opener() + + req = urllib2.Request(self.url, self.opts.data) # build request object + self._add_headers(req) # add misc headers that we need + self._build_range(req) # take care of reget and byterange stuff + + fo, hdr = self._make_request(req, opener) + if self.reget_time and self.opts.reget == 'check_timestamp': + # do this if we have a local file with known timestamp AND + # we're in check_timestamp reget mode. + fetch_again = 0 + try: + modified_tuple = hdr.getdate_tz('last-modified') + modified_stamp = rfc822.mktime_tz(modified_tuple) + if modified_stamp > self.reget_time: fetch_again = 1 + except (TypeError,): + fetch_again = 1 + + if fetch_again: + # the server version is newer than the (incomplete) local + # version, so we should abandon the version we're getting + # and fetch the whole thing again. 
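+ # closing this response and dropping its Range header below restarts + # the request from byte 0 and overwrites the partial local file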
+ fo.close() + self.opts.reget = None + del req.headers['Range'] + self._build_range(req) + fo, hdr = self._make_request(req, opener) + + (scheme, host, path, parm, query, frag) = urlparse.urlparse(self.url) + path = urllib.unquote(path) + if not (self.opts.progress_obj or self.opts.raw_throttle() \ + or self.opts.timeout): + # if we're not using the progress_obj, throttling, or timeout + # we can get a performance boost by going directly to + # the underlying fileobject for reads. + self.read = fo.read + if hasattr(fo, 'readline'): + self.readline = fo.readline + elif self.opts.progress_obj: + try: + length = int(hdr['Content-Length']) + length = length + self._amount_read # Account for regets + except (KeyError, ValueError, TypeError): + length = None + + self.opts.progress_obj.start(str(self.filename), + urllib.unquote(self.url), + os.path.basename(path), + length, text=self.opts.text) + self.opts.progress_obj.update(0) + (self.fo, self.hdr) = (fo, hdr) + + def _add_headers(self, req): + if self.opts.user_agent: + req.add_header('User-agent', self.opts.user_agent) + try: req_type = req.get_type() + except ValueError: req_type = None + if self.opts.http_headers and req_type in ('http', 'https'): + for h, v in self.opts.http_headers: + req.add_header(h, v) + if self.opts.ftp_headers and req_type == 'ftp': + for h, v in self.opts.ftp_headers: + req.add_header(h, v) + + def _build_range(self, req): + self.reget_time = None + self.append = 0 + reget_length = 0 + rt = None + if have_range and self.opts.reget and type(self.filename) == type(''): + # we have reget turned on and we're dumping to a file + try: + s = os.stat(self.filename) + except OSError: + pass + else: + self.reget_time = s[ST_MTIME] + reget_length = s[ST_SIZE] + + # Set initial length when regetting + self._amount_read = reget_length + + rt = reget_length, '' + self.append = 1 + + if self.opts.range: + if not have_range: + raise URLGrabError(10, _('Byte range requested but range '\ + 'support unavailable')) + rt = self.opts.range + if rt[0]: rt = (rt[0] + reget_length, rt[1]) + + if rt: + header = range_tuple_to_header(rt) + if header: req.add_header('Range', header) + + def _make_request(self, req, opener): + try: + if have_socket_timeout and self.opts.timeout: + old_to = socket.getdefaulttimeout() + socket.setdefaulttimeout(self.opts.timeout) + try: + fo = opener.open(req) + finally: + socket.setdefaulttimeout(old_to) + else: + fo = opener.open(req) + hdr = fo.info() + except ValueError, e: + raise URLGrabError(1, _('Bad URL: %s') % (e, )) + except RangeError, e: + raise URLGrabError(9, str(e)) + except urllib2.HTTPError, e: + new_e = URLGrabError(14, str(e)) + new_e.code = e.code + new_e.exception = e + raise new_e + except IOError, e: + if hasattr(e, 'reason') and have_socket_timeout and \ + isinstance(e.reason, TimeoutError): + raise URLGrabError(12, _('Timeout: %s') % (e, )) + else: + raise URLGrabError(4, _('IOError: %s') % (e, )) + except OSError, e: + raise URLGrabError(5, _('OSError: %s') % (e, )) + except HTTPException, e: + raise URLGrabError(7, _('HTTP Exception (%s): %s') % \ + (e.__class__.__name__, e)) + else: + return (fo, hdr) + + def _do_grab(self): + """dump the file to self.filename.""" + if self.append: new_fo = open(self.filename, 'ab') + else: new_fo = open(self.filename, 'wb') + bs = 1024*8 + size = 0 + + block = self.read(bs) + size = size + len(block) + while block: + new_fo.write(block) + block = self.read(bs) + size = size + len(block) + + new_fo.close() + try: + modified_tuple = 
self.hdr.getdate_tz('last-modified') + modified_stamp = rfc822.mktime_tz(modified_tuple) + os.utime(self.filename, (modified_stamp, modified_stamp)) + except (TypeError,), e: pass + + return size + + def _fill_buffer(self, amt=None): + """fill the buffer to contain at least 'amt' bytes by reading + from the underlying file object. If amt is None, then it will + read until it gets nothing more. It updates the progress meter + and throttles after every self._rbufsize bytes.""" + # the _rbuf test is only in this first 'if' for speed. It's not + # logically necessary + if self._rbuf and not amt is None: + L = len(self._rbuf) + if amt > L: + amt = amt - L + else: + return + + # if we've made it here, then we don't have enough in the buffer + # and we need to read more. + + buf = [self._rbuf] + bufsize = len(self._rbuf) + while amt is None or amt: + # first, delay if necessary for throttling reasons + if self.opts.raw_throttle(): + diff = self._tsize/self.opts.raw_throttle() - \ + (time.time() - self._ttime) + if diff > 0: time.sleep(diff) + self._ttime = time.time() + + # now read some data, up to self._rbufsize + if amt is None: readamount = self._rbufsize + else: readamount = min(amt, self._rbufsize) + try: + new = self.fo.read(readamount) + except socket.error, e: + raise URLGrabError(4, _('Socket Error: %s') % (e, )) + except TimeoutError, e: + raise URLGrabError(12, _('Timeout: %s') % (e, )) + except IOError, e: + raise URLGrabError(4, _('IOError: %s') %(e,)) + newsize = len(new) + if not newsize: break # no more to read + + if amt: amt = amt - newsize + buf.append(new) + bufsize = bufsize + newsize + self._tsize = newsize + self._amount_read = self._amount_read + newsize + if self.opts.progress_obj: + self.opts.progress_obj.update(self._amount_read) + + self._rbuf = string.join(buf, '') + return + + def read(self, amt=None): + self._fill_buffer(amt) + if amt is None: + s, self._rbuf = self._rbuf, '' + else: + s, self._rbuf = self._rbuf[:amt], self._rbuf[amt:] + return s + + def readline(self, limit=-1): + i = string.find(self._rbuf, '\n') + while i < 0 and not (0 < limit <= len(self._rbuf)): + L = len(self._rbuf) + self._fill_buffer(L + self._rbufsize) + if not len(self._rbuf) > L: break + i = string.find(self._rbuf, '\n', L) + + if i < 0: i = len(self._rbuf) + else: i = i+1 + if 0 <= limit < len(self._rbuf): i = limit + + s, self._rbuf = self._rbuf[:i], self._rbuf[i:] + return s + + def close(self): + if self.opts.progress_obj: + self.opts.progress_obj.end(self._amount_read) + self.fo.close() + if self.opts.close_connection: + try: self.fo.close_connection() + except: pass + +_handler_cache = [] +def CachedOpenerDirector(ssl_factory = None, *handlers): + for (cached_handlers, opener) in _handler_cache: + if cached_handlers == handlers: + for handler in opener.handlers: + handler.add_parent(opener) + return opener + if not ssl_factory: + ssl_factory = sslfactory.get_factory() + opener = ssl_factory.create_opener(*handlers) + _handler_cache.append( (handlers, opener) ) + return opener + +_proxy_cache = [] +def CachedProxyHandler(proxies): + for (pdict, handler) in _proxy_cache: + if pdict == proxies: + if DEBUG: DEBUG.debug('re-using proxy settings: %s', proxies) + break + else: + for k, v in proxies.items(): + utype, url = urllib.splittype(v) + host, other = urllib.splithost(url) + if (utype is None) or (host is None): + raise URLGrabError(13, _('Bad proxy URL: %s') % v) + + if DEBUG: DEBUG.info('creating new proxy handler: %s', proxies) + handler = urllib2.ProxyHandler(proxies) + 
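+        # 'proxies' is the standard urllib2.ProxyHandler mapping of
+        # scheme to proxy url, e.g. (values illustrative only):
+        #     {'http': 'http://proxy.example.com:3128/'}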
+        _proxy_cache.append( (proxies, handler) )
+    return handler
+
+#####################################################################
+#  DEPRECATED FUNCTIONS
+def set_throttle(new_throttle):
+    """Deprecated.  Use: default_grabber.throttle = new_throttle"""
+    default_grabber.throttle = new_throttle
+
+def set_bandwidth(new_bandwidth):
+    """Deprecated.  Use: default_grabber.bandwidth = new_bandwidth"""
+    default_grabber.bandwidth = new_bandwidth
+
+def set_progress_obj(new_progress_obj):
+    """Deprecated.  Use: default_grabber.progress_obj = new_progress_obj"""
+    default_grabber.progress_obj = new_progress_obj
+
+def set_user_agent(new_user_agent):
+    """Deprecated.  Use: default_grabber.user_agent = new_user_agent"""
+    default_grabber.user_agent = new_user_agent
+
+def retrygrab(url, filename=None, copy_local=0, close_connection=0,
+              progress_obj=None, throttle=None, bandwidth=None,
+              numtries=3, retrycodes=[-1,2,4,5,6,7], checkfunc=None):
+    """Deprecated. Use: urlgrab() with the retry arg instead"""
+    kwargs = {'copy_local' : copy_local,
+              'close_connection' : close_connection,
+              'progress_obj' : progress_obj,
+              'throttle' : throttle,
+              'bandwidth' : bandwidth,
+              'retry' : numtries,
+              'retrycodes' : retrycodes,
+              'checkfunc' : checkfunc
+              }
+    return urlgrab(url, filename, **kwargs)
+
+
+#####################################################################
+#  TESTING
+def _main_test():
+    import sys
+    try: url, filename = sys.argv[1:3]
+    except ValueError:
+        print 'usage:', sys.argv[0], \
+              '<url> <filename> [copy_local=0|1] [close_connection=0|1]'
+        sys.exit()
+
+    kwargs = {}
+    for a in sys.argv[3:]:
+        k, v = string.split(a, '=', 1)
+        kwargs[k] = int(v)
+
+    set_throttle(1.0)
+    set_bandwidth(32 * 1024)
+    print "throttle: %s,  throttle bandwidth: %s B/s" % (default_grabber.throttle,
+                                                         default_grabber.bandwidth)
+
+    try: from progress import text_progress_meter
+    except ImportError, e: pass
+    else: kwargs['progress_obj'] = text_progress_meter()
+
+    try: name = apply(urlgrab, (url, filename), kwargs)
+    except URLGrabError, e: print e
+    else: print 'LOCAL FILE:', name
+
+
+def _retry_test():
+    import sys
+    try: url, filename = sys.argv[1:3]
+    except ValueError:
+        print 'usage:', sys.argv[0], \
+              '<url> <filename> [copy_local=0|1] [close_connection=0|1]'
+        sys.exit()
+
+    kwargs = {}
+    for a in sys.argv[3:]:
+        k, v = string.split(a, '=', 1)
+        kwargs[k] = int(v)
+
+    try: from progress import text_progress_meter
+    except ImportError, e: pass
+    else: kwargs['progress_obj'] = text_progress_meter()
+
+    def cfunc(filename, hello, there='foo'):
+        print hello, there
+        import random
+        rnum = random.random()
+        if rnum < .5:
+            print 'forcing retry'
+            raise URLGrabError(-1, 'forcing retry')
+        if rnum < .75:
+            print 'forcing failure'
+            raise URLGrabError(-2, 'forcing immediate failure')
+        print 'success'
+        return
+
+    kwargs['checkfunc'] = (cfunc, ('hello',), {'there':'there'})
+    try: name = apply(retrygrab, (url, filename), kwargs)
+    except URLGrabError, e: print e
+    else: print 'LOCAL FILE:', name
+
+def _file_object_test(filename=None):
+    import random, cStringIO, sys
+    if filename is None:
+        filename = __file__
+    print 'using file "%s" for comparisons' % filename
+    fo = open(filename)
+    s_input = fo.read()
+    fo.close()
+
+    for testfunc in [_test_file_object_smallread,
+                     _test_file_object_readall,
+                     _test_file_object_readline,
+                     _test_file_object_readlines]:
+        fo_input = cStringIO.StringIO(s_input)
+        fo_output = cStringIO.StringIO()
+        wrapper = URLGrabberFileObject(fo_input, None, 0)
+        print 'testing %-30s ' % testfunc.__name__,
+        testfunc(wrapper, 
fo_output) + s_output = fo_output.getvalue() + if s_output == s_input: print 'passed' + else: print 'FAILED' + +def _test_file_object_smallread(wrapper, fo_output): + while 1: + s = wrapper.read(23) + fo_output.write(s) + if not s: return + +def _test_file_object_readall(wrapper, fo_output): + s = wrapper.read() + fo_output.write(s) + +def _test_file_object_readline(wrapper, fo_output): + while 1: + s = wrapper.readline() + fo_output.write(s) + if not s: return + +def _test_file_object_readlines(wrapper, fo_output): + li = wrapper.readlines() + fo_output.write(string.join(li, '')) + +if __name__ == '__main__': + _main_test() + _retry_test() + _file_object_test('test') diff --git a/mic/urlgrabber/keepalive.py b/mic/urlgrabber/keepalive.py new file mode 100644 index 0000000..71393e2 --- /dev/null +++ b/mic/urlgrabber/keepalive.py @@ -0,0 +1,617 @@ +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, +# Boston, MA 02111-1307 USA + +# This file is part of urlgrabber, a high-level cross-protocol url-grabber +# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko + +"""An HTTP handler for urllib2 that supports HTTP 1.1 and keepalive. + +>>> import urllib2 +>>> from keepalive import HTTPHandler +>>> keepalive_handler = HTTPHandler() +>>> opener = urllib2.build_opener(keepalive_handler) +>>> urllib2.install_opener(opener) +>>> +>>> fo = urllib2.urlopen('http://www.python.org') + +If a connection to a given host is requested, and all of the existing +connections are still in use, another connection will be opened. If +the handler tries to use an existing connection but it fails in some +way, it will be closed and removed from the pool. + +To remove the handler, simply re-run build_opener with no arguments, and +install that opener. + +You can explicitly close connections by using the close_connection() +method of the returned file-like object (described below) or you can +use the handler methods: + + close_connection(host) + close_all() + open_connections() + +NOTE: using the close_connection and close_all methods of the handler +should be done with care when using multiple threads. 
+ * there is nothing that prevents another thread from creating new + connections immediately after connections are closed + * no checks are done to prevent in-use connections from being closed + +>>> keepalive_handler.close_all() + +EXTRA ATTRIBUTES AND METHODS + + Upon a status of 200, the object returned has a few additional + attributes and methods, which should not be used if you want to + remain consistent with the normal urllib2-returned objects: + + close_connection() - close the connection to the host + readlines() - you know, readlines() + status - the return status (ie 404) + reason - english translation of status (ie 'File not found') + + If you want the best of both worlds, use this inside an + AttributeError-catching try: + + >>> try: status = fo.status + >>> except AttributeError: status = None + + Unfortunately, these are ONLY there if status == 200, so it's not + easy to distinguish between non-200 responses. The reason is that + urllib2 tries to do clever things with error codes 301, 302, 401, + and 407, and it wraps the object upon return. + + For python versions earlier than 2.4, you can avoid this fancy error + handling by setting the module-level global HANDLE_ERRORS to zero. + You see, prior to 2.4, it's the HTTP Handler's job to determine what + to handle specially, and what to just pass up. HANDLE_ERRORS == 0 + means "pass everything up". In python 2.4, however, this job no + longer belongs to the HTTP Handler and is now done by a NEW handler, + HTTPErrorProcessor. Here's the bottom line: + + python version < 2.4 + HANDLE_ERRORS == 1 (default) pass up 200, treat the rest as + errors + HANDLE_ERRORS == 0 pass everything up, error processing is + left to the calling code + python version >= 2.4 + HANDLE_ERRORS == 1 pass up 200, treat the rest as errors + HANDLE_ERRORS == 0 (default) pass everything up, let the + other handlers (specifically, + HTTPErrorProcessor) decide what to do + + In practice, setting the variable either way makes little difference + in python 2.4, so for the most consistent behavior across versions, + you probably just want to use the defaults, which will give you + exceptions on errors. 
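+
+  For example, a minimal sketch of forcing the old behavior (the url
+  is illustrative; only non-200 responses are affected):
+
+    import urllib2
+    import keepalive
+
+    keepalive.HANDLE_ERRORS = 1
+    opener = urllib2.build_opener(keepalive.HTTPHandler())
+    try:
+        fo = opener.open('http://www.python.org/no-such-page')
+    except urllib2.HTTPError, e:
+        print 'got error response:', e.code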
+ +""" + +# $Id: keepalive.py,v 1.16 2006/09/22 00:58:05 mstenner Exp $ + +import urllib2 +import httplib +import socket +import thread + +DEBUG = None + +import sslfactory + +import sys +if sys.version_info < (2, 4): HANDLE_ERRORS = 1 +else: HANDLE_ERRORS = 0 + +class ConnectionManager: + """ + The connection manager must be able to: + * keep track of all existing + """ + def __init__(self): + self._lock = thread.allocate_lock() + self._hostmap = {} # map hosts to a list of connections + self._connmap = {} # map connections to host + self._readymap = {} # map connection to ready state + + def add(self, host, connection, ready): + self._lock.acquire() + try: + if not self._hostmap.has_key(host): self._hostmap[host] = [] + self._hostmap[host].append(connection) + self._connmap[connection] = host + self._readymap[connection] = ready + finally: + self._lock.release() + + def remove(self, connection): + self._lock.acquire() + try: + try: + host = self._connmap[connection] + except KeyError: + pass + else: + del self._connmap[connection] + del self._readymap[connection] + self._hostmap[host].remove(connection) + if not self._hostmap[host]: del self._hostmap[host] + finally: + self._lock.release() + + def set_ready(self, connection, ready): + try: self._readymap[connection] = ready + except KeyError: pass + + def get_ready_conn(self, host): + conn = None + self._lock.acquire() + try: + if self._hostmap.has_key(host): + for c in self._hostmap[host]: + if self._readymap[c]: + self._readymap[c] = 0 + conn = c + break + finally: + self._lock.release() + return conn + + def get_all(self, host=None): + if host: + return list(self._hostmap.get(host, [])) + else: + return dict(self._hostmap) + +class KeepAliveHandler: + def __init__(self): + self._cm = ConnectionManager() + + #### Connection Management + def open_connections(self): + """return a list of connected hosts and the number of connections + to each. [('foo.com:80', 2), ('bar.org', 1)]""" + return [(host, len(li)) for (host, li) in self._cm.get_all().items()] + + def close_connection(self, host): + """close connection(s) to + host is the host:port spec, as in 'www.cnn.com:8080' as passed in. + no error occurs if there is no connection to that host.""" + for h in self._cm.get_all(host): + self._cm.remove(h) + h.close() + + def close_all(self): + """close all open connections""" + for host, conns in self._cm.get_all().items(): + for h in conns: + self._cm.remove(h) + h.close() + + def _request_closed(self, request, host, connection): + """tells us that this request is now closed and the the + connection is ready for another request""" + self._cm.set_ready(connection, 1) + + def _remove_connection(self, host, connection, close=0): + if close: connection.close() + self._cm.remove(connection) + + #### Transaction Execution + def do_open(self, req): + host = req.get_host() + if not host: + raise urllib2.URLError('no host given') + + try: + h = self._cm.get_ready_conn(host) + while h: + r = self._reuse_connection(h, req, host) + + # if this response is non-None, then it worked and we're + # done. Break out, skipping the else block. + if r: break + + # connection is bad - possibly closed by server + # discard it and ask for the next free connection + h.close() + self._cm.remove(h) + h = self._cm.get_ready_conn(host) + else: + # no (working) free connections were found. Create a new one. 
+ h = self._get_connection(host) + if DEBUG: DEBUG.info("creating new connection to %s (%d)", + host, id(h)) + self._cm.add(host, h, 0) + self._start_transaction(h, req) + r = h.getresponse() + except (socket.error, httplib.HTTPException), err: + raise urllib2.URLError(err) + + # if not a persistent connection, don't try to reuse it + if r.will_close: self._cm.remove(h) + + if DEBUG: DEBUG.info("STATUS: %s, %s", r.status, r.reason) + r._handler = self + r._host = host + r._url = req.get_full_url() + r._connection = h + r.code = r.status + r.headers = r.msg + r.msg = r.reason + + if r.status == 200 or not HANDLE_ERRORS: + return r + else: + return self.parent.error('http', req, r, + r.status, r.msg, r.headers) + + def _reuse_connection(self, h, req, host): + """start the transaction with a re-used connection + return a response object (r) upon success or None on failure. + This DOES not close or remove bad connections in cases where + it returns. However, if an unexpected exception occurs, it + will close and remove the connection before re-raising. + """ + try: + self._start_transaction(h, req) + r = h.getresponse() + # note: just because we got something back doesn't mean it + # worked. We'll check the version below, too. + except (socket.error, httplib.HTTPException): + r = None + except: + # adding this block just in case we've missed + # something we will still raise the exception, but + # lets try and close the connection and remove it + # first. We previously got into a nasty loop + # where an exception was uncaught, and so the + # connection stayed open. On the next try, the + # same exception was raised, etc. The tradeoff is + # that it's now possible this call will raise + # a DIFFERENT exception + if DEBUG: DEBUG.error("unexpected exception - closing " + \ + "connection to %s (%d)", host, id(h)) + self._cm.remove(h) + h.close() + raise + + if r is None or r.version == 9: + # httplib falls back to assuming HTTP 0.9 if it gets a + # bad header back. This is most likely to happen if + # the socket has been closed by the server since we + # last used the connection. 
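+            # (httplib reports version 9 for HTTP/0.9, 10 for HTTP/1.0
+            # and 11 for HTTP/1.1, so r.version == 9 flags that fallback.)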
+ if DEBUG: DEBUG.info("failed to re-use connection to %s (%d)", + host, id(h)) + r = None + else: + if DEBUG: DEBUG.info("re-using connection to %s (%d)", host, id(h)) + + return r + + def _start_transaction(self, h, req): + try: + if req.has_data(): + data = req.get_data() + h.putrequest('POST', req.get_selector()) + if not req.headers.has_key('Content-type'): + h.putheader('Content-type', + 'application/x-www-form-urlencoded') + if not req.headers.has_key('Content-length'): + h.putheader('Content-length', '%d' % len(data)) + else: + h.putrequest('GET', req.get_selector()) + except (socket.error, httplib.HTTPException), err: + raise urllib2.URLError(err) + + for args in self.parent.addheaders: + h.putheader(*args) + for k, v in req.headers.items(): + h.putheader(k, v) + h.endheaders() + if req.has_data(): + h.send(data) + + def _get_connection(self, host): + return NotImplementedError + +class HTTPHandler(KeepAliveHandler, urllib2.HTTPHandler): + def __init__(self): + KeepAliveHandler.__init__(self) + + def http_open(self, req): + return self.do_open(req) + + def _get_connection(self, host): + return HTTPConnection(host) + +class HTTPSHandler(KeepAliveHandler, urllib2.HTTPSHandler): + def __init__(self, ssl_factory=None): + KeepAliveHandler.__init__(self) + if not ssl_factory: + ssl_factory = sslfactory.get_factory() + self._ssl_factory = ssl_factory + + def https_open(self, req): + return self.do_open(req) + + def _get_connection(self, host): + return self._ssl_factory.get_https_connection(host) + +class HTTPResponse(httplib.HTTPResponse): + # we need to subclass HTTPResponse in order to + # 1) add readline() and readlines() methods + # 2) add close_connection() methods + # 3) add info() and geturl() methods + + # in order to add readline(), read must be modified to deal with a + # buffer. example: readline must read a buffer and then spit back + # one line at a time. The only real alternative is to read one + # BYTE at a time (ick). Once something has been read, it can't be + # put back (ok, maybe it can, but that's even uglier than this), + # so if you THEN do a normal read, you must first take stuff from + # the buffer. + + # the read method wraps the original to accomodate buffering, + # although read() never adds to the buffer. + # Both readline and readlines have been stolen with almost no + # modification from socket.py + + + def __init__(self, sock, debuglevel=0, strict=0, method=None): + if method: # the httplib in python 2.3 uses the method arg + httplib.HTTPResponse.__init__(self, sock, debuglevel, method) + else: # 2.2 doesn't + httplib.HTTPResponse.__init__(self, sock, debuglevel) + self.fileno = sock.fileno + self.code = None + self._rbuf = '' + self._rbufsize = 8096 + self._handler = None # inserted by the handler later + self._host = None # (same) + self._url = None # (same) + self._connection = None # (same) + + _raw_read = httplib.HTTPResponse.read + + def close(self): + if self.fp: + self.fp.close() + self.fp = None + if self._handler: + self._handler._request_closed(self, self._host, + self._connection) + + def close_connection(self): + self._handler._remove_connection(self._host, self._connection, close=1) + self.close() + + def info(self): + return self.headers + + def geturl(self): + return self._url + + def read(self, amt=None): + # the _rbuf test is only in this first if for speed. 
It's not + # logically necessary + if self._rbuf and not amt is None: + L = len(self._rbuf) + if amt > L: + amt -= L + else: + s = self._rbuf[:amt] + self._rbuf = self._rbuf[amt:] + return s + + s = self._rbuf + self._raw_read(amt) + self._rbuf = '' + return s + + def readline(self, limit=-1): + data = "" + i = self._rbuf.find('\n') + while i < 0 and not (0 < limit <= len(self._rbuf)): + new = self._raw_read(self._rbufsize) + if not new: break + i = new.find('\n') + if i >= 0: i = i + len(self._rbuf) + self._rbuf = self._rbuf + new + if i < 0: i = len(self._rbuf) + else: i = i+1 + if 0 <= limit < len(self._rbuf): i = limit + data, self._rbuf = self._rbuf[:i], self._rbuf[i:] + return data + + def readlines(self, sizehint = 0): + total = 0 + list = [] + while 1: + line = self.readline() + if not line: break + list.append(line) + total += len(line) + if sizehint and total >= sizehint: + break + return list + + +class HTTPConnection(httplib.HTTPConnection): + # use the modified response class + response_class = HTTPResponse + +class HTTPSConnection(httplib.HTTPSConnection): + response_class = HTTPResponse + +######################################################################### +##### TEST FUNCTIONS +######################################################################### + +def error_handler(url): + global HANDLE_ERRORS + orig = HANDLE_ERRORS + keepalive_handler = HTTPHandler() + opener = urllib2.build_opener(keepalive_handler) + urllib2.install_opener(opener) + pos = {0: 'off', 1: 'on'} + for i in (0, 1): + print " fancy error handling %s (HANDLE_ERRORS = %i)" % (pos[i], i) + HANDLE_ERRORS = i + try: + fo = urllib2.urlopen(url) + foo = fo.read() + fo.close() + try: status, reason = fo.status, fo.reason + except AttributeError: status, reason = None, None + except IOError, e: + print " EXCEPTION: %s" % e + raise + else: + print " status = %s, reason = %s" % (status, reason) + HANDLE_ERRORS = orig + hosts = keepalive_handler.open_connections() + print "open connections:", hosts + keepalive_handler.close_all() + +def continuity(url): + import md5 + format = '%25s: %s' + + # first fetch the file with the normal http handler + opener = urllib2.build_opener() + urllib2.install_opener(opener) + fo = urllib2.urlopen(url) + foo = fo.read() + fo.close() + m = md5.new(foo) + print format % ('normal urllib', m.hexdigest()) + + # now install the keepalive handler and try again + opener = urllib2.build_opener(HTTPHandler()) + urllib2.install_opener(opener) + + fo = urllib2.urlopen(url) + foo = fo.read() + fo.close() + m = md5.new(foo) + print format % ('keepalive read', m.hexdigest()) + + fo = urllib2.urlopen(url) + foo = '' + while 1: + f = fo.readline() + if f: foo = foo + f + else: break + fo.close() + m = md5.new(foo) + print format % ('keepalive readline', m.hexdigest()) + +def comp(N, url): + print ' making %i connections to:\n %s' % (N, url) + + sys.stdout.write(' first using the normal urllib handlers') + # first use normal opener + opener = urllib2.build_opener() + urllib2.install_opener(opener) + t1 = fetch(N, url) + print ' TIME: %.3f s' % t1 + + sys.stdout.write(' now using the keepalive handler ') + # now install the keepalive handler and try again + opener = urllib2.build_opener(HTTPHandler()) + urllib2.install_opener(opener) + t2 = fetch(N, url) + print ' TIME: %.3f s' % t2 + print ' improvement factor: %.2f' % (t1/t2, ) + +def fetch(N, url, delay=0): + import time + lens = [] + starttime = time.time() + for i in range(N): + if delay and i > 0: time.sleep(delay) + fo = 
urllib2.urlopen(url)
+        foo = fo.read()
+        fo.close()
+        lens.append(len(foo))
+    diff = time.time() - starttime
+
+    j = 0
+    for i in lens[1:]:
+        j = j + 1
+        if not i == lens[0]:
+            print "WARNING: inconsistent length on read %i: %i" % (j, i)
+
+    return diff
+
+def test_timeout(url):
+    global DEBUG
+    dbbackup = DEBUG
+    class FakeLogger:
+        def debug(self, msg, *args): print msg % args
+        info = warning = error = debug
+    DEBUG = FakeLogger()
+    print "  fetching the file to establish a connection"
+    fo = urllib2.urlopen(url)
+    data1 = fo.read()
+    fo.close()
+
+    i = 20
+    print "  waiting %i seconds for the server to close the connection" % i
+    while i > 0:
+        sys.stdout.write('\r  %2i' % i)
+        sys.stdout.flush()
+        time.sleep(1)
+        i -= 1
+    sys.stderr.write('\r')
+
+    print "  fetching the file a second time"
+    fo = urllib2.urlopen(url)
+    data2 = fo.read()
+    fo.close()
+
+    if data1 == data2:
+        print '  data are identical'
+    else:
+        print '  ERROR: DATA DIFFER'
+
+    DEBUG = dbbackup
+
+
+def test(url, N=10):
+    print "checking error handler (do this on a non-200)"
+    try: error_handler(url)
+    except IOError, e:
+        print "exiting - exception will prevent further tests"
+        sys.exit()
+    print
+    print "performing continuity test (making sure stuff isn't corrupted)"
+    continuity(url)
+    print
+    print "performing speed comparison"
+    comp(N, url)
+    print
+    print "performing dropped-connection check"
+    test_timeout(url)
+
+if __name__ == '__main__':
+    import time
+    import sys
+    try:
+        N = int(sys.argv[1])
+        url = sys.argv[2]
+    except:
+        print "%s <integer> <url>" % sys.argv[0]
+    else:
+        test(url, N)
diff --git a/mic/urlgrabber/mirror.py b/mic/urlgrabber/mirror.py
new file mode 100644
index 0000000..9664c6b
--- /dev/null
+++ b/mic/urlgrabber/mirror.py
@@ -0,0 +1,458 @@
+#   This library is free software; you can redistribute it and/or
+#   modify it under the terms of the GNU Lesser General Public
+#   License as published by the Free Software Foundation; either
+#   version 2.1 of the License, or (at your option) any later version.
+#
+#   This library is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+#   Lesser General Public License for more details.
+#
+#   You should have received a copy of the GNU Lesser General Public
+#   License along with this library; if not, write to the
+#      Free Software Foundation, Inc.,
+#      59 Temple Place, Suite 330,
+#      Boston, MA  02111-1307  USA
+
+# This file is part of urlgrabber, a high-level cross-protocol url-grabber
+# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
+
+"""Module for downloading files from a pool of mirrors
+
+DESCRIPTION
+
+  This module provides support for downloading files from a pool of
+  mirrors with configurable failover policies.  To a large extent, the
+  failover policy is chosen by using different classes derived from
+  the main class, MirrorGroup.
+
+  Instances of MirrorGroup (and cousins) act very much like URLGrabber
+  instances in that they have urlread, urlgrab, and urlopen methods.
+  They can, therefore, be used in very similar ways.
+
+    from urlgrabber.grabber import URLGrabber
+    from urlgrabber.mirror import MirrorGroup
+    gr = URLGrabber()
+    mg = MirrorGroup(gr, ['http://foo.com/some/directory/',
+                          'http://bar.org/maybe/somewhere/else/',
+                          'ftp://baz.net/some/other/place/entirely/'])
+    mg.urlgrab('relative/path.zip')
+
+  The assumption is that all mirrors are identical AFTER the base urls
+  specified, so that any mirror can be used to fetch any file.
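+
+  As a further sketch (building on the example above), a
+  failure_callback can log each mirror failure while the group
+  quietly fails over; see the CUSTOMIZATION notes below:
+
+    def log_failure(cb_obj):
+        # cb_obj carries .exception, .mirror, .relative_url and .url
+        print 'mirror %s failed: %s' % (cb_obj.mirror, cb_obj.exception)
+    mg = MirrorGroup(gr, ['http://foo.com/some/directory/'],
+                     failure_callback=log_failure)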
+
+FAILOVER
+
+  The failover mechanism is designed to be customized by subclassing
+  from MirrorGroup to change the details of the behavior.  In general,
+  the classes maintain a master mirror list and a "current mirror"
+  index.  When a download is initiated, a copy of this list and index
+  is created for that download only.  The specific failover policy
+  depends on the class used, and so is documented in the class
+  documentation.  Note that ANY behavior of the class can be
+  overridden, so any failover policy at all is possible (although
+  you may need to change the interface in extreme cases).
+
+CUSTOMIZATION
+
+  Most customization of a MirrorGroup object is done at instantiation
+  time (or via subclassing).  There are four major types of
+  customization:
+
+    1) Pass in a custom urlgrabber - The passed in urlgrabber will be
+       used (by default... see #2) for the grabs, so options to it
+       apply for the url-fetching
+
+    2) Custom mirror list - Mirror lists can simply be a list of
+       mirror url strings (as shown in the example above) but each can
+       also be a dict, allowing for more options.  For example, the
+       first mirror in the list above could also have been:
+
+         {'mirror': 'http://foo.com/some/directory/',
+          'grabber': <a custom grabber to be used for this mirror>,
+          'kwargs': { <a dict of arguments passed to the grabber> }}
+
+       All mirrors are converted to this format internally.  If
+       'grabber' is omitted, the default grabber will be used.  If
+       kwargs are omitted, then (duh) they will not be used.
+
+    3) Pass keyword arguments when instantiating the mirror group.
+       See, for example, the failure_callback argument.
+
+    4) Finally, any kwargs passed in for the specific file (to the
+       urlgrab method, for example) will be folded in.  The options
+       passed into the grabber's urlXXX methods will override any
+       options specified in a custom mirror dict.
+
+"""
+
+# $Id: mirror.py,v 1.14 2006/02/22 18:26:46 mstenner Exp $
+
+import random
+import thread  # needed for locking to make this threadsafe
+
+from grabber import URLGrabError, CallbackObject, DEBUG
+
+try:
+    from i18n import _
+except ImportError, msg:
+    def _(st): return st
+
+class GrabRequest:
+    """This is a dummy class used to hold information about the specific
+    request.  For example, a single file.  By maintaining this information
+    separately, we can accomplish two things:
+
+      1) make it a little easier to be threadsafe
+      2) have request-specific parameters
+    """
+    pass
+
+class MirrorGroup:
+    """Base Mirror class
+
+    Instances of this class are built with a grabber object and a list
+    of mirrors.  Then all calls to urlXXX should be passed relative urls.
+    The requested file will be searched for on the first mirror.  If the
+    grabber raises an exception (possibly after some retries) then that
+    mirror will be removed from the list, and the next will be attempted.
+    If all mirrors are exhausted, then an exception will be raised.
+
+    MirrorGroup has the following failover policy:
+
+      * downloads begin with the first mirror
+
+      * by default (see default_action below) a failure (after retries)
+        causes it to increment the local AND master indices.  Also,
+        the current mirror is removed from the local list (but NOT the
+        master list - the mirror can potentially be used for other
+        files)
+
+      * if the local list is ever exhausted, a URLGrabError will be
+        raised (errno=256, no more mirrors)
+
+    OPTIONS
+
+      In addition to the required arguments "grabber" and "mirrors",
+      MirrorGroup also takes the following optional arguments:
+
+      default_action
+
+        A dict that describes the actions to be taken upon failure
+        (after retries).  
default_action can contain any of the + following keys (shown here with their default values): + + default_action = {'increment': 1, + 'increment_master': 1, + 'remove': 1, + 'remove_master': 0, + 'fail': 0} + + In this context, 'increment' means "use the next mirror" and + 'remove' means "never use this mirror again". The two + 'master' values refer to the instance-level mirror list (used + for all files), whereas the non-master values refer to the + current download only. + + The 'fail' option will cause immediate failure by re-raising + the exception and no further attempts to get the current + download. + + This dict can be set at instantiation time, + mg = MirrorGroup(grabber, mirrors, default_action={'fail':1}) + at method-execution time (only applies to current fetch), + filename = mg.urlgrab(url, default_action={'increment': 0}) + or by returning an action dict from the failure_callback + return {'fail':0} + in increasing precedence. + + If all three of these were done, the net result would be: + {'increment': 0, # set in method + 'increment_master': 1, # class default + 'remove': 1, # class default + 'remove_master': 0, # class default + 'fail': 0} # set at instantiation, reset + # from callback + + failure_callback + + this is a callback that will be called when a mirror "fails", + meaning the grabber raises some URLGrabError. If this is a + tuple, it is interpreted to be of the form (cb, args, kwargs) + where cb is the actual callable object (function, method, + etc). Otherwise, it is assumed to be the callable object + itself. The callback will be passed a grabber.CallbackObject + instance along with args and kwargs (if present). The following + attributes are defined withing the instance: + + obj.exception = < exception that was raised > + obj.mirror = < the mirror that was tried > + obj.relative_url = < url relative to the mirror > + obj.url = < full url that failed > + # .url is just the combination of .mirror + # and .relative_url + + The failure callback can return an action dict, as described + above. + + Like default_action, the failure_callback can be set at + instantiation time or when the urlXXX method is called. In + the latter case, it applies only for that fetch. + + The callback can re-raise the exception quite easily. For + example, this is a perfectly adequate callback function: + + def callback(obj): raise obj.exception + + WARNING: do not save the exception object (or the + CallbackObject instance). As they contain stack frame + references, they can lead to circular references. + + Notes: + * The behavior can be customized by deriving and overriding the + 'CONFIGURATION METHODS' + * The 'grabber' instance is kept as a reference, not copied. + Therefore, the grabber instance can be modified externally + and changes will take effect immediately. + """ + + # notes on thread-safety: + + # A GrabRequest should never be shared by multiple threads because + # it's never saved inside the MG object and never returned outside it. + # therefore, it should be safe to access/modify grabrequest data + # without a lock. However, accessing the mirrors and _next attributes + # of the MG itself must be done when locked to prevent (for example) + # removal of the wrong mirror. + + ############################################################## + # CONFIGURATION METHODS - intended to be overridden to + # customize behavior + def __init__(self, grabber, mirrors, **kwargs): + """Initialize the MirrorGroup object. 
+ + REQUIRED ARGUMENTS + + grabber - URLGrabber instance + mirrors - a list of mirrors + + OPTIONAL ARGUMENTS + + failure_callback - callback to be used when a mirror fails + default_action - dict of failure actions + + See the module-level and class level documentation for more + details. + """ + + # OVERRIDE IDEAS: + # shuffle the list to randomize order + self.grabber = grabber + self.mirrors = self._parse_mirrors(mirrors) + self._next = 0 + self._lock = thread.allocate_lock() + self.default_action = None + self._process_kwargs(kwargs) + + # if these values are found in **kwargs passed to one of the urlXXX + # methods, they will be stripped before getting passed on to the + # grabber + options = ['default_action', 'failure_callback'] + + def _process_kwargs(self, kwargs): + self.failure_callback = kwargs.get('failure_callback') + self.default_action = kwargs.get('default_action') + + def _parse_mirrors(self, mirrors): + parsed_mirrors = [] + for m in mirrors: + if type(m) == type(''): m = {'mirror': m} + parsed_mirrors.append(m) + return parsed_mirrors + + def _load_gr(self, gr): + # OVERRIDE IDEAS: + # shuffle gr list + self._lock.acquire() + gr.mirrors = list(self.mirrors) + gr._next = self._next + self._lock.release() + + def _get_mirror(self, gr): + # OVERRIDE IDEAS: + # return a random mirror so that multiple mirrors get used + # even without failures. + if not gr.mirrors: + raise URLGrabError(256, _('No more mirrors to try.')) + return gr.mirrors[gr._next] + + def _failure(self, gr, cb_obj): + # OVERRIDE IDEAS: + # inspect the error - remove=1 for 404, remove=2 for connection + # refused, etc. (this can also be done via + # the callback) + cb = gr.kw.get('failure_callback') or self.failure_callback + if cb: + if type(cb) == type( () ): + cb, args, kwargs = cb + else: + args, kwargs = (), {} + action = cb(cb_obj, *args, **kwargs) or {} + else: + action = {} + # XXXX - decide - there are two ways to do this + # the first is action-overriding as a whole - use the entire action + # or fall back on module level defaults + #action = action or gr.kw.get('default_action') or self.default_action + # the other is to fall through for each element in the action dict + a = dict(self.default_action or {}) + a.update(gr.kw.get('default_action', {})) + a.update(action) + action = a + self.increment_mirror(gr, action) + if action and action.get('fail', 0): raise + + def increment_mirror(self, gr, action={}): + """Tell the mirror object increment the mirror index + + This increments the mirror index, which amounts to telling the + mirror object to use a different mirror (for this and future + downloads). + + This is a SEMI-public method. It will be called internally, + and you may never need to call it. However, it is provided + (and is made public) so that the calling program can increment + the mirror choice for methods like urlopen. For example, with + urlopen, there's no good way for the mirror group to know that + an error occurs mid-download (it's already returned and given + you the file object). 
+ + remove --- can have several values + 0 do not remove the mirror from the list + 1 remove the mirror for this download only + 2 remove the mirror permanently + + beware of remove=0 as it can lead to infinite loops + """ + badmirror = gr.mirrors[gr._next] + + self._lock.acquire() + try: + ind = self.mirrors.index(badmirror) + except ValueError: + pass + else: + if action.get('remove_master', 0): + del self.mirrors[ind] + elif self._next == ind and action.get('increment_master', 1): + self._next += 1 + if self._next >= len(self.mirrors): self._next = 0 + self._lock.release() + + if action.get('remove', 1): + del gr.mirrors[gr._next] + elif action.get('increment', 1): + gr._next += 1 + if gr._next >= len(gr.mirrors): gr._next = 0 + + if DEBUG: + grm = [m['mirror'] for m in gr.mirrors] + DEBUG.info('GR mirrors: [%s] %i', ' '.join(grm), gr._next) + selfm = [m['mirror'] for m in self.mirrors] + DEBUG.info('MAIN mirrors: [%s] %i', ' '.join(selfm), self._next) + + ##################################################################### + # NON-CONFIGURATION METHODS + # these methods are designed to be largely workhorse methods that + # are not intended to be overridden. That doesn't mean you can't; + # if you want to, feel free, but most things can be done by + # by overriding the configuration methods :) + + def _join_url(self, base_url, rel_url): + if base_url.endswith('/') or rel_url.startswith('/'): + return base_url + rel_url + else: + return base_url + '/' + rel_url + + def _mirror_try(self, func, url, kw): + gr = GrabRequest() + gr.func = func + gr.url = url + gr.kw = dict(kw) + self._load_gr(gr) + + for k in self.options: + try: del kw[k] + except KeyError: pass + + while 1: + mirrorchoice = self._get_mirror(gr) + fullurl = self._join_url(mirrorchoice['mirror'], gr.url) + kwargs = dict(mirrorchoice.get('kwargs', {})) + kwargs.update(kw) + grabber = mirrorchoice.get('grabber') or self.grabber + func_ref = getattr(grabber, func) + if DEBUG: DEBUG.info('MIRROR: trying %s -> %s', url, fullurl) + try: + return func_ref( *(fullurl,), **kwargs ) + except URLGrabError, e: + if DEBUG: DEBUG.info('MIRROR: failed') + obj = CallbackObject() + obj.exception = e + obj.mirror = mirrorchoice['mirror'] + obj.relative_url = gr.url + obj.url = fullurl + self._failure(gr, obj) + + def urlgrab(self, url, filename=None, **kwargs): + kw = dict(kwargs) + kw['filename'] = filename + func = 'urlgrab' + return self._mirror_try(func, url, kw) + + def urlopen(self, url, **kwargs): + kw = dict(kwargs) + func = 'urlopen' + return self._mirror_try(func, url, kw) + + def urlread(self, url, limit=None, **kwargs): + kw = dict(kwargs) + kw['limit'] = limit + func = 'urlread' + return self._mirror_try(func, url, kw) + + +class MGRandomStart(MirrorGroup): + """A mirror group that starts at a random mirror in the list. + + This behavior of this class is identical to MirrorGroup, except that + it starts at a random location in the mirror list. + """ + + def __init__(self, grabber, mirrors, **kwargs): + """Initialize the object + + The arguments for intialization are the same as for MirrorGroup + """ + MirrorGroup.__init__(self, grabber, mirrors, **kwargs) + self._next = random.randrange(len(mirrors)) + +class MGRandomOrder(MirrorGroup): + """A mirror group that uses mirrors in a random order. + + This behavior of this class is identical to MirrorGroup, except that + it uses the mirrors in a random order. Note that the order is set at + initialization time and fixed thereafter. 
That is, it does not pick a + random mirror after each failure. + """ + + def __init__(self, grabber, mirrors, **kwargs): + """Initialize the object + + The arguments for intialization are the same as for MirrorGroup + """ + MirrorGroup.__init__(self, grabber, mirrors, **kwargs) + random.shuffle(self.mirrors) + +if __name__ == '__main__': + pass diff --git a/mic/urlgrabber/progress.py b/mic/urlgrabber/progress.py new file mode 100644 index 0000000..02db524 --- /dev/null +++ b/mic/urlgrabber/progress.py @@ -0,0 +1,530 @@ +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, +# Boston, MA 02111-1307 USA + +# This file is part of urlgrabber, a high-level cross-protocol url-grabber +# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko + +# $Id: progress.py,v 1.7 2005/08/19 21:59:07 mstenner Exp $ + +import sys +import time +import math +import thread + +class BaseMeter: + def __init__(self): + self.update_period = 0.3 # seconds + + self.filename = None + self.url = None + self.basename = None + self.text = None + self.size = None + self.start_time = None + self.last_amount_read = 0 + self.last_update_time = None + self.re = RateEstimator() + + def start(self, filename=None, url=None, basename=None, + size=None, now=None, text=None): + self.filename = filename + self.url = url + self.basename = basename + self.text = text + + #size = None ######### TESTING + self.size = size + if not size is None: self.fsize = format_number(size) + 'B' + + if now is None: now = time.time() + self.start_time = now + self.re.start(size, now) + self.last_amount_read = 0 + self.last_update_time = now + self._do_start(now) + + def _do_start(self, now=None): + pass + + def update(self, amount_read, now=None): + # for a real gui, you probably want to override and put a call + # to your mainloop iteration function here + if now is None: now = time.time() + if (now >= self.last_update_time + self.update_period) or \ + not self.last_update_time: + self.re.update(amount_read, now) + self.last_amount_read = amount_read + self.last_update_time = now + self._do_update(amount_read, now) + + def _do_update(self, amount_read, now=None): + pass + + def end(self, amount_read, now=None): + if now is None: now = time.time() + self.re.update(amount_read, now) + self.last_amount_read = amount_read + self.last_update_time = now + self._do_end(amount_read, now) + + def _do_end(self, amount_read, now=None): + pass + +class TextMeter(BaseMeter): + def __init__(self, fo=sys.stderr): + BaseMeter.__init__(self) + self.fo = fo + + def _do_update(self, amount_read, now=None): + etime = self.re.elapsed_time() + fetime = format_time(etime) + fread = format_number(amount_read) + #self.size = None + if self.text is not None: + text = self.text + else: + text = self.basename + if self.size is None: + out = '\r%-60.60s %5sB %s ' % \ + (text, fread, fetime) + else: + rtime = self.re.remaining_time() + 
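+            # remaining_time() returns None until a rate estimate
+            # exists; format_time() renders None as '--:--'.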
frtime = format_time(rtime) + frac = self.re.fraction_read() + bar = '='*int(25 * frac) + + out = '\r%-25.25s %3i%% |%-25.25s| %5sB %8s ETA ' % \ + (text, frac*100, bar, fread, frtime) + + self.fo.write(out) + self.fo.flush() + + def _do_end(self, amount_read, now=None): + total_time = format_time(self.re.elapsed_time()) + total_size = format_number(amount_read) + if self.text is not None: + text = self.text + else: + text = self.basename + if self.size is None: + out = '\r%-60.60s %5sB %s ' % \ + (text, total_size, total_time) + else: + bar = '='*25 + out = '\r%-25.25s %3i%% |%-25.25s| %5sB %8s ' % \ + (text, 100, bar, total_size, total_time) + self.fo.write(out + '\n') + self.fo.flush() + +text_progress_meter = TextMeter + +class MultiFileHelper(BaseMeter): + def __init__(self, master): + BaseMeter.__init__(self) + self.master = master + + def _do_start(self, now): + self.master.start_meter(self, now) + + def _do_update(self, amount_read, now): + # elapsed time since last update + self.master.update_meter(self, now) + + def _do_end(self, amount_read, now): + self.ftotal_time = format_time(now - self.start_time) + self.ftotal_size = format_number(self.last_amount_read) + self.master.end_meter(self, now) + + def failure(self, message, now=None): + self.master.failure_meter(self, message, now) + + def message(self, message): + self.master.message_meter(self, message) + +class MultiFileMeter: + helperclass = MultiFileHelper + def __init__(self): + self.meters = [] + self.in_progress_meters = [] + self._lock = thread.allocate_lock() + self.update_period = 0.3 # seconds + + self.numfiles = None + self.finished_files = 0 + self.failed_files = 0 + self.open_files = 0 + self.total_size = None + self.failed_size = 0 + self.start_time = None + self.finished_file_size = 0 + self.last_update_time = None + self.re = RateEstimator() + + def start(self, numfiles=None, total_size=None, now=None): + if now is None: now = time.time() + self.numfiles = numfiles + self.finished_files = 0 + self.failed_files = 0 + self.open_files = 0 + self.total_size = total_size + self.failed_size = 0 + self.start_time = now + self.finished_file_size = 0 + self.last_update_time = now + self.re.start(total_size, now) + self._do_start(now) + + def _do_start(self, now): + pass + + def end(self, now=None): + if now is None: now = time.time() + self._do_end(now) + + def _do_end(self, now): + pass + + def lock(self): self._lock.acquire() + def unlock(self): self._lock.release() + + ########################################################### + # child meter creation and destruction + def newMeter(self): + newmeter = self.helperclass(self) + self.meters.append(newmeter) + return newmeter + + def removeMeter(self, meter): + self.meters.remove(meter) + + ########################################################### + # child functions - these should only be called by helpers + def start_meter(self, meter, now): + if not meter in self.meters: + raise ValueError('attempt to use orphaned meter') + self._lock.acquire() + try: + if not meter in self.in_progress_meters: + self.in_progress_meters.append(meter) + self.open_files += 1 + finally: + self._lock.release() + self._do_start_meter(meter, now) + + def _do_start_meter(self, meter, now): + pass + + def update_meter(self, meter, now): + if not meter in self.meters: + raise ValueError('attempt to use orphaned meter') + if (now >= self.last_update_time + self.update_period) or \ + not self.last_update_time: + self.re.update(self._amount_read(), now) + self.last_update_time = now + 
self._do_update_meter(meter, now) + + def _do_update_meter(self, meter, now): + pass + + def end_meter(self, meter, now): + if not meter in self.meters: + raise ValueError('attempt to use orphaned meter') + self._lock.acquire() + try: + try: self.in_progress_meters.remove(meter) + except ValueError: pass + self.open_files -= 1 + self.finished_files += 1 + self.finished_file_size += meter.last_amount_read + finally: + self._lock.release() + self._do_end_meter(meter, now) + + def _do_end_meter(self, meter, now): + pass + + def failure_meter(self, meter, message, now): + if not meter in self.meters: + raise ValueError('attempt to use orphaned meter') + self._lock.acquire() + try: + try: self.in_progress_meters.remove(meter) + except ValueError: pass + self.open_files -= 1 + self.failed_files += 1 + if meter.size and self.failed_size is not None: + self.failed_size += meter.size + else: + self.failed_size = None + finally: + self._lock.release() + self._do_failure_meter(meter, message, now) + + def _do_failure_meter(self, meter, message, now): + pass + + def message_meter(self, meter, message): + pass + + ######################################################## + # internal functions + def _amount_read(self): + tot = self.finished_file_size + for m in self.in_progress_meters: + tot += m.last_amount_read + return tot + + +class TextMultiFileMeter(MultiFileMeter): + def __init__(self, fo=sys.stderr): + self.fo = fo + MultiFileMeter.__init__(self) + + # files: ###/### ###% data: ######/###### ###% time: ##:##:##/##:##:## + def _do_update_meter(self, meter, now): + self._lock.acquire() + try: + format = "files: %3i/%-3i %3i%% data: %6.6s/%-6.6s %3i%% " \ + "time: %8.8s/%8.8s" + df = self.finished_files + tf = self.numfiles or 1 + pf = 100 * float(df)/tf + 0.49 + dd = self.re.last_amount_read + td = self.total_size + pd = 100 * (self.re.fraction_read() or 0) + 0.49 + dt = self.re.elapsed_time() + rt = self.re.remaining_time() + if rt is None: tt = None + else: tt = dt + rt + + fdd = format_number(dd) + 'B' + ftd = format_number(td) + 'B' + fdt = format_time(dt, 1) + ftt = format_time(tt, 1) + + out = '%-79.79s' % (format % (df, tf, pf, fdd, ftd, pd, fdt, ftt)) + self.fo.write('\r' + out) + self.fo.flush() + finally: + self._lock.release() + + def _do_end_meter(self, meter, now): + self._lock.acquire() + try: + format = "%-30.30s %6.6s %8.8s %9.9s" + fn = meter.basename + size = meter.last_amount_read + fsize = format_number(size) + 'B' + et = meter.re.elapsed_time() + fet = format_time(et, 1) + frate = format_number(size / et) + 'B/s' + + out = '%-79.79s' % (format % (fn, fsize, fet, frate)) + self.fo.write('\r' + out + '\n') + finally: + self._lock.release() + self._do_update_meter(meter, now) + + def _do_failure_meter(self, meter, message, now): + self._lock.acquire() + try: + format = "%-30.30s %6.6s %s" + fn = meter.basename + if type(message) in (type(''), type(u'')): + message = message.splitlines() + if not message: message = [''] + out = '%-79s' % (format % (fn, 'FAILED', message[0] or '')) + self.fo.write('\r' + out + '\n') + for m in message[1:]: self.fo.write(' ' + m + '\n') + self._lock.release() + finally: + self._do_update_meter(meter, now) + + def message_meter(self, meter, message): + self._lock.acquire() + try: + pass + finally: + self._lock.release() + + def _do_end(self, now): + self._do_update_meter(None, now) + self._lock.acquire() + try: + self.fo.write('\n') + self.fo.flush() + finally: + self._lock.release() + 
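+# A small illustrative driver for TextMeter (not part of the module's
+# API; the file name, url, and sizes below are made-up values):
+def _example_text_meter():
+    meter = TextMeter()
+    size = 512 * 1024
+    meter.start(filename='demo.bin', url='http://example.com/demo.bin',
+                basename='demo.bin', size=size, text='demo.bin')
+    read = 0
+    while read < size:
+        read = min(read + 64*1024, size)
+        time.sleep(0.1)           # simulate network latency
+        meter.update(read)        # redraws at most every update_period
+    meter.end(size)
+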
+###################################################################### +# support classes and functions + +class RateEstimator: + def __init__(self, timescale=5.0): + self.timescale = timescale + + def start(self, total=None, now=None): + if now is None: now = time.time() + self.total = total + self.start_time = now + self.last_update_time = now + self.last_amount_read = 0 + self.ave_rate = None + + def update(self, amount_read, now=None): + if now is None: now = time.time() + if amount_read == 0: + # if we just started this file, all bets are off + self.last_update_time = now + self.last_amount_read = 0 + self.ave_rate = None + return + + #print 'times', now, self.last_update_time + time_diff = now - self.last_update_time + read_diff = amount_read - self.last_amount_read + self.last_update_time = now + self.last_amount_read = amount_read + self.ave_rate = self._temporal_rolling_ave(\ + time_diff, read_diff, self.ave_rate, self.timescale) + #print 'results', time_diff, read_diff, self.ave_rate + + ##################################################################### + # result methods + def average_rate(self): + "get the average transfer rate (in bytes/second)" + return self.ave_rate + + def elapsed_time(self): + "the time between the start of the transfer and the most recent update" + return self.last_update_time - self.start_time + + def remaining_time(self): + "estimated time remaining" + if not self.ave_rate or not self.total: return None + return (self.total - self.last_amount_read) / self.ave_rate + + def fraction_read(self): + """the fraction of the data that has been read + (can be None for unknown transfer size)""" + if self.total is None: return None + elif self.total == 0: return 1.0 + else: return float(self.last_amount_read)/self.total + + ######################################################################### + # support methods + def _temporal_rolling_ave(self, time_diff, read_diff, last_ave, timescale): + """a temporal rolling average performs smooth averaging even when + updates come at irregular intervals. This is performed by scaling + the "epsilon" according to the time since the last update. + Specifically, epsilon = time_diff / timescale + + As a general rule, the average will take on a completely new value + after 'timescale' seconds.""" + epsilon = time_diff / timescale + if epsilon > 1: epsilon = 1.0 + return self._rolling_ave(time_diff, read_diff, last_ave, epsilon) + + def _rolling_ave(self, time_diff, read_diff, last_ave, epsilon): + """perform a "rolling average" iteration + a rolling average "folds" new data into an existing average with + some weight, epsilon. epsilon must be between 0.0 and 1.0 (inclusive) + a value of 0.0 means only the old value (initial value) counts, + and a value of 1.0 means only the newest value is considered.""" + + try: + recent_rate = read_diff / time_diff + except ZeroDivisionError: + recent_rate = None + if last_ave is None: return recent_rate + elif recent_rate is None: return last_ave + + # at this point, both last_ave and recent_rate are numbers + return epsilon * recent_rate + (1 - epsilon) * last_ave + + def _round_remaining_time(self, rt, start_time=15.0): + """round the remaining time, depending on its size + If rt is between n*start_time and (n+1)*start_time round downward + to the nearest multiple of n (for any counting number n). + If rt < start_time, round down to the nearest 1. 
+ For example (for start_time = 15.0): + 2.7 -> 2.0 + 25.2 -> 25.0 + 26.4 -> 26.0 + 35.3 -> 34.0 + 63.6 -> 60.0 + """ + + if rt < 0: return 0.0 + shift = int(math.log(rt/start_time)/math.log(2)) + rt = int(rt) + if shift <= 0: return rt + return float(int(rt) >> shift << shift) + + +def format_time(seconds, use_hours=0): + if seconds is None or seconds < 0: + if use_hours: return '--:--:--' + else: return '--:--' + else: + seconds = int(seconds) + minutes = seconds / 60 + seconds = seconds % 60 + if use_hours: + hours = minutes / 60 + minutes = minutes % 60 + return '%02i:%02i:%02i' % (hours, minutes, seconds) + else: + return '%02i:%02i' % (minutes, seconds) + +def format_number(number, SI=0, space=' '): + """Turn numbers into human-readable metric-like numbers""" + symbols = ['', # (none) + 'k', # kilo + 'M', # mega + 'G', # giga + 'T', # tera + 'P', # peta + 'E', # exa + 'Z', # zetta + 'Y'] # yotta + + if SI: step = 1000.0 + else: step = 1024.0 + + thresh = 999 + depth = 0 + max_depth = len(symbols) - 1 + + # we want numbers between 0 and thresh, but don't exceed the length + # of our list. In that event, the formatting will be screwed up, + # but it'll still show the right number. + while number > thresh and depth < max_depth: + depth = depth + 1 + number = number / step + + if type(number) == type(1) or type(number) == type(1L): + # it's an int or a long, which means it didn't get divided, + # which means it's already short enough + format = '%i%s%s' + elif number < 9.95: + # must use 9.95 for proper sizing. For example, 9.99 will be + # rounded to 10.0 with the .1f format string (which is too long) + format = '%.1f%s%s' + else: + format = '%.0f%s%s' + + return(format % (float(number or 0), space, symbols[depth])) diff --git a/mic/urlgrabber/sslfactory.py b/mic/urlgrabber/sslfactory.py new file mode 100644 index 0000000..f7e6d3d --- /dev/null +++ b/mic/urlgrabber/sslfactory.py @@ -0,0 +1,89 @@ +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, +# Boston, MA 02111-1307 USA + +# This file is part of urlgrabber, a high-level cross-protocol url-grabber + +import httplib +import urllib2 + +try: + from M2Crypto import SSL + from M2Crypto import httpslib + from M2Crypto import m2urllib2 + + have_m2crypto = True +except ImportError: + have_m2crypto = False + +DEBUG = None + +if have_m2crypto: + + class M2SSLFactory: + + def __init__(self, ssl_ca_cert, ssl_context): + self.ssl_context = self._get_ssl_context(ssl_ca_cert, ssl_context) + + def _get_ssl_context(self, ssl_ca_cert, ssl_context): + """ + Create an ssl context using the CA cert file or ssl context. + + The CA cert is used first if it was passed as an option. If not, + then the supplied ssl context is used. If no ssl context was supplied, + None is returned. 
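+
+        For example (the path is illustrative only):
+
+            self._get_ssl_context('/etc/pki/tls/certs/ca-bundle.crt', None)
+
+        would return an SSL.Context that requires peer verification
+        against that CA bundle.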
+ """ + if ssl_ca_cert: + context = SSL.Context() + context.load_verify_locations(ssl_ca_cert) + context.set_verify(SSL.verify_peer, -1) + return context + else: + return ssl_context + + def create_https_connection(self, host, response_class = None): + connection = httplib.HTTPSConnection(host, self.ssl_context) + if response_class: + connection.response_class = response_class + return connection + + def create_opener(self, *handlers): + return m2urllib2.build_opener(self.ssl_context, *handlers) + + +class SSLFactory: + + def create_https_connection(self, host, response_class = None): + connection = httplib.HTTPSConnection(host) + if response_class: + connection.response_class = response_class + return connection + + def create_opener(self, *handlers): + return urllib2.build_opener(*handlers) + + + +def get_factory(ssl_ca_cert = None, ssl_context = None): + """ Return an SSLFactory, based on if M2Crypto is available. """ + if have_m2crypto: + return M2SSLFactory(ssl_ca_cert, ssl_context) + else: + # Log here if someone provides the args but we don't use them. + if ssl_ca_cert or ssl_context: + if DEBUG: + DEBUG.warning("SSL arguments supplied, but M2Crypto is not available. " + "Using Python SSL.") + return SSLFactory() diff --git a/setup.py b/setup.py index fe045c7..a3bc3f1 100644 --- a/setup.py +++ b/setup.py @@ -34,6 +34,7 @@ PACKAGES = [MOD_NAME, MOD_NAME + '/utils', MOD_NAME + '/imager', MOD_NAME + '/pluginbase', + MOD_NAME + '/urlgrabber', MOD_NAME + '/kickstart', MOD_NAME + '/kickstart/custom_commands', MOD_NAME + '/pykickstart', -- 2.7.4