There is a python-urlgrabber package in Tools; use it in mic instead of the embedded copy.
Change-Id: I64fde0066180281d7ded0c16c2e4f5c08a84b681
Signed-off-by: biao716.wang <biao716.wang@samsung.com>
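
The switch in mic itself is just an import change. A minimal sketch, assuming
the packaged python-urlgrabber exposes the same urlgrabber.grabber API as the
embedded copy did (the old mic-internal import path below is hypothetical):

    # before: the copy embedded in mic (hypothetical path)
    #from mic.utils.urlgrabber.grabber import URLGrabber, URLGrabError
    # after: the python-urlgrabber package from Tools
    from urlgrabber.grabber import URLGrabber, URLGrabError

    g = URLGrabber(retry=3, timeout=30.0)
    try:
        g.urlgrab('http://example.com/some.rpm', '/tmp/some.rpm')
    except URLGrabError as e:
        print '[Errno %i] %s' % (e.errno, e.strerror)
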
+++ /dev/null
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Library General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
-
-# Copyright 2002-2006 Michael D. Stenner, Ryan Tomayko
-
-# $Id: __init__.py,v 1.20 2006/09/22 00:58:55 mstenner Exp $
-
-"""A high-level cross-protocol url-grabber.
-
-Using urlgrabber, data can be fetched in three basic ways:
-
- urlgrab(url) copy the file to the local filesystem
- urlopen(url) open the remote file and return a file object
- (like urllib2.urlopen)
- urlread(url) return the contents of the file as a string
-
-When using these functions (or methods), urlgrabber supports the
-following features:
-
- * identical behavior for http://, ftp://, and file:// urls
- * http keepalive - faster downloads of many files by using
- only a single connection
- * byte ranges - fetch only a portion of the file
- * reget - for a urlgrab, resume a partial download
- * progress meters - the ability to report download progress
- automatically, even when using urlopen!
- * throttling - restrict bandwidth usage
- * retries - automatically retry a download if it fails. The
- number of retries and failure types are configurable.
- * authenticated server access for http and ftp
- * proxy support - support for authenticated http and ftp proxies
- * mirror groups - treat a list of mirrors as a single source,
- automatically switching mirrors if there is a failure.
-"""
-
-__version__ = '3.1.0'
-__date__ = '2006/09/21'
-__author__ = 'Michael D. Stenner <mstenner@linux.duke.edu>, ' \
- 'Ryan Tomayko <rtomayko@naeblis.cx>'
-__url__ = 'http://linux.duke.edu/projects/urlgrabber/'
-
-from grabber import urlgrab, urlopen, urlread
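
For reference, the three fetch styles listed in the docstring above, written
against the packaged module (URL and filenames are placeholders):

    from urlgrabber import urlgrab, urlopen, urlread

    # 1) copy to the local filesystem; returns the local filename
    local = urlgrab('http://example.com/file.txt', '/tmp/file.txt')

    # 2) file-like object, as with urllib2.urlopen
    fo = urlopen('http://example.com/file.txt')
    data = fo.read()
    fo.close()

    # 3) whole body as one string, capped here at 64 kB
    text = urlread('http://example.com/file.txt', limit=64*1024)
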
+++ /dev/null
-# This library is free software; you can redistribute it and/or
-# modify it under the terms of the GNU Lesser General Public
-# License as published by the Free Software Foundation; either
-# version 2.1 of the License, or (at your option) any later version.
-#
-# This library is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-# Lesser General Public License for more details.
-#
-# You should have received a copy of the GNU Lesser General Public
-# License along with this library; if not, write to the
-# Free Software Foundation, Inc.,
-# 59 Temple Place, Suite 330,
-# Boston, MA 02111-1307 USA
-
-# This file is part of urlgrabber, a high-level cross-protocol url-grabber
-# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
-
-# $Id: byterange.py,v 1.12 2006/07/20 20:15:58 mstenner Exp $
-
-import os
-import stat
-import urllib
-import urllib2
-import rfc822
-
-DEBUG = None
-
-try:
- from cStringIO import StringIO
-except ImportError as msg:
- from StringIO import StringIO
-
-class RangeError(IOError):
- """Error raised when an unsatisfiable range is requested."""
- pass
-
-class HTTPRangeHandler(urllib2.BaseHandler):
- """Handler that enables HTTP Range headers.
-
-    This was extremely simple. The Range header is an HTTP feature to
-    begin with, so all this class does is tell urllib2 that the
-    "206 Partial Content" response from the HTTP server is what we
-    expected.
-
- Example:
- import urllib2
- import byterange
-
-        range_handler = byterange.HTTPRangeHandler()
- opener = urllib2.build_opener(range_handler)
-
- # install it
- urllib2.install_opener(opener)
-
- # create Request and set Range header
- req = urllib2.Request('http://www.python.org/')
-        req.add_header('Range', 'bytes=30-50')
- f = urllib2.urlopen(req)
- """
-
- def http_error_206(self, req, fp, code, msg, hdrs):
- # 206 Partial Content Response
- r = urllib.addinfourl(fp, hdrs, req.get_full_url())
- r.code = code
- r.msg = msg
- return r
-
- def http_error_416(self, req, fp, code, msg, hdrs):
- # HTTP's Range Not Satisfiable error
- raise RangeError('Requested Range Not Satisfiable')
-
-class HTTPSRangeHandler(HTTPRangeHandler):
- """ Range Header support for HTTPS. """
-
- def https_error_206(self, req, fp, code, msg, hdrs):
- return self.http_error_206(req, fp, code, msg, hdrs)
-
-    def https_error_416(self, req, fp, code, msg, hdrs):
-        return self.http_error_416(req, fp, code, msg, hdrs)
-
-class RangeableFileObject:
- """File object wrapper to enable raw range handling.
-    This was implemented primarily for handling range
- specifications for file:// urls. This object effectively makes
- a file object look like it consists only of a range of bytes in
- the stream.
-
- Examples:
- # expose 10 bytes, starting at byte position 20, from
-    # /etc/passwd.
- >>> fo = RangeableFileObject(file('/etc/passwd', 'r'), (20,30))
- # seek seeks within the range (to position 23 in this case)
- >>> fo.seek(3)
-    # tell tells where you are _within the range_ (position 3 in
- # this case)
- >>> fo.tell()
- # read EOFs if an attempt is made to read past the last
- # byte in the range. the following will return only 7 bytes.
- >>> fo.read(30)
- """
-
- def __init__(self, fo, rangetup):
- """Create a RangeableFileObject.
- fo -- a file like object. only the read() method need be
- supported but supporting an optimized seek() is
- preferable.
- rangetup -- a (firstbyte,lastbyte) tuple specifying the range
- to work over.
- The file object provided is assumed to be at byte offset 0.
- """
- self.fo = fo
- (self.firstbyte, self.lastbyte) = range_tuple_normalize(rangetup)
- self.realpos = 0
- self._do_seek(self.firstbyte)
-
- def __getattr__(self, name):
- """This effectively allows us to wrap at the instance level.
- Any attribute not found in _this_ object will be searched for
- in self.fo. This includes methods."""
- if hasattr(self.fo, name):
- return getattr(self.fo, name)
- raise AttributeError (name)
-
- def tell(self):
- """Return the position within the range.
- This is different from fo.seek in that position 0 is the
- first byte position of the range tuple. For example, if
- this object was created with a range tuple of (500,899),
- tell() will return 0 when at byte position 500 of the file.
- """
- return (self.realpos - self.firstbyte)
-
- def seek(self,offset,whence=0):
- """Seek within the byte range.
- Positioning is identical to that described under tell().
- """
- assert whence in (0, 1, 2)
- if whence == 0: # absolute seek
- realoffset = self.firstbyte + offset
- elif whence == 1: # relative seek
- realoffset = self.realpos + offset
- elif whence == 2: # absolute from end of file
- # XXX: are we raising the right Error here?
- raise IOError('seek from end of file not supported.')
-
- # do not allow seek past lastbyte in range
- if self.lastbyte and (realoffset >= self.lastbyte):
- realoffset = self.lastbyte
-
- self._do_seek(realoffset - self.realpos)
-
- def read(self, size=-1):
- """Read within the range.
- This method will limit the size read based on the range.
- """
- size = self._calc_read_size(size)
- rslt = self.fo.read(size)
- self.realpos += len(rslt)
- return rslt
-
- def readline(self, size=-1):
- """Read lines within the range.
- This method will limit the size read based on the range.
- """
- size = self._calc_read_size(size)
- rslt = self.fo.readline(size)
- self.realpos += len(rslt)
- return rslt
-
- def _calc_read_size(self, size):
- """Handles calculating the amount of data to read based on
- the range.
- """
- if self.lastbyte:
- if size > -1:
- if ((self.realpos + size) >= self.lastbyte):
- size = (self.lastbyte - self.realpos)
- else:
- size = (self.lastbyte - self.realpos)
- return size
-
- def _do_seek(self,offset):
- """Seek based on whether wrapped object supports seek().
- offset is relative to the current position (self.realpos).
- """
- assert offset >= 0
- if not hasattr(self.fo, 'seek'):
- self._poor_mans_seek(offset)
- else:
- self.fo.seek(self.realpos + offset)
-        self.realpos += offset
-
- def _poor_mans_seek(self,offset):
- """Seek by calling the wrapped file objects read() method.
- This is used for file like objects that do not have native
- seek support. The wrapped objects read() method is called
- to manually seek to the desired position.
- offset -- read this number of bytes from the wrapped
- file object.
- raise RangeError if we encounter EOF before reaching the
- specified offset.
- """
- pos = 0
- bufsize = 1024
- while pos < offset:
- if (pos + bufsize) > offset:
- bufsize = offset - pos
- buf = self.fo.read(bufsize)
- if len(buf) != bufsize:
- raise RangeError('Requested Range Not Satisfiable')
-            pos += bufsize
-
-class FileRangeHandler(urllib2.FileHandler):
- """FileHandler subclass that adds Range support.
- This class handles Range headers exactly like an HTTP
- server would.
- """
- def open_local_file(self, req):
- import mimetypes
- import mimetools
- host = req.get_host()
- file = req.get_selector()
- localfile = urllib.url2pathname(file)
- stats = os.stat(localfile)
- size = stats[stat.ST_SIZE]
- modified = rfc822.formatdate(stats[stat.ST_MTIME])
- mtype = mimetypes.guess_type(file)[0]
- if host:
- host, port = urllib.splitport(host)
- if port or socket.gethostbyname(host) not in self.get_names():
- raise urllib2.URLError('file not on local host')
- fo = open(localfile,'rb')
- brange = req.headers.get('Range',None)
- brange = range_header_to_tuple(brange)
- assert brange != ()
- if brange:
- (fb,lb) = brange
- if lb == '': lb = size
- if fb < 0 or fb > size or lb > size:
- raise RangeError('Requested Range Not Satisfiable')
- size = (lb - fb)
- fo = RangeableFileObject(fo, (fb,lb))
- headers = mimetools.Message(StringIO(
- 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
- (mtype or 'text/plain', size, modified)))
- return urllib.addinfourl(fo, headers, 'file:'+file)
-
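
A minimal sketch of wiring the handlers above into urllib2 to read a byte
slice of a local file (Python 2; the path is only an example):

    import urllib2
    import byterange

    opener = urllib2.build_opener(byterange.FileRangeHandler(),
                                  byterange.HTTPRangeHandler())
    urllib2.install_opener(opener)

    req = urllib2.Request('file:///etc/hosts')
    req.add_header('Range', 'bytes=0-99')   # first 100 bytes
    try:
        print urllib2.urlopen(req).read()
    except byterange.RangeError as e:
        print 'range not satisfiable:', e
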
-
-# FTP Range Support
-# Unfortunately, a large amount of base FTP code had to be copied
-# from urllib and urllib2 in order to insert the FTP REST command.
-# Code modifications for range support have been commented as
-# follows:
-# -- range support modifications start/end here
-
-from urllib import splitport, splituser, splitpasswd, splitattr, \
-                   splitvalue, unquote, addclosehook, addinfourl
-import ftplib
-import socket
-import sys
-import mimetypes
-import mimetools
-
-class FTPRangeHandler(urllib2.FTPHandler):
- def ftp_open(self, req):
- host = req.get_host()
- if not host:
- raise IOError ('ftp error', 'no host given')
- host, port = splitport(host)
- if port is None:
- port = ftplib.FTP_PORT
-
- # username/password handling
- user, host = splituser(host)
- if user:
- user, passwd = splitpasswd(user)
- else:
- passwd = None
- host = unquote(host)
- user = unquote(user or '')
- passwd = unquote(passwd or '')
-
- try:
- host = socket.gethostbyname(host)
- except socket.error as msg:
- raise urllib2.URLError(msg)
- path, attrs = splitattr(req.get_selector())
- dirs = path.split('/')
- dirs = map(unquote, dirs)
- dirs, file = dirs[:-1], dirs[-1]
- if dirs and not dirs[0]:
- dirs = dirs[1:]
- try:
- fw = self.connect_ftp(user, passwd, host, port, dirs)
- type = file and 'I' or 'D'
- for attr in attrs:
-                attr, value = splitvalue(attr) # 'type=a' -> ('type', 'a')
- if attr.lower() == 'type' and \
- value in ('a', 'A', 'i', 'I', 'd', 'D'):
- type = value.upper()
-
- # -- range support modifications start here
- rest = None
- range_tup = range_header_to_tuple(req.headers.get('Range',None))
- assert range_tup != ()
- if range_tup:
- (fb,lb) = range_tup
- if fb > 0: rest = fb
- # -- range support modifications end here
-
- fp, retrlen = fw.retrfile(file, type, rest)
-
- # -- range support modifications start here
- if range_tup:
- (fb,lb) = range_tup
- if lb == '':
- if retrlen is None or retrlen == 0:
- raise RangeError('Requested Range Not Satisfiable due to unobtainable file length.')
- lb = retrlen
- retrlen = lb - fb
- if retrlen < 0:
- # beginning of range is larger than file
- raise RangeError('Requested Range Not Satisfiable')
- else:
- retrlen = lb - fb
- fp = RangeableFileObject(fp, (0,retrlen))
- # -- range support modifications end here
-
- headers = ""
- mtype = mimetypes.guess_type(req.get_full_url())[0]
- if mtype:
- headers += "Content-Type: %s\n" % mtype
- if retrlen is not None and retrlen >= 0:
- headers += "Content-Length: %d\n" % retrlen
- sf = StringIO(headers)
- headers = mimetools.Message(sf)
- return addinfourl(fp, headers, req.get_full_url())
- except ftplib.all_errors as msg:
-            raise IOError('ftp error', msg)
-
- def connect_ftp(self, user, passwd, host, port, dirs):
- fw = ftpwrapper(user, passwd, host, port, dirs)
- return fw
-
-class ftpwrapper(urllib.ftpwrapper):
- # range support note:
- # this ftpwrapper code is copied directly from
- # urllib. The only enhancement is to add the rest
- # argument and pass it on to ftp.ntransfercmd
- def retrfile(self, file, type, rest=None):
- self.endtransfer()
- if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
- else: cmd = 'TYPE ' + type; isdir = 0
- try:
- self.ftp.voidcmd(cmd)
- except ftplib.all_errors:
- self.init()
- self.ftp.voidcmd(cmd)
- conn = None
- if file and not isdir:
- # Use nlst to see if the file exists at all
- try:
- self.ftp.nlst(file)
- except ftplib.error_perm as reason:
-                raise IOError('ftp error', reason)
- # Restore the transfer mode!
- self.ftp.voidcmd(cmd)
- # Try to retrieve as a file
- try:
- cmd = 'RETR ' + file
- conn = self.ftp.ntransfercmd(cmd, rest)
- except ftplib.error_perm as reason:
- if str(reason)[:3] == '501':
- # workaround for REST not supported error
- fp, retrlen = self.retrfile(file, type)
- fp = RangeableFileObject(fp, (rest,''))
- return (fp, retrlen)
- elif str(reason)[:3] != '550':
-                    raise IOError('ftp error', reason)
- if not conn:
- # Set transfer mode to ASCII!
- self.ftp.voidcmd('TYPE A')
- # Try a directory listing
- if file: cmd = 'LIST ' + file
- else: cmd = 'LIST'
- conn = self.ftp.ntransfercmd(cmd)
- self.busy = 1
- # Pass back both a suitably decorated object and a retrieval length
- return (addclosehook(conn[0].makefile('rb'),
- self.endtransfer), conn[1])
-
-
-####################################################################
-# Range Tuple Functions
-# XXX: These range tuple functions might go better in a class.
-
-_rangere = None
-def range_header_to_tuple(range_header):
- """Get a (firstbyte,lastbyte) tuple from a Range header value.
-
- Range headers have the form "bytes=<firstbyte>-<lastbyte>". This
- function pulls the firstbyte and lastbyte values and returns
- a (firstbyte,lastbyte) tuple. If lastbyte is not specified in
- the header value, it is returned as an empty string in the
- tuple.
-
- Return None if range_header is None
- Return () if range_header does not conform to the range spec
- pattern.
-
- """
- global _rangere
- if range_header is None: return None
- if _rangere is None:
- import re
- _rangere = re.compile(r'^bytes=(\d{1,})-(\d*)')
- match = _rangere.match(range_header)
- if match:
- tup = range_tuple_normalize(match.group(1,2))
- if tup and tup[1]:
- tup = (tup[0],tup[1]+1)
- return tup
- return ()
-
-def range_tuple_to_header(range_tup):
- """Convert a range tuple to a Range header value.
- Return a string of the form "bytes=<firstbyte>-<lastbyte>" or None
- if no range is needed.
- """
- if range_tup is None: return None
- range_tup = range_tuple_normalize(range_tup)
- if range_tup:
- if range_tup[1]:
- range_tup = (range_tup[0],range_tup[1] - 1)
- return 'bytes=%s-%s' % range_tup
-
-def range_tuple_normalize(range_tup):
- """Normalize a (first_byte,last_byte) range tuple.
- Return a tuple whose first element is guaranteed to be an int
- and whose second element will be '' (meaning: the last byte) or
- an int. Finally, return None if the normalized tuple == (0,'')
-    as that is equivalent to retrieving the entire file.
- """
- if range_tup is None: return None
- # handle first byte
- fb = range_tup[0]
- if fb in (None,''): fb = 0
- else: fb = int(fb)
- # handle last byte
- try: lb = range_tup[1]
- except IndexError: lb = ''
- else:
- if lb is None: lb = ''
- elif lb != '': lb = int(lb)
- # check if range is over the entire file
- if (fb,lb) == (0,''): return None
- # check that the range is valid
- if lb < fb: raise RangeError('Invalid byte range: %s-%s' % (fb,lb))
- return (fb,lb)
-
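
The header/tuple conventions above are easy to get backwards: the Range
header is inclusive, while the tuple is python-slice-like. A doctest-style
sketch of the round trip:

    >>> from byterange import range_header_to_tuple, range_tuple_to_header
    >>> range_header_to_tuple('bytes=30-50')    # inclusive header in ...
    (30, 51)
    >>> range_tuple_to_header((30, 51))         # ... python-like tuple out
    'bytes=30-50'
    >>> range_header_to_tuple('bytes=30-')      # open-ended range
    (30, '')
    >>> print range_header_to_tuple(None)       # no header at all
    None
    >>> range_header_to_tuple('junk')           # malformed header
    ()
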
+++ /dev/null
-# This library is free software; you can redistribute it and/or
-# modify it under the terms of the GNU Lesser General Public
-# License as published by the Free Software Foundation; either
-# version 2.1 of the License, or (at your option) any later version.
-#
-# This library is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-# Lesser General Public License for more details.
-#
-# You should have received a copy of the GNU Lesser General Public
-# License along with this library; if not, write to the
-# Free Software Foundation, Inc.,
-# 59 Temple Place, Suite 330,
-# Boston, MA 02111-1307 USA
-
-# This file is part of urlgrabber, a high-level cross-protocol url-grabber
-# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
-
-"""A high-level cross-protocol url-grabber.
-
-GENERAL ARGUMENTS (kwargs)
-
- Where possible, the module-level default is indicated, and legal
- values are provided.
-
- copy_local = 0 [0|1]
-
- ignored except for file:// urls, in which case it specifies
- whether urlgrab should still make a copy of the file, or simply
- point to the existing copy. The module level default for this
- option is 0.
-
- close_connection = 0 [0|1]
-
- tells URLGrabber to close the connection after a file has been
-    transferred. This is ignored unless the download happens with the
- http keepalive handler (keepalive=1). Otherwise, the connection
- is left open for further use. The module level default for this
- option is 0 (keepalive connections will not be closed).
-
- keepalive = 1 [0|1]
-
- specifies whether keepalive should be used for HTTP/1.1 servers
- that support it. The module level default for this option is 1
- (keepalive is enabled).
-
- progress_obj = None
-
- a class instance that supports the following methods:
- po.start(filename, url, basename, length, text)
- # length will be None if unknown
- po.update(read) # read == bytes read so far
- po.end()
-
- text = None
-
-    specifies an alternative text item at the beginning of the progress
- bar line. If not given, the basename of the file is used.
-
- throttle = 1.0
-
- a number - if it's an int, it's the bytes/second throttle limit.
- If it's a float, it is first multiplied by bandwidth. If throttle
- == 0, throttling is disabled. If None, the module-level default
- (which can be set on default_grabber.throttle) is used. See
- BANDWIDTH THROTTLING for more information.
-
- timeout = None
-
- a positive float expressing the number of seconds to wait for socket
- operations. If the value is None or 0.0, socket operations will block
- forever. Setting this option causes urlgrabber to call the settimeout
- method on the Socket object used for the request. See the Python
- documentation on settimeout for more information.
- http://www.python.org/doc/current/lib/socket-objects.html
-
- bandwidth = 0
-
- the nominal max bandwidth in bytes/second. If throttle is a float
- and bandwidth == 0, throttling is disabled. If None, the
- module-level default (which can be set on
- default_grabber.bandwidth) is used. See BANDWIDTH THROTTLING for
- more information.
-
- range = None
-
-    a tuple of the form (first_byte, last_byte) describing a byte
-    range to retrieve. Either or both of the values may be set to
-    None. If first_byte is None, byte offset 0 is assumed. If
-    last_byte is None, the last byte available is assumed. Note that
-    the range specification is python-like in that (0,10) will yield
-    the first 10 bytes of the file.
-
- If set to None, no range will be used.
-
- reget = None [None|'simple'|'check_timestamp']
-
- whether to attempt to reget a partially-downloaded file. Reget
- only applies to .urlgrab and (obviously) only if there is a
- partially downloaded file. Reget has two modes:
-
- 'simple' -- the local file will always be trusted. If there
- are 100 bytes in the local file, then the download will always
- begin 100 bytes into the requested file.
-
- 'check_timestamp' -- the timestamp of the server file will be
- compared to the timestamp of the local file. ONLY if the
- local file is newer than or the same age as the server file
- will reget be used. If the server file is newer, or the
- timestamp is not returned, the entire file will be fetched.
-
- NOTE: urlgrabber can do very little to verify that the partial
- file on disk is identical to the beginning of the remote file.
- You may want to either employ a custom "checkfunc" or simply avoid
- using reget in situations where corruption is a concern.
-
- user_agent = 'urlgrabber/VERSION'
-
- a string, usually of the form 'AGENT/VERSION' that is provided to
- HTTP servers in the User-agent header. The module level default
- for this option is "urlgrabber/VERSION".
-
- http_headers = None
-
- a tuple of 2-tuples, each containing a header and value. These
- will be used for http and https requests only. For example, you
- can do
- http_headers = (('Pragma', 'no-cache'),)
-
- ftp_headers = None
-
- this is just like http_headers, but will be used for ftp requests.
-
- proxies = None
-
- a dictionary that maps protocol schemes to proxy hosts. For
- example, to use a proxy server on host "foo" port 3128 for http
- and https URLs:
- proxies={ 'http' : 'http://foo:3128', 'https' : 'http://foo:3128' }
- note that proxy authentication information may be provided using
- normal URL constructs:
- proxies={ 'http' : 'http://user:host@foo:3128' }
- Lastly, if proxies is None, the default environment settings will
- be used.
-
- prefix = None
-
- a url prefix that will be prepended to all requested urls. For
- example:
- g = URLGrabber(prefix='http://foo.com/mirror/')
- g.urlgrab('some/file.txt')
- ## this will fetch 'http://foo.com/mirror/some/file.txt'
- This option exists primarily to allow identical behavior to
- MirrorGroup (and derived) instances. Note: a '/' will be inserted
- if necessary, so you cannot specify a prefix that ends with a
- partial file or directory name.
-
- opener = None
-
- Overrides the default urllib2.OpenerDirector provided to urllib2
- when making requests. This option exists so that the urllib2
- handler chain may be customized. Note that the range, reget,
- proxy, and keepalive features require that custom handlers be
- provided to urllib2 in order to function properly. If an opener
- option is provided, no attempt is made by urlgrabber to ensure
- chain integrity. You are responsible for ensuring that any
- extension handlers are present if said features are required.
-
- data = None
-
- Only relevant for the HTTP family (and ignored for other
- protocols), this allows HTTP POSTs. When the data kwarg is
- present (and not None), an HTTP request will automatically become
- a POST rather than GET. This is done by direct passthrough to
- urllib2. If you use this, you may also want to set the
- 'Content-length' and 'Content-type' headers with the http_headers
-    option. Note that python 2.2 handles the letter case of these
-    header names badly, and if you do not use the proper case (shown
-    here), your values will be overridden with the defaults.
-
-
-RETRY RELATED ARGUMENTS
-
- retry = None
-
- the number of times to retry the grab before bailing. If this is
- zero, it will retry forever. This was intentional... really, it
-    was :). If this value is not supplied, or is supplied but is None,
-    retrying does not occur. (A combined sketch of these retry options
-    follows this docstring.)
-
- retrycodes = [-1,2,4,5,6,7]
-
- a sequence of errorcodes (values of e.errno) for which it should
- retry. See the doc on URLGrabError for more details on this. You
- might consider modifying a copy of the default codes rather than
- building yours from scratch so that if the list is extended in the
- future (or one code is split into two) you can still enjoy the
- benefits of the default list. You can do that with something like
- this:
-
- retrycodes = urlgrabber.grabber.URLGrabberOptions().retrycodes
- if 12 not in retrycodes:
- retrycodes.append(12)
-
- checkfunc = None
-
- a function to do additional checks. This defaults to None, which
- means no additional checking. The function should simply return
- on a successful check. It should raise URLGrabError on an
- unsuccessful check. Raising of any other exception will be
- considered immediate failure and no retries will occur.
-
- If it raises URLGrabError, the error code will determine the retry
- behavior. Negative error numbers are reserved for use by these
- passed in functions, so you can use many negative numbers for
- different types of failure. By default, -1 results in a retry,
- but this can be customized with retrycodes.
-
- If you simply pass in a function, it will be given exactly one
- argument: a CallbackObject instance with the .url attribute
- defined and either .filename (for urlgrab) or .data (for urlread).
- For urlgrab, .filename is the name of the local file. For
- urlread, .data is the actual string data. If you need other
- arguments passed to the callback (program state of some sort), you
- can do so like this:
-
- checkfunc=(function, ('arg1', 2), {'kwarg': 3})
-
- if the downloaded file has filename /tmp/stuff, then this will
- result in this call (for urlgrab):
-
- function(obj, 'arg1', 2, kwarg=3)
- # obj.filename = '/tmp/stuff'
- # obj.url = 'http://foo.com/stuff'
-
- NOTE: both the "args" tuple and "kwargs" dict must be present if
- you use this syntax, but either (or both) can be empty.
-
- failure_callback = None
-
- The callback that gets called during retries when an attempt to
- fetch a file fails. The syntax for specifying the callback is
- identical to checkfunc, except for the attributes defined in the
- CallbackObject instance. The attributes for failure_callback are:
-
- exception = the raised exception
- url = the url we're trying to fetch
- tries = the number of tries so far (including this one)
- retry = the value of the retry option
-
- The callback is present primarily to inform the calling program of
- the failure, but if it raises an exception (including the one it's
- passed) that exception will NOT be caught and will therefore cause
- future retries to be aborted.
-
- The callback is called for EVERY failure, including the last one.
- On the last try, the callback can raise an alternate exception,
- but it cannot (without severe trickiness) prevent the exception
- from being raised.
-
- interrupt_callback = None
-
- This callback is called if KeyboardInterrupt is received at any
- point in the transfer. Basically, this callback can have three
- impacts on the fetch process based on the way it exits:
-
- 1) raise no exception: the current fetch will be aborted, but
- any further retries will still take place
-
- 2) raise a URLGrabError: if you're using a MirrorGroup, then
- this will prompt a failover to the next mirror according to
- the behavior of the MirrorGroup subclass. It is recommended
- that you raise URLGrabError with code 15, 'user abort'. If
- you are NOT using a MirrorGroup subclass, then this is the
- same as (3).
-
- 3) raise some other exception (such as KeyboardInterrupt), which
- will not be caught at either the grabber or mirror levels.
- That is, it will be raised up all the way to the caller.
-
- This callback is very similar to failure_callback. They are
- passed the same arguments, so you could use the same function for
- both.
-
- urlparser = URLParser()
-
- The URLParser class handles pre-processing of URLs, including
- auth-handling for user/pass encoded in http urls, file handing
- (that is, filenames not sent as a URL), and URL quoting. If you
- want to override any of this behavior, you can pass in a
- replacement instance. See also the 'quote' option.
-
- quote = None
-
- Whether or not to quote the path portion of a url.
- quote = 1 -> quote the URLs (they're not quoted yet)
- quote = 0 -> do not quote them (they're already quoted)
- quote = None -> guess what to do
-
- This option only affects proper urls like 'file:///etc/passwd'; it
- does not affect 'raw' filenames like '/etc/passwd'. The latter
- will always be quoted as they are converted to URLs. Also, only
- the path part of a url is quoted. If you need more fine-grained
- control, you should probably subclass URLParser and pass it in via
- the 'urlparser' option.
-
-BANDWIDTH THROTTLING
-
- urlgrabber supports throttling via two values: throttle and
-  bandwidth. Between the two, you can either specify an absolute
-  throttle threshold or specify a threshold as a fraction of maximum
- available bandwidth.
-
- throttle is a number - if it's an int, it's the bytes/second
- throttle limit. If it's a float, it is first multiplied by
- bandwidth. If throttle == 0, throttling is disabled. If None, the
- module-level default (which can be set with set_throttle) is used.
-
- bandwidth is the nominal max bandwidth in bytes/second. If throttle
- is a float and bandwidth == 0, throttling is disabled. If None, the
- module-level default (which can be set with set_bandwidth) is used.
-
- THROTTLING EXAMPLES:
-
-  Let's say you have a 100 Mbps connection. This is (about) 10^8 bits
- per second, or 12,500,000 Bytes per second. You have a number of
- throttling options:
-
- *) set_bandwidth(12500000); set_throttle(0.5) # throttle is a float
-
- This will limit urlgrab to use half of your available bandwidth.
-
- *) set_throttle(6250000) # throttle is an int
-
- This will also limit urlgrab to use half of your available
- bandwidth, regardless of what bandwidth is set to.
-
-  *) set_bandwidth(6250000); set_throttle(1.0) # float
-
- Use half your bandwidth
-
-  *) set_bandwidth(6250000); set_throttle(2.0) # float
-
- Use up to 12,500,000 Bytes per second (your nominal max bandwidth)
-
-  *) set_bandwidth(6250000); set_throttle(0) # throttle = 0
-
- Disable throttling - this is more efficient than a very large
- throttle setting.
-
-  *) set_bandwidth(0); set_throttle(1.0) # throttle is float, bandwidth = 0
-
- Disable throttling - this is the default when the module is loaded.
-
- SUGGESTED AUTHOR IMPLEMENTATION (THROTTLING)
-
- While this is flexible, it's not extremely obvious to the user. I
- suggest you implement a float throttle as a percent to make the
- distinction between absolute and relative throttling very explicit.
-
- Also, you may want to convert the units to something more convenient
- than bytes/second, such as kbps or kB/s, etc.
-
-"""
-
-# $Id: grabber.py,v 1.48 2006/09/22 00:58:05 mstenner Exp $
-
-import os
-import os.path
-import sys
-import urlparse
-import rfc822
-import time
-import string
-import urllib
-import urllib2
-from stat import * # S_* and ST_*
-
-########################################################################
-# MODULE INITIALIZATION
-########################################################################
-try:
- exec('from ' + (__name__.split('.'))[0] + ' import __version__')
-except ImportError:
- __version__ = '???'
-
-import sslfactory
-
-auth_handler = urllib2.HTTPBasicAuthHandler( \
- urllib2.HTTPPasswordMgrWithDefaultRealm())
-
-try:
- from i18n import _
-except ImportError as msg:
- def _(st): return st
-
-try:
- from httplib import HTTPException
-except ImportError as msg:
- HTTPException = None
-
-try:
- # This is a convenient way to make keepalive optional.
- # Just rename the module so it can't be imported.
- import keepalive
- from keepalive import HTTPHandler, HTTPSHandler
- have_keepalive = True
-except ImportError as msg:
- have_keepalive = False
-
-try:
- # add in range support conditionally too
- import byterange
- from byterange import HTTPRangeHandler, HTTPSRangeHandler, \
- FileRangeHandler, FTPRangeHandler, range_tuple_normalize, \
- range_tuple_to_header, RangeError
-except ImportError as msg:
- range_handlers = ()
- RangeError = None
- have_range = 0
-else:
- range_handlers = (HTTPRangeHandler(), HTTPSRangeHandler(),
- FileRangeHandler(), FTPRangeHandler())
- have_range = 1
-
-
-# check whether socket timeout support is available (Python >= 2.3)
-import socket
-try:
- TimeoutError = socket.timeout
- have_socket_timeout = True
-except AttributeError:
- TimeoutError = None
- have_socket_timeout = False
-
-########################################################################
-# functions for debugging output. These functions are here because they
-# are also part of the module initialization.
-DEBUG = None
-def set_logger(DBOBJ):
- """Set the DEBUG object. This is called by _init_default_logger when
- the environment variable URLGRABBER_DEBUG is set, but can also be
- called by a calling program. Basically, if the calling program uses
- the logging module and would like to incorporate urlgrabber logging,
- then it can do so this way. It's probably not necessary as most
- internal logging is only for debugging purposes.
-
- The passed-in object should be a logging.Logger instance. It will
- be pushed into the keepalive and byterange modules if they're
- being used. The mirror module pulls this object in on import, so
-    you will need to push it in manually. In fact, you may find it
- tidier to simply push your logging object (or objects) into each
- of these modules independently.
- """
-
- global DEBUG
- DEBUG = DBOBJ
- if have_keepalive and keepalive.DEBUG is None:
- keepalive.DEBUG = DBOBJ
- if have_range and byterange.DEBUG is None:
- byterange.DEBUG = DBOBJ
- if sslfactory.DEBUG is None:
- sslfactory.DEBUG = DBOBJ
-
-def _init_default_logger():
- '''Examines the environment variable URLGRABBER_DEBUG and creates
- a logging object (logging.logger) based on the contents. It takes
- the form
-
- URLGRABBER_DEBUG=level,filename
-
- where "level" can be either an integer or a log level from the
- logging module (DEBUG, INFO, etc). If the integer is zero or
- less, logging will be disabled. Filename is the filename where
- logs will be sent. If it is "-", then stdout will be used. If
- the filename is empty or missing, stderr will be used. If the
- variable cannot be processed or the logging module cannot be
- imported (python < 2.3) then logging will be disabled. Here are
- some examples:
-
- URLGRABBER_DEBUG=1,debug.txt # log everything to debug.txt
- URLGRABBER_DEBUG=WARNING,- # log warning and higher to stdout
- URLGRABBER_DEBUG=INFO # log info and higher to stderr
-
-    This function is called during module initialization. It is not
- intended to be called from outside. The only reason it is a
- function at all is to keep the module-level namespace tidy and to
- collect the code into a nice block.'''
-
- try:
- dbinfo = os.environ['URLGRABBER_DEBUG'].split(',')
- import logging
-        # look the level name up first; int() as the .get() default would
-        # be evaluated eagerly and raise on names like "WARNING"
-        level = logging._levelNames.get(dbinfo[0])
-        if level is None: level = int(dbinfo[0])
- if level < 1: raise ValueError()
-
- formatter = logging.Formatter('%(asctime)s %(message)s')
- if len(dbinfo) > 1: filename = dbinfo[1]
- else: filename = ''
- if filename == '': handler = logging.StreamHandler(sys.stderr)
- elif filename == '-': handler = logging.StreamHandler(sys.stdout)
- else: handler = logging.FileHandler(filename)
- handler.setFormatter(formatter)
- DBOBJ = logging.getLogger('urlgrabber')
- DBOBJ.addHandler(handler)
- DBOBJ.setLevel(level)
- except (KeyError, ImportError, ValueError):
- DBOBJ = None
- set_logger(DBOBJ)
-
-_init_default_logger()
-########################################################################
-# END MODULE INITIALIZATION
-########################################################################
-
-
-
-class URLGrabError(IOError):
- """
- URLGrabError error codes:
-
- URLGrabber error codes (0 -- 255)
- 0 - everything looks good (you should never see this)
- 1 - malformed url
- 2 - local file doesn't exist
- 3 - request for non-file local file (dir, etc)
- 4 - IOError on fetch
- 5 - OSError on fetch
- 6 - no content length header when we expected one
- 7 - HTTPException
- 8 - Exceeded read limit (for urlread)
- 9 - Requested byte range not satisfiable.
- 10 - Byte range requested, but range support unavailable
- 11 - Illegal reget mode
- 12 - Socket timeout
- 13 - malformed proxy url
- 14 - HTTPError (includes .code and .exception attributes)
- 15 - user abort
-
- MirrorGroup error codes (256 -- 511)
- 256 - No more mirrors left to try
-
- Custom (non-builtin) classes derived from MirrorGroup (512 -- 767)
- [ this range reserved for application-specific error codes ]
-
- Retry codes (< 0)
- -1 - retry the download, unknown reason
-
- Note: to test which group a code is in, you can simply do integer
- division by 256: e.errno / 256
-
- Negative codes are reserved for use by functions passed in to
- retrygrab with checkfunc. The value -1 is built in as a generic
- retry code and is already included in the retrycodes list.
- Therefore, you can create a custom check function that simply
- returns -1 and the fetch will be re-tried. For more customized
- retries, you can use other negative number and include them in
- retry-codes. This is nice for outputting useful messages about
- what failed.
-
- You can use these error codes like so:
- try: urlgrab(url)
-    except URLGrabError as e:
- if e.errno == 3: ...
- # or
- print e.strerror
- # or simply
- print e #### print '[Errno %i] %s' % (e.errno, e.strerror)
- """
- pass
-
-class CallbackObject:
- """Container for returned callback data.
-
- This is currently a dummy class into which urlgrabber can stuff
- information for passing to callbacks. This way, the prototype for
- all callbacks is the same, regardless of the data that will be
- passed back. Any function that accepts a callback function as an
- argument SHOULD document what it will define in this object.
-
- It is possible that this class will have some greater
- functionality in the future.
- """
- def __init__(self, **kwargs):
- self.__dict__.update(kwargs)
-
-def urlgrab(url, filename=None, **kwargs):
- """grab the file at <url> and make a local copy at <filename>
-    If filename is None, the basename of the url is used.
- urlgrab returns the filename of the local file, which may be different
- from the passed-in filename if the copy_local kwarg == 0.
-
- See module documentation for a description of possible kwargs.
- """
- return default_grabber.urlgrab(url, filename, **kwargs)
-
-def urlopen(url, **kwargs):
- """open the url and return a file object
- If a progress object or throttle specifications exist, then
- a special file object will be returned that supports them.
- The file object can be treated like any other file object.
-
- See module documentation for a description of possible kwargs.
- """
- return default_grabber.urlopen(url, **kwargs)
-
-def urlread(url, limit=None, **kwargs):
- """read the url into a string, up to 'limit' bytes
- If the limit is exceeded, an exception will be thrown. Note that urlread
- is NOT intended to be used as a way of saying "I want the first N bytes"
- but rather 'read the whole file into memory, but don't use too much'
-
- See module documentation for a description of possible kwargs.
- """
- return default_grabber.urlread(url, limit, **kwargs)
-
-
-class URLParser:
- """Process the URLs before passing them to urllib2.
-
- This class does several things:
-
- * add any prefix
- * translate a "raw" file to a proper file: url
- * handle any http or https auth that's encoded within the url
- * quote the url
-
- Only the "parse" method is called directly, and it calls sub-methods.
-
- An instance of this class is held in the options object, which
- means that it's easy to change the behavior by sub-classing and
- passing the replacement in. It need only have a method like:
-
- url, parts = urlparser.parse(url, opts)
- """
-
- def parse(self, url, opts):
- """parse the url and return the (modified) url and its parts
-
- Note: a raw file WILL be quoted when it's converted to a URL.
- However, other urls (ones which come with a proper scheme) may
- or may not be quoted according to opts.quote
-
- opts.quote = 1 --> quote it
- opts.quote = 0 --> do not quote it
- opts.quote = None --> guess
- """
- quote = opts.quote
-
- if opts.prefix:
- url = self.add_prefix(url, opts.prefix)
-
- parts = urlparse.urlparse(url)
- (scheme, host, path, parm, query, frag) = parts
-
- if not scheme or (len(scheme) == 1 and scheme in string.letters):
- # if a scheme isn't specified, we guess that it's "file:"
- if url[0] not in '/\\': url = os.path.abspath(url)
- url = 'file:' + urllib.pathname2url(url)
- parts = urlparse.urlparse(url)
- quote = 0 # pathname2url quotes, so we won't do it again
-
- if scheme in ['http', 'https']:
- parts = self.process_http(parts)
-
- if quote is None:
- quote = self.guess_should_quote(parts)
- if quote:
- parts = self.quote(parts)
-
- url = urlparse.urlunparse(parts)
- return url, parts
-
- def add_prefix(self, url, prefix):
- if prefix[-1] == '/' or url[0] == '/':
- url = prefix + url
- else:
- url = prefix + '/' + url
- return url
-
- def process_http(self, parts):
- (scheme, host, path, parm, query, frag) = parts
-
- if '@' in host and auth_handler:
- try:
- user_pass, host = host.split('@', 1)
-                if ':' in user_pass:
-                    user, password = user_pass.split(':', 1)
-                else:
-                    # a bare username would leave user/password unbound
-                    # in add_password below; default to an empty password
-                    user, password = user_pass, ''
-            except ValueError as e:
- raise URLGrabError(1, _('Bad URL: %s') % host)
- if DEBUG: DEBUG.info('adding HTTP auth: %s, XXXXXXXX', user)
- auth_handler.add_password(None, host, user, password)
-
- return (scheme, host, path, parm, query, frag)
-
- def quote(self, parts):
- """quote the URL
-
- This method quotes ONLY the path part. If you need to quote
- other parts, you should override this and pass in your derived
- class. The other alternative is to quote other parts before
- passing into urlgrabber.
- """
- (scheme, host, path, parm, query, frag) = parts
- path = urllib.quote(path)
- return (scheme, host, path, parm, query, frag)
-
- hexvals = '0123456789ABCDEF'
- def guess_should_quote(self, parts):
- """
- Guess whether we should quote a path. This amounts to
- guessing whether it's already quoted.
-
- find ' ' -> 1
- find '%' -> 1
- find '%XX' -> 0
- else -> 1
- """
- (scheme, host, path, parm, query, frag) = parts
- if ' ' in path:
- return 1
- ind = string.find(path, '%')
- if ind > -1:
- while ind > -1:
- if len(path) < ind+3:
- return 1
- code = path[ind+1:ind+3].upper()
- if code[0] not in self.hexvals or \
- code[1] not in self.hexvals:
- return 1
- ind = string.find(path, '%', ind+1)
- return 0
- return 1
-
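
The quoting heuristic above, illustrated doctest-style (only the path element
of the six-part tuple matters here):

    >>> p = URLParser()
    >>> p.guess_should_quote(('http', 'host', '/a path', '', '', ''))   # raw space
    1
    >>> p.guess_should_quote(('http', 'host', '/a%20path', '', '', '')) # already quoted
    0
    >>> p.guess_should_quote(('http', 'host', '/100%', '', '', ''))     # stray '%'
    1
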
-class URLGrabberOptions:
- """Class to ease kwargs handling."""
-
- def __init__(self, delegate=None, **kwargs):
- """Initialize URLGrabberOptions object.
- Set default values for all options and then update options specified
- in kwargs.
- """
- self.delegate = delegate
- if delegate is None:
- self._set_defaults()
- self._set_attributes(**kwargs)
-
- def __getattr__(self, name):
- if self.delegate and hasattr(self.delegate, name):
- return getattr(self.delegate, name)
- raise AttributeError (name)
-
- def raw_throttle(self):
- """Calculate raw throttle value from throttle and bandwidth
- values.
- """
- if self.throttle <= 0:
- return 0
- elif type(self.throttle) == type(0):
- return float(self.throttle)
- else: # throttle is a float
- return self.bandwidth * self.throttle
-
- def derive(self, **kwargs):
- """Create a derived URLGrabberOptions instance.
- This method creates a new instance and overrides the
- options specified in kwargs.
- """
- return URLGrabberOptions(delegate=self, **kwargs)
-
- def _set_attributes(self, **kwargs):
- """Update object attributes with those provided in kwargs."""
- self.__dict__.update(kwargs)
-        if have_range and 'range' in kwargs:
-            # normalize the supplied range value
-            self.range = range_tuple_normalize(self.range)
-        if self.reget not in [None, 'simple', 'check_timestamp']:
- raise URLGrabError(11, _('Illegal reget mode: %s') \
- % (self.reget, ))
-
- def _set_defaults(self):
- """Set all options to their default values.
- When adding new options, make sure a default is
- provided here.
- """
- self.progress_obj = None
- self.throttle = 1.0
- self.bandwidth = 0
- self.retry = None
- self.retrycodes = [-1,2,4,5,6,7]
- self.checkfunc = None
- self.copy_local = 0
- self.close_connection = 0
- self.range = None
- self.user_agent = 'urlgrabber/%s' % __version__
- self.keepalive = 1
- self.proxies = None
- self.reget = None
- self.failure_callback = None
- self.interrupt_callback = None
- self.prefix = None
- self.opener = None
- self.cache_openers = True
- self.timeout = None
- self.text = None
- self.http_headers = None
- self.ftp_headers = None
- self.data = None
- self.urlparser = URLParser()
- self.quote = None
- self.ssl_ca_cert = None
- self.ssl_context = None
-
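
The delegate chain above is what makes per-request overrides cheap: derive()
copies nothing, unset options fall through __getattr__ to the parent, and
later changes to the parent stay visible. A doctest-style sketch:

    >>> base = URLGrabberOptions(retry=3, timeout=30.0)
    >>> per_call = base.derive(timeout=5.0)
    >>> per_call.timeout            # overridden on the derived object
    5.0
    >>> per_call.retry              # falls through to the delegate
    3
    >>> base.retry = 5
    >>> per_call.retry              # parent changes show through
    5
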
-class URLGrabber:
- """Provides easy opening of URLs with a variety of options.
-
- All options are specified as kwargs. Options may be specified when
- the class is created and may be overridden on a per request basis.
-
- New objects inherit default values from default_grabber.
- """
-
- def __init__(self, **kwargs):
- self.opts = URLGrabberOptions(**kwargs)
-
- def _retry(self, opts, func, *args):
- tries = 0
- while 1:
- # there are only two ways out of this loop. The second has
- # several "sub-ways"
- # 1) via the return in the "try" block
- # 2) by some exception being raised
-            #    a) an exception is raised that we don't "except"
- # b) a callback raises ANY exception
- # c) we're not retry-ing or have run out of retries
- # d) the URLGrabError code is not in retrycodes
- # beware of infinite loops :)
- tries = tries + 1
- exception = None
- retrycode = None
- callback = None
- if DEBUG: DEBUG.info('attempt %i/%s: %s',
- tries, opts.retry, args[0])
- try:
-                r = func(opts, *args)
- if DEBUG: DEBUG.info('success')
- return r
- except URLGrabError as e:
- exception = e
- callback = opts.failure_callback
- retrycode = e.errno
- except KeyboardInterrupt as e:
- exception = e
- callback = opts.interrupt_callback
-
- if DEBUG: DEBUG.info('exception: %s', exception)
- if callback:
- if DEBUG: DEBUG.info('calling callback: %s', callback)
- cb_func, cb_args, cb_kwargs = self._make_callback(callback)
- obj = CallbackObject(exception=exception, url=args[0],
- tries=tries, retry=opts.retry)
- cb_func(obj, *cb_args, **cb_kwargs)
-
-            if (opts.retry is None) or (tries == opts.retry):
-                if DEBUG: DEBUG.info('retries exceeded, re-raising')
-                raise exception # re-raise the original error
-            if (retrycode is not None) and (retrycode not in opts.retrycodes):
-                if DEBUG: DEBUG.info('retrycode (%i) not in list %s, re-raising',
-                                     retrycode, opts.retrycodes)
-                raise exception
-
- def urlopen(self, url, **kwargs):
- """open the url and return a file object
- If a progress object or throttle value specified when this
- object was created, then a special file object will be
- returned that supports them. The file object can be treated
- like any other file object.
- """
- opts = self.opts.derive(**kwargs)
- (url,parts) = opts.urlparser.parse(url, opts)
- def retryfunc(opts, url):
- return URLGrabberFileObject(url, filename=None, opts=opts)
- return self._retry(opts, retryfunc, url)
-
- def urlgrab(self, url, filename=None, **kwargs):
- """grab the file at <url> and make a local copy at <filename>
-        If filename is None, the basename of the url is used.
- urlgrab returns the filename of the local file, which may be
- different from the passed-in filename if copy_local == 0.
- """
- opts = self.opts.derive(**kwargs)
- (url,parts) = opts.urlparser.parse(url, opts)
- (scheme, host, path, parm, query, frag) = parts
- if filename is None:
- filename = os.path.basename( urllib.unquote(path) )
- if scheme == 'file' and not opts.copy_local:
- # just return the name of the local file - don't make a
- # copy currently
- path = urllib.url2pathname(path)
- if host:
- path = os.path.normpath('//' + host + path)
- if not os.path.exists(path):
- raise URLGrabError(2,
- _('Local file does not exist: %s') % (path, ))
- elif not os.path.isfile(path):
- raise URLGrabError(3,
- _('Not a normal file: %s') % (path, ))
- elif not opts.range:
- return path
-
- def retryfunc(opts, url, filename):
- fo = URLGrabberFileObject(url, filename, opts)
- try:
- fo._do_grab()
-                if opts.checkfunc is not None:
-                    cb_func, cb_args, cb_kwargs = \
-                        self._make_callback(opts.checkfunc)
-                    obj = CallbackObject()
-                    obj.filename = filename
-                    obj.url = url
-                    cb_func(obj, *cb_args, **cb_kwargs)
- finally:
- fo.close()
- return filename
-
- return self._retry(opts, retryfunc, url, filename)
-
- def urlread(self, url, limit=None, **kwargs):
- """read the url into a string, up to 'limit' bytes
- If the limit is exceeded, an exception will be thrown. Note
- that urlread is NOT intended to be used as a way of saying
- "I want the first N bytes" but rather 'read the whole file
- into memory, but don't use too much'
- """
- opts = self.opts.derive(**kwargs)
- (url,parts) = opts.urlparser.parse(url, opts)
- if limit is not None:
- limit = limit + 1
-
- def retryfunc(opts, url, limit):
- fo = URLGrabberFileObject(url, filename=None, opts=opts)
- s = ''
- try:
- # this is an unfortunate thing. Some file-like objects
- # have a default "limit" of None, while the built-in (real)
- # file objects have -1. They each break the other, so for
- # now, we just force the default if necessary.
- if limit is None: s = fo.read()
- else: s = fo.read(limit)
-
-                if opts.checkfunc is not None:
-                    cb_func, cb_args, cb_kwargs = \
-                        self._make_callback(opts.checkfunc)
-                    obj = CallbackObject()
-                    obj.data = s
-                    obj.url = url
-                    cb_func(obj, *cb_args, **cb_kwargs)
- finally:
- fo.close()
- return s
-
- s = self._retry(opts, retryfunc, url, limit)
-        # limit was bumped by one above, so reading more than limit - 1
-        # bytes means the caller's limit was exceeded
-        if limit and len(s) > limit - 1:
-            raise URLGrabError(8,
-                       _('Exceeded limit (%i): %s') % (limit - 1, url))
- return s
-
- def _make_callback(self, callback_obj):
- if callable(callback_obj):
- return callback_obj, (), {}
- else:
- return callback_obj
-
-# create the default URLGrabber used by urlXXX functions.
-# NOTE: actual defaults are set in URLGrabberOptions
-default_grabber = URLGrabber()
-
-class URLGrabberFileObject:
- """This is a file-object wrapper that supports progress objects
- and throttling.
-
-    This exists to solve the following problem: let's say you want to
- drop-in replace a normal open with urlopen. You want to use a
- progress meter and/or throttling, but how do you do that without
- rewriting your code? Answer: urlopen will return a wrapped file
-    object that does the progress meter and/or throttling internally.
- """
-
- def __init__(self, url, filename, opts):
- self.url = url
- self.filename = filename
- self.opts = opts
- self.fo = None
- self._rbuf = ''
- self._rbufsize = 1024*8
- self._ttime = time.time()
- self._tsize = 0
- self._amount_read = 0
- self._opener = None
- self._do_open()
-
- def __getattr__(self, name):
- """This effectively allows us to wrap at the instance level.
- Any attribute not found in _this_ object will be searched for
- in self.fo. This includes methods."""
- if hasattr(self.fo, name):
- return getattr(self.fo, name)
- raise AttributeError (name)
-
- def _get_opener(self):
- """Build a urllib2 OpenerDirector based on request options."""
- if self.opts.opener:
- return self.opts.opener
- elif self._opener is None:
- handlers = []
- need_keepalive_handler = (have_keepalive and self.opts.keepalive)
- need_range_handler = (range_handlers and \
- (self.opts.range or self.opts.reget))
- # if you specify a ProxyHandler when creating the opener
- # it _must_ come before all other handlers in the list or urllib2
- # chokes.
- if self.opts.proxies:
- handlers.append( CachedProxyHandler(self.opts.proxies) )
-
- # -------------------------------------------------------
- # OK, these next few lines are a serious kludge to get
- # around what I think is a bug in python 2.2's
- # urllib2. The basic idea is that default handlers
- # get applied first. If you override one (like a
- # proxy handler), then the default gets pulled, but
- # the replacement goes on the end. In the case of
- # proxies, this means the normal handler picks it up
- # first and the proxy isn't used. Now, this probably
- # only happened with ftp or non-keepalive http, so not
- # many folks saw it. The simple approach to fixing it
- # is just to make sure you override the other
- # conflicting defaults as well. I would LOVE to see
-            # these go away or be dealt with more elegantly. The
- # problem isn't there after 2.2. -MDS 2005/02/24
- if not need_keepalive_handler:
- handlers.append( urllib2.HTTPHandler() )
- if not need_range_handler:
- handlers.append( urllib2.FTPHandler() )
- # -------------------------------------------------------
-
- ssl_factory = sslfactory.get_factory(self.opts.ssl_ca_cert,
- self.opts.ssl_context)
-
- if need_keepalive_handler:
- handlers.append(HTTPHandler())
- handlers.append(HTTPSHandler(ssl_factory))
- if need_range_handler:
- handlers.extend( range_handlers )
- handlers.append( auth_handler )
- if self.opts.cache_openers:
- self._opener = CachedOpenerDirector(ssl_factory, *handlers)
- else:
- self._opener = ssl_factory.create_opener(*handlers)
- # OK, I don't like to do this, but otherwise, we end up with
- # TWO user-agent headers.
- self._opener.addheaders = []
- return self._opener
-
- def _do_open(self):
- opener = self._get_opener()
-
- req = urllib2.Request(self.url, self.opts.data) # build request object
- self._add_headers(req) # add misc headers that we need
- self._build_range(req) # take care of reget and byterange stuff
-
- fo, hdr = self._make_request(req, opener)
- if self.reget_time and self.opts.reget == 'check_timestamp':
- # do this if we have a local file with known timestamp AND
- # we're in check_timestamp reget mode.
- fetch_again = 0
- try:
- modified_tuple = hdr.getdate_tz('last-modified')
- modified_stamp = rfc822.mktime_tz(modified_tuple)
- if modified_stamp > self.reget_time: fetch_again = 1
- except (TypeError,):
- fetch_again = 1
-
- if fetch_again:
- # the server version is newer than the (incomplete) local
- # version, so we should abandon the version we're getting
- # and fetch the whole thing again.
- fo.close()
- self.opts.reget = None
- del req.headers['Range']
- self._build_range(req)
- fo, hdr = self._make_request(req, opener)
-
- (scheme, host, path, parm, query, frag) = urlparse.urlparse(self.url)
- path = urllib.unquote(path)
- if not (self.opts.progress_obj or self.opts.raw_throttle() \
- or self.opts.timeout):
- # if we're not using the progress_obj, throttling, or timeout
- # we can get a performance boost by going directly to
- # the underlying fileobject for reads.
- self.read = fo.read
- if hasattr(fo, 'readline'):
- self.readline = fo.readline
- elif self.opts.progress_obj:
- try:
- length = int(hdr['Content-Length'])
- length = length + self._amount_read # Account for regets
- except (KeyError, ValueError, TypeError):
- length = None
-
- self.opts.progress_obj.start(str(self.filename),
- urllib.unquote(self.url),
- os.path.basename(path),
- length, text=self.opts.text)
- self.opts.progress_obj.update(0)
- (self.fo, self.hdr) = (fo, hdr)
-
- def _add_headers(self, req):
- if self.opts.user_agent:
- req.add_header('User-agent', self.opts.user_agent)
- try: req_type = req.get_type()
- except ValueError: req_type = None
- if self.opts.http_headers and req_type in ('http', 'https'):
- for h, v in self.opts.http_headers:
- req.add_header(h, v)
- if self.opts.ftp_headers and req_type == 'ftp':
- for h, v in self.opts.ftp_headers:
- req.add_header(h, v)
-
- def _build_range(self, req):
- self.reget_time = None
- self.append = 0
- reget_length = 0
- rt = None
- if have_range and self.opts.reget and type(self.filename) == type(''):
- # we have reget turned on and we're dumping to a file
- try:
- s = os.stat(self.filename)
- except OSError:
- pass
- else:
- self.reget_time = s[ST_MTIME]
- reget_length = s[ST_SIZE]
-
- # Set initial length when regetting
- self._amount_read = reget_length
-
- rt = reget_length, ''
- self.append = 1
-
- if self.opts.range:
- if not have_range:
- raise URLGrabError(10, _('Byte range requested but range '\
- 'support unavailable'))
- rt = self.opts.range
- if rt[0]: rt = (rt[0] + reget_length, rt[1])
-
- if rt:
- header = range_tuple_to_header(rt)
- if header: req.add_header('Range', header)
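-
- # Worked example (illustrative, not from the original source): with a
- # 1000-byte partial local file and reget enabled, the code above builds
- # rt = (1000, '') and range_tuple_to_header() renders it as the header
- # "Range: bytes=1000-", so the server resumes at byte 1000. If the
- # caller also passed range=(500, 2000), the start is shifted by the
- # regotten amount: rt = (1500, 2000) -> "Range: bytes=1500-1999".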
-
- def _make_request(self, req, opener):
- try:
- if have_socket_timeout and self.opts.timeout:
- old_to = socket.getdefaulttimeout()
- socket.setdefaulttimeout(self.opts.timeout)
- try:
- fo = opener.open(req)
- finally:
- socket.setdefaulttimeout(old_to)
- else:
- fo = opener.open(req)
- hdr = fo.info()
- except ValueError as e:
- raise URLGrabError(1, _('Bad URL: %s') % (e, ))
- except RangeError as e:
- raise URLGrabError(9, str(e))
- except urllib2.HTTPError as e:
- new_e = URLGrabError(14, str(e))
- new_e.code = e.code
- new_e.exception = e
- raise new_e
- except IOError as e:
- if hasattr(e, 'reason') and have_socket_timeout and \
- isinstance(e.reason, TimeoutError):
- raise URLGrabError(12, _('Timeout: %s') % (e, ))
- else:
- raise URLGrabError(4, _('IOError: %s') % (e, ))
- except OSError as e:
- raise URLGrabError(5, _('OSError: %s') % (e, ))
- except HTTPException as e:
- raise URLGrabError(7, _('HTTP Exception (%s): %s') % \
- (e.__class__.__name__, e))
- else:
- return (fo, hdr)
-
- def _do_grab(self):
- """dump the file to self.filename."""
- if self.append: new_fo = open(self.filename, 'ab')
- else: new_fo = open(self.filename, 'wb')
- bs = 1024*8
- size = 0
-
- block = self.read(bs)
- size = size + len(block)
- while block:
- new_fo.write(block)
- block = self.read(bs)
- size = size + len(block)
-
- new_fo.close()
- try:
- modified_tuple = self.hdr.getdate_tz('last-modified')
- modified_stamp = rfc822.mktime_tz(modified_tuple)
- os.utime(self.filename, (modified_stamp, modified_stamp))
- except TypeError: pass
-
- return size
-
- def _fill_buffer(self, amt=None):
- """fill the buffer to contain at least 'amt' bytes by reading
- from the underlying file object. If amt is None, then it will
- read until it gets nothing more. It updates the progress meter
- and throttles after every self._rbufsize bytes."""
- # the _rbuf test is only in this first 'if' for speed. It's not
- # logically necessary
- if self._rbuf and not amt is None:
- L = len(self._rbuf)
- if amt > L:
- amt = amt - L
- else:
- return
-
- # if we've made it here, then we don't have enough in the buffer
- # and we need to read more.
-
- buf = [self._rbuf]
- bufsize = len(self._rbuf)
- while amt is None or amt:
- # first, delay if necessary for throttling reasons
- if self.opts.raw_throttle():
- diff = self._tsize/self.opts.raw_throttle() - \
- (time.time() - self._ttime)
- if diff > 0: time.sleep(diff)
- self._ttime = time.time()
-
- # now read some data, up to self._rbufsize
- if amt is None: readamount = self._rbufsize
- else: readamount = min(amt, self._rbufsize)
- try:
- new = self.fo.read(readamount)
- except socket.error as e:
- raise URLGrabError(4, _('Socket Error: %s') % (e, ))
- except TimeoutError as e:
- raise URLGrabError(12, _('Timeout: %s') % (e, ))
- except IOError as e:
- raise URLGrabError(4, _('IOError: %s') %(e,))
- newsize = len(new)
- if not newsize: break # no more to read
-
- if amt: amt = amt - newsize
- buf.append(new)
- bufsize = bufsize + newsize
- self._tsize = newsize
- self._amount_read = self._amount_read + newsize
- if self.opts.progress_obj:
- self.opts.progress_obj.update(self._amount_read)
-
- self._rbuf = string.join(buf, '')
- return
-
- def read(self, amt=None): #pylint: disable=method-hidden
- self._fill_buffer(amt)
- if amt is None:
- s, self._rbuf = self._rbuf, ''
- else:
- s, self._rbuf = self._rbuf[:amt], self._rbuf[amt:]
- return s
-
- def readline(self, limit=-1): #pylint: disable=method-hidden
- i = string.find(self._rbuf, '\n')
- while i < 0 and not (0 < limit <= len(self._rbuf)):
- L = len(self._rbuf)
- self._fill_buffer(L + self._rbufsize)
- if not len(self._rbuf) > L: break
- i = string.find(self._rbuf, '\n', L)
-
- if i < 0: i = len(self._rbuf)
- else: i = i+1
- if 0 <= limit < len(self._rbuf): i = limit
-
- s, self._rbuf = self._rbuf[:i], self._rbuf[i:]
- return s
-
- def close(self):
- if self.opts.progress_obj:
- self.opts.progress_obj.end(self._amount_read)
- self.fo.close()
- if self.opts.close_connection:
- try: self.fo.close_connection()
- except: pass
-
-_handler_cache = []
-def CachedOpenerDirector(ssl_factory = None, *handlers):
- for (cached_handlers, opener) in _handler_cache:
- if cached_handlers == handlers:
- for handler in opener.handlers:
- handler.add_parent(opener)
- return opener
- if not ssl_factory:
- ssl_factory = sslfactory.get_factory()
- opener = ssl_factory.create_opener(*handlers)
- _handler_cache.append( (handlers, opener) )
- return opener
-
-_proxy_cache = []
-def CachedProxyHandler(proxies):
- for (pdict, handler) in _proxy_cache:
- if pdict == proxies:
- if DEBUG: DEBUG.debug('re-using proxy settings: %s', proxies)
- break
- else:
- for k, v in proxies.items():
- utype, url = urllib.splittype(v)
- host, other = urllib.splithost(url)
- if (utype is None) or (host is None):
- raise URLGrabError(13, _('Bad proxy URL: %s') % v)
-
- if DEBUG: DEBUG.info('creating new proxy handler: %s', proxies)
- handler = urllib2.ProxyHandler(proxies)
- _proxy_cache.append( (proxies, handler) )
- return handler
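-
-# Illustrative usage (hypothetical proxy URL, not from the original
-# source): the proxies dict uses the same scheme-to-proxy-URL mapping as
-# urllib2.ProxyHandler, and one handler is cached per distinct dict:
-#
-# CachedProxyHandler({'http': 'http://proxy.example.com:3128/'})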
-
-#####################################################################
-# DEPRECATED FUNCTIONS
-def set_throttle(new_throttle):
- """Deprecated. Use: default_grabber.throttle = new_throttle"""
- default_grabber.throttle = new_throttle
-
-def set_bandwidth(new_bandwidth):
- """Deprecated. Use: default_grabber.bandwidth = new_bandwidth"""
- default_grabber.bandwidth = new_bandwidth
-
-def set_progress_obj(new_progress_obj):
- """Deprecated. Use: default_grabber.progress_obj = new_progress_obj"""
- default_grabber.progress_obj = new_progress_obj
-
-def set_user_agent(new_user_agent):
- """Deprecated. Use: default_grabber.user_agent = new_user_agent"""
- default_grabber.user_agent = new_user_agent
-
-def retrygrab(url, filename=None, copy_local=0, close_connection=0,
- progress_obj=None, throttle=None, bandwidth=None,
- numtries=3, retrycodes=[-1,2,4,5,6,7], checkfunc=None):
- """Deprecated. Use: urlgrab() with the retry arg instead"""
- kwargs = {'copy_local' : copy_local,
- 'close_connection' : close_connection,
- 'progress_obj' : progress_obj,
- 'throttle' : throttle,
- 'bandwidth' : bandwidth,
- 'retry' : numtries,
- 'retrycodes' : retrycodes,
- 'checkfunc' : checkfunc
- }
- return urlgrab(url, filename, **kwargs)
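-
-# Illustrative equivalence (a sketch, not from the original source):
-# given the kwargs mapping above, a call such as
-# retrygrab(url, filename, numtries=5, checkfunc=my_check)
-# behaves like the non-deprecated form
-# urlgrab(url, filename, retry=5, retrycodes=[-1,2,4,5,6,7], checkfunc=my_check)
-# where my_check is a hypothetical checkfunc callback.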
-
-
-#####################################################################
-# TESTING
-def _main_test():
- import sys
- try:
- #url, filename = sys.argv[1:3]
- url = sys.argv[1]
- filename = sys.argv[2]
- except IndexError:
- print( 'usage:', sys.argv[0], \
- '<url> <filename> [copy_local=0|1] [close_connection=0|1]')
- sys.exit()
-
- kwargs = {}
- for a in sys.argv[3:]:
- k, v = string.split(a, '=', 1)
- kwargs[k] = int(v)
-
- set_throttle(1.0)
- set_bandwidth(32 * 1024)
- print( "throttle: %s, throttle bandwidth: %s B/s" % (default_grabber.throttle,
- default_grabber.bandwidth))
-
- try: from progress import text_progress_meter
- except ImportError as e: pass
- else: kwargs['progress_obj'] = text_progress_meter()
-
- try: name = urlgrab(url, filename, **kwargs)
- except URLGrabError as e: print( e)
- else: print( 'LOCAL FILE:', name)
-
-
-def _retry_test():
- import sys
- try:
- # url, filename = sys.argv[1:3] (split up to avoid pylint unbalanced-tuple-unpacking)
- url = sys.argv[1]
- filename = sys.argv[2]
- except IndexError:
- print( 'usage:', sys.argv[0], \
- '<url> <filename> [copy_local=0|1] [close_connection=0|1]')
- sys.exit()
-
- kwargs = {}
- for a in sys.argv[3:]:
- k, v = string.split(a, '=', 1)
- kwargs[k] = int(v)
-
- try: from progress import text_progress_meter
- except ImportError as e: pass
- else: kwargs['progress_obj'] = text_progress_meter()
-
- def cfunc(filename, hello, there='foo'):
- print( hello, there)
- import random
- rnum = random.random()
- if rnum < .5:
- print( 'forcing retry')
- raise URLGrabError(-1, 'forcing retry')
- if rnum < .75:
- print( 'forcing failure')
- raise URLGrabError(-2, 'forcing immediate failure')
- print( 'success')
- return
-
- kwargs['checkfunc'] = (cfunc, ('hello',), {'there':'there'})
- try: name = retrygrab(url, filename, **kwargs)
- except URLGrabError as e: print( e)
- else: print( 'LOCAL FILE:', name)
-
-def _file_object_test(filename=None):
- import random, cStringIO, sys
- if filename is None:
- filename = __file__
- print( 'using file "%s" for comparisons' % filename)
- fo = open(filename)
- s_input = fo.read()
- fo.close()
-
- for testfunc in [_test_file_object_smallread,
- _test_file_object_readall,
- _test_file_object_readline,
- _test_file_object_readlines]:
- fo_input = cStringIO.StringIO(s_input)
- fo_output = cStringIO.StringIO()
- wrapper = URLGrabberFileObject(fo_input, None, 0)
- sys.stdout.write('testing %-30s ' % testfunc.__name__)
- testfunc(wrapper, fo_output)
- s_output = fo_output.getvalue()
- if s_output == s_input: print( 'passed')
- else: print( 'FAILED')
-
-def _test_file_object_smallread(wrapper, fo_output):
- while 1:
- s = wrapper.read(23)
- fo_output.write(s)
- if not s: return
-
-def _test_file_object_readall(wrapper, fo_output):
- s = wrapper.read()
- fo_output.write(s)
-
-def _test_file_object_readline(wrapper, fo_output):
- while 1:
- s = wrapper.readline()
- fo_output.write(s)
- if not s: return
-
-def _test_file_object_readlines(wrapper, fo_output):
- li = wrapper.readlines()
- fo_output.write(string.join(li, ''))
-
-if __name__ == '__main__':
- _main_test()
- _retry_test()
- _file_object_test('test')
+++ /dev/null
-# This library is free software; you can redistribute it and/or
-# modify it under the terms of the GNU Lesser General Public
-# License as published by the Free Software Foundation; either
-# version 2.1 of the License, or (at your option) any later version.
-#
-# This library is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-# Lesser General Public License for more details.
-#
-# You should have received a copy of the GNU Lesser General Public
-# License along with this library; if not, write to the
-# Free Software Foundation, Inc.,
-# 59 Temple Place, Suite 330,
-# Boston, MA 02111-1307 USA
-
-# This file is part of urlgrabber, a high-level cross-protocol url-grabber
-# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
-
-"""An HTTP handler for urllib2 that supports HTTP 1.1 and keepalive.
-
->>> import urllib2
->>> from keepalive import HTTPHandler
->>> keepalive_handler = HTTPHandler()
->>> opener = urllib2.build_opener(keepalive_handler)
->>> urllib2.install_opener(opener)
->>>
->>> fo = urllib2.urlopen('http://www.python.org')
-
-If a connection to a given host is requested, and all of the existing
-connections are still in use, another connection will be opened. If
-the handler tries to use an existing connection but it fails in some
-way, it will be closed and removed from the pool.
-
-To remove the handler, simply re-run build_opener with no arguments, and
-install that opener.
-
-You can explicitly close connections by using the close_connection()
-method of the returned file-like object (described below) or you can
-use the handler methods:
-
- close_connection(host)
- close_all()
- open_connections()
-
-NOTE: using the close_connection and close_all methods of the handler
-should be done with care when using multiple threads.
- * there is nothing that prevents another thread from creating new
- connections immediately after connections are closed
- * no checks are done to prevent in-use connections from being closed
-
->>> keepalive_handler.close_all()
-
-EXTRA ATTRIBUTES AND METHODS
-
- Upon a status of 200, the object returned has a few additional
- attributes and methods, which should not be used if you want to
- remain consistent with the normal urllib2-returned objects:
-
- close_connection() - close the connection to the host
- readlines() - you know, readlines()
- status - the return status (ie 404)
- reason - english translation of status (ie 'File not found')
-
- If you want the best of both worlds, use this inside an
- AttributeError-catching try:
-
- >>> try: status = fo.status
- ... except AttributeError: status = None
-
- Unfortunately, these are ONLY there if status == 200, so it's not
- easy to distinguish between non-200 responses. The reason is that
- urllib2 tries to do clever things with error codes 301, 302, 401,
- and 407, and it wraps the object upon return.
-
- For python versions earlier than 2.4, you can avoid this fancy error
- handling by setting the module-level global HANDLE_ERRORS to zero.
- You see, prior to 2.4, it's the HTTP Handler's job to determine what
- to handle specially, and what to just pass up. HANDLE_ERRORS == 0
- means "pass everything up". In python 2.4, however, this job no
- longer belongs to the HTTP Handler and is now done by a NEW handler,
- HTTPErrorProcessor. Here's the bottom line:
-
- python version < 2.4
- HANDLE_ERRORS == 1 (default) pass up 200, treat the rest as
- errors
- HANDLE_ERRORS == 0 pass everything up, error processing is
- left to the calling code
- python version >= 2.4
- HANDLE_ERRORS == 1 pass up 200, treat the rest as errors
- HANDLE_ERRORS == 0 (default) pass everything up, let the
- other handlers (specifically,
- HTTPErrorProcessor) decide what to do
-
- In practice, setting the variable either way makes little difference
- in python 2.4, so for the most consistent behavior across versions,
- you probably just want to use the defaults, which will give you
- exceptions on errors.
-
-"""
-
-# $Id: keepalive.py,v 1.16 2006/09/22 00:58:05 mstenner Exp $
-
-import urllib2
-import httplib
-import socket
-import thread
-
-DEBUG = None
-
-import sslfactory
-
-import sys
-if sys.version_info < (2, 4): HANDLE_ERRORS = 1
-else: HANDLE_ERRORS = 0
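-
-# Illustrative sketch (hypothetical helper, not part of the original
-# module): install a keepalive opener, fetch a URL twice over one pooled
-# connection, then inspect and tear down the pool.
-def _example_pool_usage(url):
- handler = HTTPHandler()
- urllib2.install_opener(urllib2.build_opener(handler))
- urllib2.urlopen(url).read()
- urllib2.urlopen(url).read()
- print(handler.open_connections()) # e.g. [('www.python.org', 1)]
- handler.close_all()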
-
-class ConnectionManager:
- """
- The connection manager must be able to:
- * keep track of all existing connections
- * map each connection to its host
- * track whether each connection is ready for re-use
- """
- def __init__(self):
- self._lock = thread.allocate_lock()
- self._hostmap = {} # map hosts to a list of connections
- self._connmap = {} # map connections to host
- self._readymap = {} # map connection to ready state
-
- def add(self, host, connection, ready):
- self._lock.acquire()
- try:
- if not self._hostmap.has_key(host): self._hostmap[host] = []
- self._hostmap[host].append(connection)
- self._connmap[connection] = host
- self._readymap[connection] = ready
- finally:
- self._lock.release()
-
- def remove(self, connection):
- self._lock.acquire()
- try:
- try:
- host = self._connmap[connection]
- except KeyError:
- pass
- else:
- del self._connmap[connection]
- del self._readymap[connection]
- self._hostmap[host].remove(connection)
- if not self._hostmap[host]: del self._hostmap[host]
- finally:
- self._lock.release()
-
- def set_ready(self, connection, ready):
- try: self._readymap[connection] = ready
- except KeyError: pass
-
- def get_ready_conn(self, host):
- conn = None
- self._lock.acquire()
- try:
- if self._hostmap.has_key(host):
- for c in self._hostmap[host]:
- if self._readymap[c]:
- self._readymap[c] = 0
- conn = c
- break
- finally:
- self._lock.release()
- return conn
-
- def get_all(self, host=None):
- if host:
- return list(self._hostmap.get(host, []))
- else:
- return dict(self._hostmap)
-
-class KeepAliveHandler:
- def __init__(self):
- self._cm = ConnectionManager()
-
- #### Connection Management
- def open_connections(self):
- """return a list of connected hosts and the number of connections
- to each. [('foo.com:80', 2), ('bar.org', 1)]"""
- return [(host, len(li)) for (host, li) in self._cm.get_all().items()]
-
- def close_connection(self, host):
- """close connection(s) to <host>
- host is the host:port spec, as in 'www.cnn.com:8080' as passed in.
- no error occurs if there is no connection to that host."""
- for h in self._cm.get_all(host):
- self._cm.remove(h)
- h.close()
-
- def close_all(self):
- """close all open connections"""
- for host, conns in self._cm.get_all().items():
- for h in conns:
- self._cm.remove(h)
- h.close()
-
- def _request_closed(self, request, host, connection):
- """tells us that this request is now closed and the the
- connection is ready for another request"""
- self._cm.set_ready(connection, 1)
-
- def _remove_connection(self, host, connection, close=0):
- if close: connection.close()
- self._cm.remove(connection)
-
- #### Transaction Execution
- def do_open(self, req):
- host = req.get_host()
- if not host:
- raise urllib2.URLError('no host given')
-
- try:
- h = self._cm.get_ready_conn(host)
- while h:
- r = self._reuse_connection(h, req, host)
-
- # if this response is non-None, then it worked and we're
- # done. Break out, skipping the else block.
- if r: break
-
- # connection is bad - possibly closed by server
- # discard it and ask for the next free connection
- h.close()
- self._cm.remove(h)
- h = self._cm.get_ready_conn(host)
- else:
- # no (working) free connections were found. Create a new one.
- h = self._get_connection(host)
- if DEBUG: DEBUG.info("creating new connection to %s (%d)",
- host, id(h))
- self._cm.add(host, h, 0)
- self._start_transaction(h, req)
- r = h.getresponse()
- except (socket.error, httplib.HTTPException) as err:
- raise urllib2.URLError(err)
-
- # if not a persistent connection, don't try to reuse it
- if r.will_close: self._cm.remove(h)
-
- if DEBUG: DEBUG.info("STATUS: %s, %s", r.status, r.reason)
- r._handler = self
- r._host = host
- r._url = req.get_full_url()
- r._connection = h
- r.code = r.status
- r.headers = r.msg
- r.msg = r.reason
-
- if r.status == 200 or not HANDLE_ERRORS:
- return r
- else:
- return self.parent.error('http', req, r,
- r.status, r.msg, r.headers)
-
- def _reuse_connection(self, h, req, host):
- """start the transaction with a re-used connection
- return a response object (r) upon success or None on failure.
- This does NOT close or remove bad connections in cases where
- it returns. However, if an unexpected exception occurs, it
- will close and remove the connection before re-raising.
- """
- try:
- self._start_transaction(h, req)
- r = h.getresponse()
- # note: just because we got something back doesn't mean it
- # worked. We'll check the version below, too.
- except (socket.error, httplib.HTTPException):
- r = None
- except:
- # adding this block just in case we've missed
- # something we will still raise the exception, but
- # lets try and close the connection and remove it
- # first. We previously got into a nasty loop
- # where an exception was uncaught, and so the
- # connection stayed open. On the next try, the
- # same exception was raised, etc. The tradeoff is
- # that it's now possible this call will raise
- # a DIFFERENT exception
- if DEBUG: DEBUG.error("unexpected exception - closing " + \
- "connection to %s (%d)", host, id(h))
- self._cm.remove(h)
- h.close()
- raise
-
- if r is None or r.version == 9:
- # httplib falls back to assuming HTTP 0.9 if it gets a
- # bad header back. This is most likely to happen if
- # the socket has been closed by the server since we
- # last used the connection.
- if DEBUG: DEBUG.info("failed to re-use connection to %s (%d)",
- host, id(h))
- r = None
- else:
- if DEBUG: DEBUG.info("re-using connection to %s (%d)", host, id(h))
-
- return r
-
- def _start_transaction(self, h, req):
- try:
- if req.has_data():
- data = req.get_data()
- h.putrequest('POST', req.get_selector())
- if not req.headers.has_key('Content-type'):
- h.putheader('Content-type',
- 'application/x-www-form-urlencoded')
- if not req.headers.has_key('Content-length'):
- h.putheader('Content-length', '%d' % len(data))
- else:
- h.putrequest('GET', req.get_selector())
- except (socket.error, httplib.HTTPException) as err:
- raise urllib2.URLError(err)
-
- for args in self.parent.addheaders:
- h.putheader(*args)
- for k, v in req.headers.items():
- h.putheader(k, v)
- h.endheaders()
- if req.has_data():
- h.send(data)
-
- def _get_connection(self, host):
- raise NotImplementedError()
-
-class HTTPHandler(KeepAliveHandler, urllib2.HTTPHandler):
- def __init__(self):
- KeepAliveHandler.__init__(self)
-
- def http_open(self, req):
- return self.do_open(req)
-
- def _get_connection(self, host):
- return HTTPConnection(host)
-
-class HTTPSHandler(KeepAliveHandler, urllib2.HTTPSHandler):
- def __init__(self, ssl_factory=None):
- KeepAliveHandler.__init__(self)
- if not ssl_factory:
- ssl_factory = sslfactory.get_factory()
- self._ssl_factory = ssl_factory
-
- def https_open(self, req):
- return self.do_open(req)
-
- def _get_connection(self, host):
- return self._ssl_factory.get_https_connection(host)
-
-class HTTPResponse(httplib.HTTPResponse):
- # we need to subclass HTTPResponse in order to
- # 1) add readline() and readlines() methods
- # 2) add close_connection() methods
- # 3) add info() and geturl() methods
-
- # in order to add readline(), read must be modified to deal with a
- # buffer. example: readline must read a buffer and then spit back
- # one line at a time. The only real alternative is to read one
- # BYTE at a time (ick). Once something has been read, it can't be
- # put back (ok, maybe it can, but that's even uglier than this),
- # so if you THEN do a normal read, you must first take stuff from
- # the buffer.
-
- # the read method wraps the original to accommodate buffering,
- # although read() never adds to the buffer.
- # Both readline and readlines have been stolen with almost no
- # modification from socket.py
-
-
- def __init__(self, sock, debuglevel=0, strict=0, method=None):
- if method: # the httplib in python 2.3 uses the method arg
- httplib.HTTPResponse.__init__(self, sock, debuglevel, method)
- else: # 2.2 doesn't
- httplib.HTTPResponse.__init__(self, sock, debuglevel)
- self.fileno = sock.fileno
- self.code = None
- self._rbuf = ''
- self._rbufsize = 8096
- self._handler = None # inserted by the handler later
- self._host = None # (same)
- self._url = None # (same)
- self._connection = None # (same)
-
- _raw_read = httplib.HTTPResponse.read
-
- def close(self):
- if self.fp:
- self.fp.close()
- self.fp = None
- if self._handler:
- self._handler._request_closed(self, self._host,
- self._connection)
-
- def close_connection(self):
- self._handler._remove_connection(self._host, self._connection, close=1)
- self.close()
-
- def info(self):
- return self.headers
-
- def geturl(self):
- return self._url
-
- def read(self, amt=None):
- # the _rbuf test is only in this first if for speed. It's not
- # logically necessary
- if self._rbuf and not amt is None:
- L = len(self._rbuf)
- if amt > L:
- amt -= L
- else:
- s = self._rbuf[:amt]
- self._rbuf = self._rbuf[amt:]
- return s
-
- s = self._rbuf + self._raw_read(amt)
- self._rbuf = ''
- return s
-
- def readline(self, limit=-1):
- data = ""
- i = self._rbuf.find('\n')
- while i < 0 and not (0 < limit <= len(self._rbuf)):
- new = self._raw_read(self._rbufsize)
- if not new: break
- i = new.find('\n')
- if i >= 0: i = i + len(self._rbuf)
- self._rbuf = self._rbuf + new
- if i < 0: i = len(self._rbuf)
- else: i = i+1
- if 0 <= limit < len(self._rbuf): i = limit
- data, self._rbuf = self._rbuf[:i], self._rbuf[i:]
- return data
-
- def readlines(self, sizehint = 0):
- total = 0
- lines = []
- while 1:
- line = self.readline()
- if not line: break
- lines.append(line)
- total += len(line)
- if sizehint and total >= sizehint:
- break
- return lines
-
-
-class HTTPConnection(httplib.HTTPConnection):
- # use the modified response class
- response_class = HTTPResponse
-
-class HTTPSConnection(httplib.HTTPSConnection):
- response_class = HTTPResponse
-
-#########################################################################
-##### TEST FUNCTIONS
-#########################################################################
-
-def error_handler(url):
- global HANDLE_ERRORS
- orig = HANDLE_ERRORS
- keepalive_handler = HTTPHandler()
- opener = urllib2.build_opener(keepalive_handler)
- urllib2.install_opener(opener)
- pos = {0: 'off', 1: 'on'}
- for i in (0, 1):
- print (" fancy error handling %s (HANDLE_ERRORS = %i)" % (pos[i], i))
- HANDLE_ERRORS = i
- try:
- fo = urllib2.urlopen(url)
- foo = fo.read()
- fo.close()
- try: status, reason = fo.status, fo.reason
- except AttributeError: status, reason = None, None
- except IOError as e:
- print (" EXCEPTION: %s" % e)
- raise
- else:
- print (" status = %s, reason = %s" % (status, reason))
- HANDLE_ERRORS = orig
- hosts = keepalive_handler.open_connections()
- print( "open connections:", hosts)
- keepalive_handler.close_all()
-
-def continuity(url):
- import md5
- format = '%25s: %s'
-
- # first fetch the file with the normal http handler
- opener = urllib2.build_opener()
- urllib2.install_opener(opener)
- fo = urllib2.urlopen(url)
- foo = fo.read()
- fo.close()
- m = md5.new(foo)
- print(format % ('normal urllib', m.hexdigest()))
-
- # now install the keepalive handler and try again
- opener = urllib2.build_opener(HTTPHandler())
- urllib2.install_opener(opener)
-
- fo = urllib2.urlopen(url)
- foo = fo.read()
- fo.close()
- m = md5.new(foo)
- print(format % ('keepalive read', m.hexdigest()))
-
- fo = urllib2.urlopen(url)
- foo = ''
- while 1:
- f = fo.readline()
- if f: foo = foo + f
- else: break
- fo.close()
- m = md5.new(foo)
- print( format % ('keepalive readline', m.hexdigest()))
-
-def comp(N, url):
- print( ' making %i connections to:\n %s' % (N, url))
-
- sys.stdout.write(' first using the normal urllib handlers')
- # first use normal opener
- opener = urllib2.build_opener()
- urllib2.install_opener(opener)
- t1 = fetch(N, url)
- print( ' TIME: %.3f s' % t1)
-
- sys.stdout.write(' now using the keepalive handler ')
- # now install the keepalive handler and try again
- opener = urllib2.build_opener(HTTPHandler())
- urllib2.install_opener(opener)
- t2 = fetch(N, url)
- print (' TIME: %.3f s' % t2)
- print( ' improvement factor: %.2f' % (t1/t2, ))
-
-def fetch(N, url, delay=0):
- import time
- lens = []
- starttime = time.time()
- for i in range(N):
- if delay and i > 0: time.sleep(delay)
- fo = urllib2.urlopen(url)
- foo = fo.read()
- fo.close()
- lens.append(len(foo))
- diff = time.time() - starttime
-
- j = 0
- for i in lens[1:]:
- j = j + 1
- if not i == lens[0]:
- print( "WARNING: inconsistent length on read %i: %i" % (j, i))
-
- return diff
-
-def test_timeout(url):
- global DEBUG
- dbbackup = DEBUG
- class FakeLogger:
- def debug(self, msg, *args): print( msg % args)
- info = warning = error = debug
- DEBUG = FakeLogger()
- print( " fetching the file to establish a connection")
- fo = urllib2.urlopen(url)
- data1 = fo.read()
- fo.close()
-
- i = 20
- print( " waiting %i seconds for the server to close the connection" % i)
- while i > 0:
- sys.stdout.write('\r %2i' % i)
- sys.stdout.flush()
- time.sleep(1)
- i -= 1
- sys.stderr.write('\r')
-
- print( " fetching the file a second time")
- fo = urllib2.urlopen(url)
- data2 = fo.read()
- fo.close()
-
- if data1 == data2:
- print( ' data are identical')
- else:
- print( ' ERROR: DATA DIFFER')
-
- DEBUG = dbbackup
-
-
-def test(url, N=10):
- print( "checking error hander (do this on a non-200)")
- try: error_handler(url)
- except IOError as e:
- print( "exiting - exception will prevent further tests")
- sys.exit()
- print()
- print( "performing continuity test (making sure stuff isn't corrupted)")
- continuity(url)
- print()
- print( "performing speed comparison")
- comp(N, url)
- print()
- print( "performing dropped-connection check")
- test_timeout(url)
-
-if __name__ == '__main__':
- import time
- import sys
- try:
- N = int(sys.argv[1])
- url = sys.argv[2]
- except (IndexError, ValueError):
- print( "%s <integer> <url>" % sys.argv[0])
- else:
- test(url, N)
+++ /dev/null
-# This library is free software; you can redistribute it and/or
-# modify it under the terms of the GNU Lesser General Public
-# License as published by the Free Software Foundation; either
-# version 2.1 of the License, or (at your option) any later version.
-#
-# This library is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-# Lesser General Public License for more details.
-#
-# You should have received a copy of the GNU Lesser General Public
-# License along with this library; if not, write to the
-# Free Software Foundation, Inc.,
-# 59 Temple Place, Suite 330,
-# Boston, MA 02111-1307 USA
-
-# This file is part of urlgrabber, a high-level cross-protocol url-grabber
-# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
-
-"""Module for downloading files from a pool of mirrors
-
-DESCRIPTION
-
- This module provides support for downloading files from a pool of
- mirrors with configurable failover policies. To a large extent, the
- failover policy is chosen by using different classes derived from
- the main class, MirrorGroup.
-
- Instances of MirrorGroup (and cousins) act very much like URLGrabber
- instances in that they have urlread, urlgrab, and urlopen methods.
- They can, therefore, be used in very similar ways.
-
- from urlgrabber.grabber import URLGrabber
- from urlgrabber.mirror import MirrorGroup
- gr = URLGrabber()
- mg = MirrorGroup(gr, ['http://foo.com/some/directory/',
- 'http://bar.org/maybe/somewhere/else/',
- 'ftp://baz.net/some/other/place/entirely/'])
- mg.urlgrab('relative/path.zip')
-
- The assumption is that all mirrors are identical AFTER the base urls
- specified, so that any mirror can be used to fetch any file.
-
-FAILOVER
-
- The failover mechanism is designed to be customized by subclassing
- from MirrorGroup to change the details of the behavior. In general,
- the classes maintain a master mirror list and a "current mirror"
- index. When a download is initiated, a copy of this list and index
- is created for that download only. The specific failover policy
- depends on the class used, and so is documented in the class
- documentation. Note that ANY behavior of the class can be
- overridden, so any failover policy at all is possible (although
- you may need to change the interface in extreme cases).
-
-CUSTOMIZATION
-
- Most customization of a MirrorGroup object is done at instantiation
- time (or via subclassing). There are four major types of
- customization:
-
- 1) Pass in a custom urlgrabber - The passed in urlgrabber will be
- used (by default... see #2) for the grabs, so options to it
- apply for the url-fetching
-
- 2) Custom mirror list - Mirror lists can simply be a list of
- mirror strings (as shown in the example above), but each can
- also be a dict, allowing for more options. For example, the
- first mirror in the list above could also have been:
-
- {'mirror': 'http://foo.com/some/directory/',
- 'grabber': <a custom grabber to be used for this mirror>,
- 'kwargs': { <a dict of arguments passed to the grabber> }}
-
- All mirrors are converted to this format internally. If
- 'grabber' is omitted, the default grabber will be used. If
- kwargs are omitted, then (duh) they will not be used.
-
- 3) Pass keyword arguments when instantiating the mirror group.
- See, for example, the failure_callback argument.
-
- 4) Finally, any kwargs passed in for the specific file (to the
- urlgrab method, for example) will be folded in. The options
- passed into the grabber's urlXXX methods will override any
- options specified in a custom mirror dict.
-
-"""
-
-# $Id: mirror.py,v 1.14 2006/02/22 18:26:46 mstenner Exp $
-
-import random
-import thread # needed for locking to make this threadsafe
-
-from grabber import URLGrabError, CallbackObject, DEBUG
-
-try:
- from i18n import _
-except ImportError as msg:
- def _(st): return st
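-
-# Illustrative sketch (hypothetical hosts; assumes the urlgrabber package
-# layout for the import, not part of the original module): mix a plain
-# string mirror with a dict mirror that carries its own grabber and
-# per-mirror kwargs, as described under CUSTOMIZATION above.
-def _example_mirror_group():
- from urlgrabber.grabber import URLGrabber
- slow = URLGrabber(throttle=0.5)
- mirrors = ['http://fast.example.com/pub/',
- {'mirror': 'http://slow.example.com/pub/',
- 'grabber': slow,
- 'kwargs': {'retry': 2}}]
- return MirrorGroup(URLGrabber(), mirrors).urlgrab('relative/path.zip')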
-
-class GrabRequest:
- """This is a dummy class used to hold information about the specific
- request. For example, a single file. By maintaining this information
- separately, we can accomplish two things:
-
- 1) make it a little easier to be threadsafe
- 2) have request-specific parameters
- """
- pass
-
-class MirrorGroup:
- """Base Mirror class
-
- Instances of this class are built with a grabber object and a list
- of mirrors. Then all calls to urlXXX should be passed relative urls.
- The requested file will be searched for on the first mirror. If the
- grabber raises an exception (possibly after some retries) then that
- mirror will be removed from the list, and the next will be attempted.
- If all mirrors are exhausted, then an exception will be raised.
-
- MirrorGroup has the following failover policy:
-
- * downloads begin with the first mirror
-
- * by default (see default_action below) a failure (after retries)
- causes it to increment the local AND master indices. Also,
- the current mirror is removed from the local list (but NOT the
- master list - the mirror can potentially be used for other
- files)
-
- * if the local list is ever exhausted, a URLGrabError will be
- raised (errno=256, no more mirrors)
-
- OPTIONS
-
- In addition to the required arguments "grabber" and "mirrors",
- MirrorGroup also takes the following optional arguments:
-
- default_action
-
- A dict that describes the actions to be taken upon failure
- (after retries). default_action can contain any of the
- following keys (shown here with their default values):
-
- default_action = {'increment': 1,
- 'increment_master': 1,
- 'remove': 1,
- 'remove_master': 0,
- 'fail': 0}
-
- In this context, 'increment' means "use the next mirror" and
- 'remove' means "never use this mirror again". The two
- 'master' values refer to the instance-level mirror list (used
- for all files), whereas the non-master values refer to the
- current download only.
-
- The 'fail' option will cause immediate failure by re-raising
- the exception and no further attempts to get the current
- download.
-
- This dict can be set at instantiation time,
- mg = MirrorGroup(grabber, mirrors, default_action={'fail':1})
- at method-execution time (only applies to current fetch),
- filename = mg.urlgrab(url, default_action={'increment': 0})
- or by returning an action dict from the failure_callback
- return {'fail':0}
- in increasing precedence.
-
- If all three of these were done, the net result would be:
- {'increment': 0, # set in method
- 'increment_master': 1, # class default
- 'remove': 1, # class default
- 'remove_master': 0, # class default
- 'fail': 0} # set at instantiation, reset
- # from callback
-
- failure_callback
-
- this is a callback that will be called when a mirror "fails",
- meaning the grabber raises some URLGrabError. If this is a
- tuple, it is interpreted to be of the form (cb, args, kwargs)
- where cb is the actual callable object (function, method,
- etc). Otherwise, it is assumed to be the callable object
- itself. The callback will be passed a grabber.CallbackObject
- instance along with args and kwargs (if present). The following
- attributes are defined within the instance:
-
- obj.exception = < exception that was raised >
- obj.mirror = < the mirror that was tried >
- obj.relative_url = < url relative to the mirror >
- obj.url = < full url that failed >
- # .url is just the combination of .mirror
- # and .relative_url
-
- The failure callback can return an action dict, as described
- above.
-
- Like default_action, the failure_callback can be set at
- instantiation time or when the urlXXX method is called. In
- the latter case, it applies only for that fetch.
-
- The callback can re-raise the exception quite easily. For
- example, this is a perfectly adequate callback function:
-
- def callback(obj): raise obj.exception
-
- WARNING: do not save the exception object (or the
- CallbackObject instance). As they contain stack frame
- references, they can lead to circular references.
-
- Notes:
- * The behavior can be customized by deriving and overriding the
- 'CONFIGURATION METHODS'
- * The 'grabber' instance is kept as a reference, not copied.
- Therefore, the grabber instance can be modified externally
- and changes will take effect immediately.
- """
-
- # notes on thread-safety:
-
- # A GrabRequest should never be shared by multiple threads because
- # it's never saved inside the MG object and never returned outside it.
- # therefore, it should be safe to access/modify grabrequest data
- # without a lock. However, accessing the mirrors and _next attributes
- # of the MG itself must be done when locked to prevent (for example)
- # removal of the wrong mirror.
-
- ##############################################################
- # CONFIGURATION METHODS - intended to be overridden to
- # customize behavior
- def __init__(self, grabber, mirrors, **kwargs):
- """Initialize the MirrorGroup object.
-
- REQUIRED ARGUMENTS
-
- grabber - URLGrabber instance
- mirrors - a list of mirrors
-
- OPTIONAL ARGUMENTS
-
- failure_callback - callback to be used when a mirror fails
- default_action - dict of failure actions
-
- See the module-level and class level documentation for more
- details.
- """
-
- # OVERRIDE IDEAS:
- # shuffle the list to randomize order
- self.grabber = grabber
- self.mirrors = self._parse_mirrors(mirrors)
- self._next = 0
- self._lock = thread.allocate_lock()
- self.default_action = None
- self._process_kwargs(kwargs)
-
- # if these values are found in **kwargs passed to one of the urlXXX
- # methods, they will be stripped before getting passed on to the
- # grabber
- options = ['default_action', 'failure_callback']
-
- def _process_kwargs(self, kwargs):
- self.failure_callback = kwargs.get('failure_callback')
- self.default_action = kwargs.get('default_action')
-
- def _parse_mirrors(self, mirrors):
- parsed_mirrors = []
- for m in mirrors:
- if type(m) == type(''): m = {'mirror': m}
- parsed_mirrors.append(m)
- return parsed_mirrors
-
- def _load_gr(self, gr):
- # OVERRIDE IDEAS:
- # shuffle gr list
- self._lock.acquire()
- gr.mirrors = list(self.mirrors)
- gr._next = self._next
- self._lock.release()
-
- def _get_mirror(self, gr):
- # OVERRIDE IDEAS:
- # return a random mirror so that multiple mirrors get used
- # even without failures.
- if not gr.mirrors:
- raise URLGrabError(256, _('No more mirrors to try.'))
- return gr.mirrors[gr._next]
-
- def _failure(self, gr, cb_obj):
- # OVERRIDE IDEAS:
- # inspect the error - remove=1 for 404, remove=2 for connection
- # refused, etc. (this can also be done via
- # the callback)
- cb = gr.kw.get('failure_callback') or self.failure_callback
- if cb:
- if type(cb) == type( () ):
- cb, args, kwargs = cb
- else:
- args, kwargs = (), {}
- action = cb(cb_obj, *args, **kwargs) or {}
- else:
- action = {}
- # XXXX - decide - there are two ways to do this
- # the first is action-overriding as a whole - use the entire action
- # or fall back on module level defaults
- #action = action or gr.kw.get('default_action') or self.default_action
- # the other is to fall through for each element in the action dict
- a = dict(self.default_action or {})
- a.update(gr.kw.get('default_action', {}))
- a.update(action)
- action = a
- self.increment_mirror(gr, action)
- # re-raise the original error so the caller sees the real failure
- if action and action.get('fail', 0): raise cb_obj.exception
-
- def increment_mirror(self, gr, action={}):
- """Tell the mirror object increment the mirror index
-
- This increments the mirror index, which amounts to telling the
- mirror object to use a different mirror (for this and future
- downloads).
-
- This is a SEMI-public method. It will be called internally,
- and you may never need to call it. However, it is provided
- (and is made public) so that the calling program can increment
- the mirror choice for methods like urlopen. For example, with
- urlopen, there's no good way for the mirror group to know that
- an error occurs mid-download (it's already returned and given
- you the file object).
-
- remove --- can have several values
- 0 do not remove the mirror from the list
- 1 remove the mirror for this download only
- 2 remove the mirror permanently
-
- beware of remove=0 as it can lead to infinite loops
- """
- badmirror = gr.mirrors[gr._next]
-
- self._lock.acquire()
- try:
- ind = self.mirrors.index(badmirror)
- except ValueError:
- pass
- else:
- if action.get('remove_master', 0):
- del self.mirrors[ind]
- elif self._next == ind and action.get('increment_master', 1):
- self._next += 1
- if self._next >= len(self.mirrors): self._next = 0
- self._lock.release()
-
- if action.get('remove', 1):
- del gr.mirrors[gr._next]
- elif action.get('increment', 1):
- gr._next += 1
- if gr._next >= len(gr.mirrors): gr._next = 0
-
- if DEBUG:
- grm = [m['mirror'] for m in gr.mirrors]
- DEBUG.info('GR mirrors: [%s] %i', ' '.join(grm), gr._next)
- selfm = [m['mirror'] for m in self.mirrors]
- DEBUG.info('MAIN mirrors: [%s] %i', ' '.join(selfm), self._next)
-
- #####################################################################
- # NON-CONFIGURATION METHODS
- # these methods are designed to be largely workhorse methods that
- # are not intended to be overridden. That doesn't mean you can't;
- # if you want to, feel free, but most things can be done by
- # by overriding the configuration methods :)
-
- def _join_url(self, base_url, rel_url):
- if base_url.endswith('/') or rel_url.startswith('/'):
- return base_url + rel_url
- else:
- return base_url + '/' + rel_url
-
- def _mirror_try(self, func, url, kw):
- gr = GrabRequest()
- gr.func = func
- gr.url = url
- gr.kw = dict(kw)
- self._load_gr(gr)
-
- for k in self.options:
- try: del kw[k]
- except KeyError: pass
-
- while 1:
- mirrorchoice = self._get_mirror(gr)
- fullurl = self._join_url(mirrorchoice['mirror'], gr.url)
- kwargs = dict(mirrorchoice.get('kwargs', {}))
- kwargs.update(kw)
- grabber = mirrorchoice.get('grabber') or self.grabber
- func_ref = getattr(grabber, func)
- if DEBUG: DEBUG.info('MIRROR: trying %s -> %s', url, fullurl)
- try:
- return func_ref(fullurl, **kwargs)
- except URLGrabError as e:
- if DEBUG: DEBUG.info('MIRROR: failed')
- obj = CallbackObject()
- obj.exception = e
- obj.mirror = mirrorchoice['mirror']
- obj.relative_url = gr.url
- obj.url = fullurl
- self._failure(gr, obj)
-
- def urlgrab(self, url, filename=None, **kwargs):
- kw = dict(kwargs)
- kw['filename'] = filename
- func = 'urlgrab'
- return self._mirror_try(func, url, kw)
-
- def urlopen(self, url, **kwargs):
- kw = dict(kwargs)
- func = 'urlopen'
- return self._mirror_try(func, url, kw)
-
- def urlread(self, url, limit=None, **kwargs):
- kw = dict(kwargs)
- kw['limit'] = limit
- func = 'urlread'
- return self._mirror_try(func, url, kw)
-
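-# Illustrative sketch (hypothetical callback, not part of the original
-# module): permanently drop a mirror on an HTTP error (URLGrabError
-# errno 14) and otherwise fall back on the default failover action, e.g.
-# MirrorGroup(grabber, mirrors, failure_callback=_example_drop_on_http_error)
-def _example_drop_on_http_error(cb_obj):
- if getattr(cb_obj.exception, 'errno', None) == 14:
- return {'remove_master': 1}
- return None # empty action -> the default_action fallthrough applies
-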
-
-class MGRandomStart(MirrorGroup):
- """A mirror group that starts at a random mirror in the list.
-
- The behavior of this class is identical to MirrorGroup, except that
- it starts at a random location in the mirror list.
- """
-
- def __init__(self, grabber, mirrors, **kwargs):
- """Initialize the object
-
- The arguments for initialization are the same as for MirrorGroup
- """
- MirrorGroup.__init__(self, grabber, mirrors, **kwargs)
- self._next = random.randrange(len(mirrors))
-
-class MGRandomOrder(MirrorGroup):
- """A mirror group that uses mirrors in a random order.
-
- The behavior of this class is identical to MirrorGroup, except that
- it uses the mirrors in a random order. Note that the order is set at
- initialization time and fixed thereafter. That is, it does not pick a
- random mirror after each failure.
- """
-
- def __init__(self, grabber, mirrors, **kwargs):
- """Initialize the object
-
- The arguments for initialization are the same as for MirrorGroup
- """
- MirrorGroup.__init__(self, grabber, mirrors, **kwargs)
- random.shuffle(self.mirrors)
-
-if __name__ == '__main__':
- pass
+++ /dev/null
-# This library is free software; you can redistribute it and/or
-# modify it under the terms of the GNU Lesser General Public
-# License as published by the Free Software Foundation; either
-# version 2.1 of the License, or (at your option) any later version.
-#
-# This library is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-# Lesser General Public License for more details.
-#
-# You should have received a copy of the GNU Lesser General Public
-# License along with this library; if not, write to the
-# Free Software Foundation, Inc.,
-# 59 Temple Place, Suite 330,
-# Boston, MA 02111-1307 USA
-
-# This file is part of urlgrabber, a high-level cross-protocol url-grabber
-# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
-
-# $Id: progress.py,v 1.7 2005/08/19 21:59:07 mstenner Exp $
-
-import sys
-import time
-import math
-import thread
-
-class BaseMeter:
- def __init__(self):
- self.update_period = 0.3 # seconds
-
- self.filename = None
- self.url = None
- self.basename = None
- self.text = None
- self.size = None
- self.start_time = None
- self.last_amount_read = 0
- self.last_update_time = None
- self.re = RateEstimator()
-
- def start(self, filename=None, url=None, basename=None,
- size=None, now=None, text=None):
- self.filename = filename
- self.url = url
- self.basename = basename
- self.text = text
-
- #size = None ######### TESTING
- self.size = size
- if not size is None: self.fsize = format_number(size) + 'B'
-
- if now is None: now = time.time()
- self.start_time = now
- self.re.start(size, now)
- self.last_amount_read = 0
- self.last_update_time = now
- self._do_start(now)
-
- def _do_start(self, now=None):
- pass
-
- def update(self, amount_read, now=None):
- # for a real gui, you probably want to override and put a call
- # to your mainloop iteration function here
- if now is None: now = time.time()
- if (now >= self.last_update_time + self.update_period) or \
- not self.last_update_time:
- self.re.update(amount_read, now)
- self.last_amount_read = amount_read
- self.last_update_time = now
- self._do_update(amount_read, now)
-
- def _do_update(self, amount_read, now=None):
- pass
-
- def end(self, amount_read, now=None):
- if now is None: now = time.time()
- self.re.update(amount_read, now)
- self.last_amount_read = amount_read
- self.last_update_time = now
- self._do_end(amount_read, now)
-
- def _do_end(self, amount_read, now=None):
- pass
-
-class TextMeter(BaseMeter):
- def __init__(self, fo=sys.stderr):
- BaseMeter.__init__(self)
- self.fo = fo
-
- def _do_update(self, amount_read, now=None):
- etime = self.re.elapsed_time()
- fetime = format_time(etime)
- fread = format_number(amount_read)
- #self.size = None
- if self.text is not None:
- text = self.text
- else:
- text = self.basename
- if self.size is None:
- out = '\r%-60.60s %5sB %s ' % \
- (text, fread, fetime)
- else:
- rtime = self.re.remaining_time()
- frtime = format_time(rtime)
- frac = self.re.fraction_read()
- bar = '='*int(25 * frac)
-
- out = '\r%-25.25s %3i%% |%-25.25s| %5sB %8s ETA ' % \
- (text, frac*100, bar, fread, frtime)
-
- self.fo.write(out)
- self.fo.flush()
-
- def _do_end(self, amount_read, now=None):
- total_time = format_time(self.re.elapsed_time())
- total_size = format_number(amount_read)
- if self.text is not None:
- text = self.text
- else:
- text = self.basename
- if self.size is None:
- out = '\r%-60.60s %5sB %s ' % \
- (text, total_size, total_time)
- else:
- bar = '='*25
- out = '\r%-25.25s %3i%% |%-25.25s| %5sB %8s ' % \
- (text, 100, bar, total_size, total_time)
- self.fo.write(out + '\n')
- self.fo.flush()
-
-text_progress_meter = TextMeter
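-
-# Illustrative sketch (hypothetical URL; assumes the urlgrabber package
-# layout for the import): attach a TextMeter to a single download so
-# progress is drawn on stderr; urlgrab() drives start()/update()/end().
-def _example_text_meter(url='http://example.com/big.iso'):
- from urlgrabber.grabber import urlgrab
- return urlgrab(url, 'big.iso', progress_obj=TextMeter())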
-
-class MultiFileHelper(BaseMeter):
- def __init__(self, master):
- BaseMeter.__init__(self)
- self.master = master
-
- def _do_start(self, now):
- self.master.start_meter(self, now)
-
- def _do_update(self, amount_read, now):
- # elapsed time since last update
- self.master.update_meter(self, now)
-
- def _do_end(self, amount_read, now):
- self.ftotal_time = format_time(now - self.start_time)
- self.ftotal_size = format_number(self.last_amount_read)
- self.master.end_meter(self, now)
-
- def failure(self, message, now=None):
- self.master.failure_meter(self, message, now)
-
- def message(self, message):
- self.master.message_meter(self, message)
-
-class MultiFileMeter:
- helperclass = MultiFileHelper
- def __init__(self):
- self.meters = []
- self.in_progress_meters = []
- self._lock = thread.allocate_lock()
- self.update_period = 0.3 # seconds
-
- self.numfiles = None
- self.finished_files = 0
- self.failed_files = 0
- self.open_files = 0
- self.total_size = None
- self.failed_size = 0
- self.start_time = None
- self.finished_file_size = 0
- self.last_update_time = None
- self.re = RateEstimator()
-
- def start(self, numfiles=None, total_size=None, now=None):
- if now is None: now = time.time()
- self.numfiles = numfiles
- self.finished_files = 0
- self.failed_files = 0
- self.open_files = 0
- self.total_size = total_size
- self.failed_size = 0
- self.start_time = now
- self.finished_file_size = 0
- self.last_update_time = now
- self.re.start(total_size, now)
- self._do_start(now)
-
- def _do_start(self, now):
- pass
-
- def end(self, now=None):
- if now is None: now = time.time()
- self._do_end(now)
-
- def _do_end(self, now):
- pass
-
- def lock(self): self._lock.acquire()
- def unlock(self): self._lock.release()
-
- ###########################################################
- # child meter creation and destruction
- def newMeter(self):
- newmeter = self.helperclass(self)
- self.meters.append(newmeter)
- return newmeter
-
- def removeMeter(self, meter):
- self.meters.remove(meter)
-
- ###########################################################
- # child functions - these should only be called by helpers
- def start_meter(self, meter, now):
- if not meter in self.meters:
- raise ValueError('attempt to use orphaned meter')
- self._lock.acquire()
- try:
- if not meter in self.in_progress_meters:
- self.in_progress_meters.append(meter)
- self.open_files += 1
- finally:
- self._lock.release()
- self._do_start_meter(meter, now)
-
- def _do_start_meter(self, meter, now):
- pass
-
- def update_meter(self, meter, now):
- if not meter in self.meters:
- raise ValueError('attempt to use orphaned meter')
- if (now >= self.last_update_time + self.update_period) or \
- not self.last_update_time:
- self.re.update(self._amount_read(), now)
- self.last_update_time = now
- self._do_update_meter(meter, now)
-
- def _do_update_meter(self, meter, now):
- pass
-
- def end_meter(self, meter, now):
- if not meter in self.meters:
- raise ValueError('attempt to use orphaned meter')
- self._lock.acquire()
- try:
- try: self.in_progress_meters.remove(meter)
- except ValueError: pass
- self.open_files -= 1
- self.finished_files += 1
- self.finished_file_size += meter.last_amount_read
- finally:
- self._lock.release()
- self._do_end_meter(meter, now)
-
- def _do_end_meter(self, meter, now):
- pass
-
- def failure_meter(self, meter, message, now):
- if not meter in self.meters:
- raise ValueError('attempt to use orphaned meter')
- self._lock.acquire()
- try:
- try: self.in_progress_meters.remove(meter)
- except ValueError: pass
- self.open_files -= 1
- self.failed_files += 1
- if meter.size and self.failed_size is not None:
- self.failed_size += meter.size
- else:
- self.failed_size = None
- finally:
- self._lock.release()
- self._do_failure_meter(meter, message, now)
-
- def _do_failure_meter(self, meter, message, now):
- pass
-
- def message_meter(self, meter, message):
- pass
-
- ########################################################
- # internal functions
- def _amount_read(self):
- tot = self.finished_file_size
- for m in self.in_progress_meters:
- tot += m.last_amount_read
- return tot
-
-
-class TextMultiFileMeter(MultiFileMeter):
- def __init__(self, fo=sys.stderr):
- self.fo = fo
- MultiFileMeter.__init__(self)
-
- # files: ###/### ###% data: ######/###### ###% time: ##:##:##/##:##:##
- def _do_update_meter(self, meter, now):
- self._lock.acquire()
- try:
- format = "files: %3i/%-3i %3i%% data: %6.6s/%-6.6s %3i%% " \
- "time: %8.8s/%8.8s"
- df = self.finished_files
- tf = self.numfiles or 1
- pf = 100 * float(df)/tf + 0.49
- dd = self.re.last_amount_read
- td = self.total_size
- pd = 100 * (self.re.fraction_read() or 0) + 0.49
- dt = self.re.elapsed_time()
- rt = self.re.remaining_time()
- if rt is None: tt = None
- else: tt = dt + rt
-
- fdd = format_number(dd) + 'B'
- ftd = format_number(td) + 'B'
- fdt = format_time(dt, 1)
- ftt = format_time(tt, 1)
-
- out = '%-79.79s' % (format % (df, tf, pf, fdd, ftd, pd, fdt, ftt))
- self.fo.write('\r' + out)
- self.fo.flush()
- finally:
- self._lock.release()
-
- def _do_end_meter(self, meter, now):
- self._lock.acquire()
- try:
- format = "%-30.30s %6.6s %8.8s %9.9s"
- fn = meter.basename
- size = meter.last_amount_read
- fsize = format_number(size) + 'B'
- et = meter.re.elapsed_time()
- fet = format_time(et, 1)
- frate = format_number(size / et) + 'B/s'
-
- out = '%-79.79s' % (format % (fn, fsize, fet, frate))
- self.fo.write('\r' + out + '\n')
- finally:
- self._lock.release()
- self._do_update_meter(meter, now)
-
- def _do_failure_meter(self, meter, message, now):
- self._lock.acquire()
- try:
- format = "%-30.30s %6.6s %s"
- fn = meter.basename
- if type(message) in (type(''), type(u'')):
- message = message.splitlines()
- if not message: message = ['']
- out = '%-79s' % (format % (fn, 'FAILED', message[0] or ''))
- self.fo.write('\r' + out + '\n')
- for m in message[1:]: self.fo.write(' ' + m + '\n')
- finally:
- self._lock.release()
- self._do_update_meter(meter, now)
-
- def message_meter(self, meter, message):
- self._lock.acquire()
- try:
- pass
- finally:
- self._lock.release()
-
- def _do_end(self, now):
- self._do_update_meter(None, now)
- self._lock.acquire()
- try:
- self.fo.write('\n')
- self.fo.flush()
- finally:
- self._lock.release()
-
-######################################################################
-# support classes and functions
-
-class RateEstimator:
- def __init__(self, timescale=5.0):
- self.timescale = timescale
-
- def start(self, total=None, now=None):
- if now is None: now = time.time()
- self.total = total
- self.start_time = now
- self.last_update_time = now
- self.last_amount_read = 0
- self.ave_rate = None
-
- def update(self, amount_read, now=None):
- if now is None: now = time.time()
- if amount_read == 0:
- # if we just started this file, all bets are off
- self.last_update_time = now
- self.last_amount_read = 0
- self.ave_rate = None
- return
-
- #print 'times', now, self.last_update_time
- time_diff = now - self.last_update_time
- read_diff = amount_read - self.last_amount_read
- self.last_update_time = now
- self.last_amount_read = amount_read
- self.ave_rate = self._temporal_rolling_ave(\
- time_diff, read_diff, self.ave_rate, self.timescale)
- #print 'results', time_diff, read_diff, self.ave_rate
-
- #####################################################################
- # result methods
- def average_rate(self):
- "get the average transfer rate (in bytes/second)"
- return self.ave_rate
-
- def elapsed_time(self):
- "the time between the start of the transfer and the most recent update"
- return self.last_update_time - self.start_time
-
- def remaining_time(self):
- "estimated time remaining"
- if not self.ave_rate or not self.total: return None
- return (self.total - self.last_amount_read) / self.ave_rate
-
- def fraction_read(self):
- """the fraction of the data that has been read
- (can be None for unknown transfer size)"""
- if self.total is None: return None
- elif self.total == 0: return 1.0
- else: return float(self.last_amount_read)/self.total
-
- #########################################################################
- # support methods
- def _temporal_rolling_ave(self, time_diff, read_diff, last_ave, timescale):
- """a temporal rolling average performs smooth averaging even when
- updates come at irregular intervals. This is performed by scaling
- the "epsilon" according to the time since the last update.
- Specifically, epsilon = time_diff / timescale
-
- As a general rule, the average will take on a completely new value
- after 'timescale' seconds."""
- epsilon = time_diff / timescale
- if epsilon > 1: epsilon = 1.0
- return self._rolling_ave(time_diff, read_diff, last_ave, epsilon)
-
- def _rolling_ave(self, time_diff, read_diff, last_ave, epsilon):
- """perform a "rolling average" iteration
- a rolling average "folds" new data into an existing average with
- some weight, epsilon. epsilon must be between 0.0 and 1.0 (inclusive)
- a value of 0.0 means only the old value (initial value) counts,
- and a value of 1.0 means only the newest value is considered."""
-
- try:
- recent_rate = read_diff / time_diff
- except ZeroDivisionError:
- recent_rate = None
- if last_ave is None: return recent_rate
- elif recent_rate is None: return last_ave
-
- # at this point, both last_ave and recent_rate are numbers
- return epsilon * recent_rate + (1 - epsilon) * last_ave
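-
- # Worked example (illustrative): with timescale=5.0, an update arriving
- # 1.0 s after the previous one gives epsilon = 1.0/5.0 = 0.2. If 100000
- # bytes were read in that second (recent_rate = 100000 B/s) and last_ave
- # was 50000 B/s, the new average is 0.2*100000 + 0.8*50000 = 60000 B/s.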
-
- def _round_remaining_time(self, rt, start_time=15.0):
- """round the remaining time, depending on its size
- If rt is between n*start_time and (n+1)*start_time round downward
- to the nearest multiple of n (for any counting number n).
- If rt < start_time, round down to the nearest 1.
- For example (for start_time = 15.0):
- 2.7 -> 2.0
- 25.2 -> 25.0
- 26.4 -> 26.0
- 35.3 -> 34.0
- 63.6 -> 60.0
- """
-
-        # rt == 0 would make math.log() raise ValueError, so bail out early
-        if rt <= 0: return 0.0
-        shift = int(math.log(rt/start_time)/math.log(2))
- rt = int(rt)
- if shift <= 0: return rt
- return float(int(rt) >> shift << shift)
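
Tracing one of the documented examples makes the bit-shift arithmetic concrete (a standalone sketch, not module code):

    import math
    rt, start_time = 63.6, 15.0
    shift = int(math.log(rt / start_time) / math.log(2))  # log2(4.24) ~ 2.08 -> 2
    print(float(int(rt) >> shift << shift))  # 63 -> 60.0, a multiple of 2**2
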
-
-
-def format_time(seconds, use_hours=0):
- if seconds is None or seconds < 0:
- if use_hours: return '--:--:--'
- else: return '--:--'
- else:
- seconds = int(seconds)
-        minutes = seconds // 60
-        seconds = seconds % 60
-        if use_hours:
-            hours = minutes // 60
-            minutes = minutes % 60
- return '%02i:%02i:%02i' % (hours, minutes, seconds)
- else:
- return '%02i:%02i' % (minutes, seconds)
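
For reference, a few expected values (assuming format_time as defined above):

    print(format_time(83))                # '01:23'
    print(format_time(83, use_hours=1))   # '00:01:23'
    print(format_time(None))              # '--:--'
    print(format_time(-5, use_hours=1))   # '--:--:--'
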
-
-def format_number(number, SI=0, space=' '):
- """Turn numbers into human-readable metric-like numbers"""
- symbols = ['', # (none)
- 'k', # kilo
- 'M', # mega
- 'G', # giga
- 'T', # tera
- 'P', # peta
- 'E', # exa
- 'Z', # zetta
- 'Y'] # yotta
-
- if SI: step = 1000.0
- else: step = 1024.0
-
- thresh = 999
- depth = 0
- max_depth = len(symbols) - 1
-
-    # divide until the number falls below thresh, but never run past the
-    # end of the symbol list; beyond that point the field width may
-    # overflow, but the value shown is still correct
- while number > thresh and depth < max_depth:
- depth = depth + 1
- number = number / step
-
-    if type(number) == type(1) or type(number) == type(1L):
-        # it's an int or a long, which means it didn't get divided,
-        # which means it's already short enough
-        format = '%i%s%s'
- elif number < 9.95:
- # must use 9.95 for proper sizing. For example, 9.99 will be
- # rounded to 10.0 with the .1f format string (which is too long)
- format = '%.1f%s%s'
- else:
- format = '%.0f%s%s'
-
-    return format % (float(number or 0), space, symbols[depth])
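
Likewise for format_number, illustrating the 1024-vs-1000 step and the 9.95 cutoff (expected values, assuming the function as defined above):

    print(format_number(512))          # '512 '  (int below the threshold, printed as-is)
    print(format_number(9500))         # '9.3 k' (9500 / 1024 ~ 9.28, one decimal)
    print(format_number(9500, SI=1))   # '9.5 k' (9500 / 1000 = 9.5)
    print(format_number(10240))        # '10 k'  (10.0 >= 9.95, so no decimal)
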
+++ /dev/null
-# This library is free software; you can redistribute it and/or
-# modify it under the terms of the GNU Lesser General Public
-# License as published by the Free Software Foundation; either
-# version 2.1 of the License, or (at your option) any later version.
-#
-# This library is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-# Lesser General Public License for more details.
-#
-# You should have received a copy of the GNU Lesser General Public
-# License along with this library; if not, write to the
-# Free Software Foundation, Inc.,
-# 59 Temple Place, Suite 330,
-# Boston, MA 02111-1307 USA
-
-# This file is part of urlgrabber, a high-level cross-protocol url-grabber
-
-import httplib
-import urllib2
-
-try:
- from M2Crypto import SSL
- from M2Crypto import httpslib
- from M2Crypto import m2urllib2
-
- SSL.Connection.clientPostConnectionCheck = None
- have_m2crypto = True
-except ImportError:
- have_m2crypto = False
-
-DEBUG = None
-
-if have_m2crypto:
-
- class M2SSLFactory:
-
- def __init__(self, ssl_ca_cert, ssl_context):
- self.ssl_context = self._get_ssl_context(ssl_ca_cert, ssl_context)
-
- def _get_ssl_context(self, ssl_ca_cert, ssl_context):
- """
- Create an ssl context using the CA cert file or ssl context.
-
- The CA cert is used first if it was passed as an option. If not,
- then the supplied ssl context is used. If no ssl context was supplied,
- None is returned.
- """
- if ssl_ca_cert:
- context = SSL.Context()
- context.load_verify_locations(ssl_ca_cert)
- context.set_verify(SSL.verify_none, -1)
- return context
- else:
- return ssl_context
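
A hypothetical illustration of that precedence (the CA path is a placeholder and M2Crypto must be installed):

    from M2Crypto import SSL
    existing = SSL.Context()
    a = M2SSLFactory('/path/to/ca.pem', existing)  # CA cert wins: a new context is built
    b = M2SSLFactory(None, existing)               # no CA cert: 'existing' is reused
    c = M2SSLFactory(None, None)                   # neither: ssl_context is None
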
-
- def create_https_connection(self, host, response_class = None):
-            # use M2Crypto's httpslib wrapper here; plain httplib.HTTPSConnection
-            # would treat the second positional argument as a port number
-            connection = httpslib.HTTPSConnection(host, ssl_context = self.ssl_context)
- if response_class:
- connection.response_class = response_class
- return connection
-
- def create_opener(self, *handlers):
- return m2urllib2.build_opener(self.ssl_context, *handlers)
-
-
-class SSLFactory:
-
- def create_https_connection(self, host, response_class = None):
- connection = httplib.HTTPSConnection(host)
- if response_class:
- connection.response_class = response_class
- return connection
-
- def create_opener(self, *handlers):
- return urllib2.build_opener(*handlers)
-
-
-
-def get_factory(ssl_ca_cert = None, ssl_context = None):
- """ Return an SSLFactory, based on if M2Crypto is available. """
- if have_m2crypto:
- return M2SSLFactory(ssl_ca_cert, ssl_context)
- else:
-        # Warn if SSL arguments were supplied, since we cannot honor them.
- if ssl_ca_cert or ssl_context:
- if DEBUG:
- DEBUG.warning("SSL arguments supplied, but M2Crypto is not available. "
- "Using Python SSL.")
- return SSLFactory()
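
Callers never branch on M2Crypto availability themselves: both factories expose the same two methods, so usage is uniform. A hypothetical sketch (URL and CA path are placeholders):

    factory = get_factory(ssl_ca_cert='/etc/pki/tls/certs/ca-bundle.crt')
    opener = factory.create_opener()
    resp = opener.open('https://example.com/')
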
MOD_NAME + '/3rdparty/pykickstart',
MOD_NAME + '/3rdparty/pykickstart/commands',
MOD_NAME + '/3rdparty/pykickstart/handlers',
- MOD_NAME + '/3rdparty/pykickstart/urlgrabber',
]
IMAGER_PLUGINS = glob.glob(os.path.join("plugins", "imager", "*.py"))