1 """Open an arbitrary URL.
3 See the following document for more info on URLs:
4 "Names and Addresses, URIs, URLs, URNs, URCs", at
5 http://www.w3.org/pub/WWW/Addressing/Overview.html
7 See also the HTTP spec (from which the error codes are derived):
8 "HTTP - Hypertext Transfer Protocol", at
9 http://www.w3.org/pub/WWW/Protocols/
11 Related standards and specs:
12 - RFC1808: the "relative URL" spec. (authoritative status)
13 - RFC1738 - the "URL standard". (authoritative status)
14 - RFC1630 - the "URI spec". (informational status)
16 The object returned by URLopener().open(file) will differ per
17 protocol. All you know is that it has methods read(), readline(),
18 readlines(), fileno(), close() and info(). The read*(), fileno()
19 and close() methods work like those of open files.
20 The info() method returns a mimetools.Message object which can be
21 used to query various info about the object, if available.
22 (mimetools.Message objects are queried with the getheader() method.)
32 from urlparse import urljoin as basejoin
34 __all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
35 "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
36 "urlencode", "url2pathname", "pathname2url", "splittag",
37 "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
38 "splittype", "splithost", "splituser", "splitpasswd", "splitport",
39 "splitnport", "splitquery", "splitattr", "splitvalue",
42 __version__ = '1.17' # XXX This version is not always updated :-(
44 MAXFTPCACHE = 10 # Trim the ftp cache beyond this size
46 # Helper for non-unix systems
48 from nturl2path import url2pathname, pathname2url
49 elif os.name == 'riscos':
50 from rourl2path import url2pathname, pathname2url
def url2pathname(pathname):
    """OS-specific conversion from a relative URL of the 'file' scheme
    to a file system path; not recommended for general use."""
    # Fallback branch used when os.name is neither 'nt' nor 'riscos'
    # (see the platform imports above): the path is just percent-decoded.
    return unquote(pathname)
def pathname2url(pathname):
    """OS-specific conversion from a file system path to a relative URL
    of the 'file' scheme; not recommended for general use."""
    # Fallback branch used when os.name is neither 'nt' nor 'riscos':
    # the path is just percent-encoded.
    return quote(pathname)
62 # This really consists of two pieces:
63 # (1) a class which handles opening of all sorts of URLs
64 # (plus assorted utilities etc.)
65 # (2) a set of functions for parsing URLs
66 # XXX Should these be separated out into different modules?
69 # Shortcut for basic usage
def urlopen(url, data=None, proxies=None):
    """Create a file-like object for the specified URL to read from.

    NOTE(review): this copy looks truncated -- the `global _urlopener`
    bookkeeping and the branch keywords selecting between the two
    `opener = ...` lines and the two return statements are not visible
    here; verify against the full source before editing.
    """
    # Deprecation notice: urllib.urlopen was removed in Python 3.
    from warnings import warnpy3k
    warnpy3k("urllib.urlopen() has been removed in Python 3.0 in "
             "favor of urllib2.urlopen()", stacklevel=2)
    if proxies is not None:
        # Caller supplied an explicit scheme -> proxy-URL mapping.
        opener = FancyURLopener(proxies=proxies)
    opener = FancyURLopener()
    return opener.open(url)
    return opener.open(url, data)
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """Retrieve *url* into *filename* (or a temp file).

    Returns a (filename, headers) pair; NOTE(review): upstream reuses a
    module-level `_urlopener` via `global`/`if not _urlopener:` lines
    that are not visible in this copy.
    """
    _urlopener = FancyURLopener()
    return _urlopener.retrieve(url, filename, reporthook, data)
# Raised by retrieve() when fewer bytes arrive than Content-Length promised.
class ContentTooShortError(IOError):
    """Downloaded size does not match the Content-Length header."""

    def __init__(self, message, content):
        # Keep the partial payload so callers can inspect what did arrive.
        self.content = content
        IOError.__init__(self, message)
116 """Class to open URLs.
117 This is a class rather than just a subroutine because we may need
118 more than one set of global protocol-specific options.
119 Note -- this is a base class for those who don't want the
120 automatic handling of errors type 302 (relocated) and 401
121 (authorization needed)."""
version = "Python-urllib/%s" % __version__   # default User-Agent string

def __init__(self, proxies=None, **x509):
    # NOTE(review): upstream guards the next line with `if proxies is None:`;
    # that line is not visible in this copy -- verify before editing.
    proxies = getproxies()
    # Python 2 mapping check (dict.has_key); proxies maps scheme -> proxy URL.
    assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
    self.proxies = proxies
    # Optional SSL client credentials, forwarded to httplib.HTTPS.
    self.key_file = x509.get('key_file')
    self.cert_file = x509.get('cert_file')
    self.addheaders = [('User-Agent', self.version)]
    self.__tempfiles = []
    self.__unlink = os.unlink # See cleanup()
    self.tempcache = None
    # Undocumented feature: if you assign {} to tempcache,
    # it is used to cache files retrieved with
    # self.retrieve(). This is not enabled by default
    # since it does not work for changing documents (and I
    # haven't got the logic to check expiration headers
    self.ftpcache = ftpcache
    # Undocumented feature: you can use a different
    # ftp cache by assigning to the .ftpcache member;
    # in case you want logically independent URL openers
    # XXX This is not threadsafe. Bah.
158 # This code sometimes runs when the rest of this module
159 # has already been deleted, so it can't use any globals
160 # or import anything.
162 for file in self.__tempfiles:
167 del self.__tempfiles[:]
169 self.tempcache.clear()
def addheader(self, *args):
    """Register an extra (name, value) header sent with HTTP requests.

    e.g. u.addheader('Accept', 'sound/basic')
    """
    header = args
    self.addheaders.append(header)
def open(self, fullurl, data=None):
    """Use URLopener().open(file) instead of open(file, 'r').

    NOTE(review): several lines are not visible in this copy (the
    default-to-'file' scheme fallback, the else/proxy=None branch,
    `self.type = urltype`, and the try/`if data is None:` dispatch
    that the trailing except belongs to); verify against full source.
    """
    fullurl = unwrap(toBytes(fullurl))
    # percent encode url, fixing lame server errors for e.g, like space
    fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]|")
    # Serve from the (optional, undocumented) temp-file cache when enabled.
    if self.tempcache and fullurl in self.tempcache:
        filename, headers = self.tempcache[fullurl]
        fp = open(filename, 'rb')
        return addinfourl(fp, headers, fullurl)
    urltype, url = splittype(fullurl)
    if urltype in self.proxies:
        # Route through the configured proxy for this scheme.
        proxy = self.proxies[urltype]
        urltype, proxyhost = splittype(proxy)
        host, selector = splithost(proxyhost)
        url = (host, fullurl) # Signal special case to open_*()
    # Dispatch to the scheme-specific handler, e.g. open_http().
    name = 'open_' + urltype
    name = name.replace('-', '_')
    if not hasattr(self, name):
        return self.open_unknown_proxy(proxy, fullurl, data)
        return self.open_unknown(fullurl, data)
    return getattr(self, name)(url)
    return getattr(self, name)(url, data)
    # Python 2 re-raise preserving the original traceback.
    except socket.error, msg:
        raise IOError, ('socket error', msg), sys.exc_info()[2]
213 def open_unknown(self, fullurl, data=None):
214 """Overridable interface to open unknown URL type."""
215 type, url = splittype(fullurl)
216 raise IOError, ('url error', 'unknown url type', type)
218 def open_unknown_proxy(self, proxy, fullurl, data=None):
219 """Overridable interface to open unknown URL type."""
220 type, url = splittype(fullurl)
221 raise IOError, ('url error', 'invalid proxy for %s' % type, proxy)
def retrieve(self, url, filename=None, reporthook=None, data=None):
    """retrieve(url) returns (filename, headers) for a local object
    or (tempfilename, headers) for a remote object.

    NOTE(review): the download loop and several lines (try/except,
    the `hdrs`, `headers`, `bs`, `blocknum`, `read` assignments and
    branch keywords) are not visible in this copy; verify upstream.
    """
    url = unwrap(toBytes(url))
    if self.tempcache and url in self.tempcache:
        return self.tempcache[url]
    type, url1 = splittype(url)
    if filename is None and (not type or type == 'file'):
        # Local file: no copy is needed; return its path directly.
        fp = self.open_local_file(url1)
        return url2pathname(splithost(url1)[1]), hdrs
    fp = self.open(url, data)
    tfp = open(filename, 'wb')
    # Derive a temp-file suffix from the URL path's extension,
    # stripping query string and ;attr=value parts first.
    garbage, path = splittype(url)
    garbage, path = splithost(path or "")
    path, garbage = splitquery(path or "")
    path, garbage = splitattr(path or "")
    suffix = os.path.splitext(path)[1]
    (fd, filename) = tempfile.mkstemp(suffix)
    self.__tempfiles.append(filename)   # removed again by cleanup()
    tfp = os.fdopen(fd, 'wb')
    result = filename, headers
    if self.tempcache is not None:
        self.tempcache[url] = result
    if "content-length" in headers:
        size = int(headers["Content-Length"])
    reporthook(blocknum, bs, size)
    reporthook(blocknum, bs, size)
    # raise exception if actual size does not match content-length header
    if size >= 0 and read < size:
        raise ContentTooShortError("retrieval incomplete: got only %i out "
                                   "of %i bytes" % (read, size), result)
287 # Each method named open_<type> knows how to open that type of URL
def open_http(self, url, data=None):
    """Use HTTP protocol.

    NOTE(review): many lines are elided in this copy -- the tuple-proxy
    else-branch, `proxy_passwd`/`user_passwd` defaults, the
    `if data is not None:`/`else:` around the POST/GET putrequest pair,
    `fp = h.getfile()`, and the errcode dispatch branches. Verify
    against the full source before editing.
    """
    # `url` is either "host/selector" or, for a proxy, (host, full-url)
    # as produced by open() above.
    if isinstance(url, str):
        host, selector = splithost(url)
        user_passwd, host = splituser(host)
    # check whether the proxy contains authorization information
    proxy_passwd, host = splituser(host)
    # now we proceed with the url we want to obtain
    urltype, rest = splittype(selector)
    if urltype.lower() != 'http':
        realhost, rest = splithost(rest)
        user_passwd, realhost = splituser(realhost)
        selector = "%s://%s%s" % (urltype, realhost, rest)
        if proxy_bypass(realhost):
    #print "proxy via http:", host, selector
    if not host: raise IOError, ('http error', 'no host given')
    # Encode credentials for Basic auth headers.
    proxy_passwd = unquote(proxy_passwd)
    proxy_auth = base64.b64encode(proxy_passwd).strip()
    user_passwd = unquote(user_passwd)
    auth = base64.b64encode(user_passwd).strip()
    h = httplib.HTTP(host)   # legacy pre-HTTP/1.1 interface
    h.putrequest('POST', selector)
    h.putheader('Content-Type', 'application/x-www-form-urlencoded')
    h.putheader('Content-Length', '%d' % len(data))
    h.putrequest('GET', selector)
    if proxy_auth: h.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth)
    if auth: h.putheader('Authorization', 'Basic %s' % auth)
    if realhost: h.putheader('Host', realhost)
    for args in self.addheaders: h.putheader(*args)
    errcode, errmsg, headers = h.getreply()
    # something went wrong with the HTTP status line
    raise IOError, ('http protocol error', 0,
                    'got a bad status line', None)
    # According to RFC 2616, "2xx" code indicates that the client's
    # request was successfully received, understood, and accepted.
    if (200 <= errcode < 300):
        return addinfourl(fp, headers, "http:" + url, errcode)
    return self.http_error(url, fp, errcode, errmsg, headers)
    return self.http_error(url, fp, errcode, errmsg, headers, data)
def http_error(self, url, fp, errcode, errmsg, headers, data=None):
    """Handle http errors.

    Derived class can override this, or provide specific handlers
    named http_error_DDD where DDD is the 3-digit error code.

    NOTE(review): the `if data is None:`/`else:` lines selecting
    between the two method() calls are not visible in this copy.
    """
    # First check if there's a specific handler for this error
    name = 'http_error_%d' % errcode
    if hasattr(self, name):
        method = getattr(self, name)
        result = method(url, fp, errcode, errmsg, headers)
        result = method(url, fp, errcode, errmsg, headers, data)
        # A handler returning a false value falls through to the default.
        if result: return result
    return self.http_error_default(url, fp, errcode, errmsg, headers)
def http_error_default(self, url, fp, errcode, errmsg, headers):
    """Default error handler: close the connection and raise IOError."""
    # NOTE(review): upstream calls fp.close() here; that line is not
    # visible in this copy.
    raise IOError, ('http error', errcode, errmsg, headers)
def open_https(self, url, data=None):
    """Use HTTPS protocol.

    NOTE(review): mirrors open_http(); the same lines are elided here
    (tuple-proxy else-branch, credential defaults, the POST/GET
    branch keywords, `fp = h.getfile()`, the errcode dispatch, and
    the final call's closing `data)` argument). Verify upstream.
    """
    if isinstance(url, str):
        host, selector = splithost(url)
        user_passwd, host = splituser(host)
    # determine whether the proxy contains authorization information
    proxy_passwd, host = splituser(host)
    urltype, rest = splittype(selector)
    if urltype.lower() != 'https':
        realhost, rest = splithost(rest)
        user_passwd, realhost = splituser(realhost)
        selector = "%s://%s%s" % (urltype, realhost, rest)
    #print "proxy via https:", host, selector
    if not host: raise IOError, ('https error', 'no host given')
    proxy_passwd = unquote(proxy_passwd)
    proxy_auth = base64.b64encode(proxy_passwd).strip()
    user_passwd = unquote(user_passwd)
    auth = base64.b64encode(user_passwd).strip()
    # Client certificate/key were captured from **x509 in __init__.
    h = httplib.HTTPS(host, 0,
                      key_file=self.key_file,
                      cert_file=self.cert_file)
    h.putrequest('POST', selector)
    h.putheader('Content-Type',
                'application/x-www-form-urlencoded')
    h.putheader('Content-Length', '%d' % len(data))
    h.putrequest('GET', selector)
    if proxy_auth: h.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth)
    if auth: h.putheader('Authorization', 'Basic %s' % auth)
    if realhost: h.putheader('Host', realhost)
    for args in self.addheaders: h.putheader(*args)
    errcode, errmsg, headers = h.getreply()
    # something went wrong with the HTTP status line
    raise IOError, ('http protocol error', 0,
                    'got a bad status line', None)
    # According to RFC 2616, "2xx" code indicates that the client's
    # request was successfully received, understood, and accepted.
    if (200 <= errcode < 300):
        return addinfourl(fp, headers, "https:" + url, errcode)
    return self.http_error(url, fp, errcode, errmsg, headers)
    return self.http_error(url, fp, errcode, errmsg, headers,
455 def open_file(self, url):
456 """Use local file or FTP depending on form of URL."""
457 if not isinstance(url, str):
458 raise IOError, ('file error', 'proxy support for file protocol currently not implemented')
459 if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
460 return self.open_ftp(url)
462 return self.open_local_file(url)
def open_local_file(self, url):
    """Use local file.

    NOTE(review): heavily elided in this copy -- the try/except around
    cStringIO and os.stat, `size = stats.st_size`, the host/no-host
    branch keywords, and the second addinfourl argument lines are not
    visible. Verify against the full source before editing.
    """
    import mimetypes, mimetools, email.utils
    from cStringIO import StringIO
    from StringIO import StringIO
    host, file = splithost(url)
    localname = url2pathname(file)
    stats = os.stat(localname)
    raise IOError(e.errno, e.strerror, e.filename)
    # Build pseudo HTTP headers from the file's stat information.
    modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
    mtype = mimetypes.guess_type(url)[0]
    headers = mimetools.Message(StringIO(
        'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
        (mtype or 'text/plain', size, modified)))
    urlfile = 'file://' + file
    elif file[:2] == './':
        raise ValueError("local file url may start with / or file:. Unknown url of type: %s" % url)
    return addinfourl(open(localname, 'rb'),
    # A named host is only acceptable if it resolves to this machine.
    host, port = splitport(host)
    and socket.gethostbyname(host) in (localhost(), thishost()):
    urlfile = 'file://' + file
    return addinfourl(open(localname, 'rb'),
    raise IOError, ('local file error', 'not on local host')
def open_ftp(self, url):
    """Use FTP protocol.

    NOTE(review): elided in this copy -- the try/except around
    cStringIO, user/passwd unquoting, the port int() conversion, the
    cache-pruning body, the attrs loop header, `type = 'I'` defaults
    and the `headers = ""` initialisation. Verify upstream.
    """
    if not isinstance(url, str):
        raise IOError, ('ftp error', 'proxy support for ftp protocol currently not implemented')
    import mimetypes, mimetools
    from cStringIO import StringIO
    from StringIO import StringIO
    host, path = splithost(url)
    if not host: raise IOError, ('ftp error', 'no host given')
    host, port = splitport(host)
    user, host = splituser(host)
    if user: user, passwd = splitpasswd(user)
    passwd = passwd or ''
    host = socket.gethostbyname(host)
    port = ftplib.FTP_PORT
    path, attrs = splitattr(path)
    dirs = path.split('/')
    dirs, file = dirs[:-1], dirs[-1]
    if dirs and not dirs[0]: dirs = dirs[1:]
    if dirs and not dirs[0]: dirs[0] = '/'
    # One cached ftpwrapper per (user, host, port, path) so that open
    # connections are reused across requests.
    key = user, host, port, '/'.join(dirs)
    if len(self.ftpcache) > MAXFTPCACHE:
        # Prune the cache, rather arbitrarily
        for k in self.ftpcache.keys():
    if not key in self.ftpcache:
        self.ftpcache[key] = \
            ftpwrapper(user, passwd, host, port, dirs)
    if not file: type = 'D'
    # ';type=a' / ';type=i' / ';type=d' URL attributes select the
    # transfer mode (RFC 1738 FTP URL syntax).
    attr, value = splitvalue(attr)
    if attr.lower() == 'type' and \
       value in ('a', 'A', 'i', 'I', 'd', 'D'):
    (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
    mtype = mimetypes.guess_type("ftp:" + url)[0]
    headers += "Content-Type: %s\n" % mtype
    if retrlen is not None and retrlen >= 0:
        headers += "Content-Length: %d\n" % retrlen
    headers = mimetools.Message(StringIO(headers))
    return addinfourl(fp, headers, "ftp:" + url)
    except ftperrors(), msg:
        raise IOError, ('ftp error', msg), sys.exc_info()[2]
def open_data(self, url, data=None):
    """Use "data" URL.

    NOTE(review): elided in this copy -- the try/except around the
    comma split, the scheme-strip of `url`, the `encoding`/`msg`
    initialisations, and the StringIO construction of `f`. Verify
    against the full source before editing.
    """
    if not isinstance(url, str):
        raise IOError, ('data error', 'proxy support for data protocol currently not implemented')
    # syntax of data URLs:
    # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
    # mediatype := [ type "/" subtype ] *( ";" parameter )
    # parameter := attribute "=" value
    from cStringIO import StringIO
    from StringIO import StringIO
    [type, data] = url.split(',', 1)
    raise IOError, ('data error', 'bad data URL')
    # RFC 2397 default mediatype when none is given.
    type = 'text/plain;charset=US-ASCII'
    semi = type.rfind(';')
    if semi >= 0 and '=' not in type[semi:]:
        # Trailing ';base64' (no '=') is the transfer encoding marker.
        encoding = type[semi+1:]
    msg.append('Date: %s'%time.strftime('%a, %d %b %Y %H:%M:%S GMT',
                                        time.gmtime(time.time())))
    msg.append('Content-type: %s' % type)
    if encoding == 'base64':
        data = base64.decodestring(data)
    msg.append('Content-Length: %d' % len(data))
    headers = mimetools.Message(f, 0)
    #f.fileno = None # needed for addinfourl
    return addinfourl(f, headers, url)
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args, **kwargs):
        # NOTE(review): upstream also initialises auth_cache, tries and
        # maxtries here; those lines are not visible in this copy.
        URLopener.__init__(self, *args, **kwargs)
def http_error_default(self, url, fp, errcode, errmsg, headers):
    """Lenient default: return the error response as an ordinary result."""
    response = addinfourl(fp, headers, "http:" + url, errcode)
    return response
def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
    """Error 302 -- relocated (temporarily).

    NOTE(review): the tries-counter increment, the else: before the
    default-handler fallback, the closing `data)` of the
    redirect_internal call and the tries reset/return are not visible
    in this copy.
    """
    # Guard against infinite redirect loops.
    if self.maxtries and self.tries >= self.maxtries:
        if hasattr(self, "http_error_500"):
            meth = self.http_error_500
        meth = self.http_error_default
        return meth(url, fp, 500,
                    "Internal Server Error: Redirect Recursion", headers)
    result = self.redirect_internal(url, fp, errcode, errmsg, headers,
def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
    # Follow a Location:/URI: redirect by re-opening the new URL.
    # NOTE(review): the `else: return` (no redirect target), the
    # void-read/fp.close() cleanup, and the closing argument of the
    # IOError call are not visible in this copy.
    if 'location' in headers:
        newurl = headers['location']
    elif 'uri' in headers:
        newurl = headers['uri']
    # In case the server sent a relative URL, join with original:
    newurl = basejoin(self.type + ":" + url, newurl)
    # For security reasons we do not allow redirects to protocols
    # other than HTTP, HTTPS or FTP.
    newurl_lower = newurl.lower()
    if not (newurl_lower.startswith('http://') or
            newurl_lower.startswith('https://') or
            newurl_lower.startswith('ftp://')):
        raise IOError('redirect error', errcode,
                      errmsg + " - Redirection to url '%s' is not allowed" %
    return self.open(newurl)
def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
    """Error 301 (moved permanently) -- treated exactly like 302."""
    handle_relocation = self.http_error_302
    return handle_relocation(url, fp, errcode, errmsg, headers, data)
def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
    """Error 303 (see other) -- essentially identical to 302."""
    handle_relocation = self.http_error_302
    return handle_relocation(url, fp, errcode, errmsg, headers, data)
def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
    """Error 307 -- relocated, but turn POST into error."""
    # NOTE(review): the `if data is None:`/`else:` selector between the
    # two returns is not visible in this copy.
    return self.http_error_302(url, fp, errcode, errmsg, headers, data)
    return self.http_error_default(url, fp, errcode, errmsg, headers)
def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
    """Error 401 -- authentication required.
    This function supports Basic authentication only.

    NOTE(review): the `if match is None:` guard and the
    `if data is None:`/`else:` dispatch between the two returns are
    not visible in this copy.
    """
    if not 'www-authenticate' in headers:
        URLopener.http_error_default(self, url, fp,
                                     errcode, errmsg, headers)
    stuff = headers['www-authenticate']
    # Parse 'Basic realm="..."' out of the challenge header.
    match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
    URLopener.http_error_default(self, url, fp,
                                 errcode, errmsg, headers)
    scheme, realm = match.groups()
    if scheme.lower() != 'basic':
        URLopener.http_error_default(self, url, fp,
                                     errcode, errmsg, headers)
    # Delegate to retry_http_basic_auth / retry_https_basic_auth.
    name = 'retry_' + self.type + '_basic_auth'
    return getattr(self,name)(url, realm)
    return getattr(self,name)(url, realm, data)
def http_error_407(self, url, fp, errcode, errmsg, headers, data=None):
    """Error 407 -- proxy authentication required.
    This function supports Basic authentication only.

    NOTE(review): the `if match is None:` guard and the
    `if data is None:`/`else:` dispatch between the two returns are
    not visible in this copy.
    """
    if not 'proxy-authenticate' in headers:
        URLopener.http_error_default(self, url, fp,
                                     errcode, errmsg, headers)
    stuff = headers['proxy-authenticate']
    # Parse 'Basic realm="..."' out of the challenge header.
    match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
    URLopener.http_error_default(self, url, fp,
                                 errcode, errmsg, headers)
    scheme, realm = match.groups()
    if scheme.lower() != 'basic':
        URLopener.http_error_default(self, url, fp,
                                     errcode, errmsg, headers)
    # Delegate to retry_proxy_http_basic_auth / retry_proxy_https_basic_auth.
    name = 'retry_proxy_' + self.type + '_basic_auth'
    return getattr(self,name)(url, realm)
    return getattr(self,name)(url, realm, data)
def retry_proxy_http_basic_auth(self, url, realm, data=None):
    # Re-try the request after injecting user:pass credentials into the
    # configured http proxy URL.
    # NOTE(review): the `if data is None:`/`else:` lines between the two
    # returns are not visible in this copy.
    host, selector = splithost(url)
    newurl = 'http://' + host + selector
    proxy = self.proxies['http']
    urltype, proxyhost = splittype(proxy)
    proxyhost, proxyselector = splithost(proxyhost)
    # Strip any credentials already present in the proxy host; a nonzero
    # i also tells get_user_passwd() to clear its cached entry.
    i = proxyhost.find('@') + 1
    proxyhost = proxyhost[i:]
    user, passwd = self.get_user_passwd(proxyhost, realm, i)
    if not (user or passwd): return None
    proxyhost = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + proxyhost
    self.proxies['http'] = 'http://' + proxyhost + proxyselector
    return self.open(newurl)
    return self.open(newurl, data)
def retry_proxy_https_basic_auth(self, url, realm, data=None):
    # HTTPS twin of retry_proxy_http_basic_auth (see above).
    # NOTE(review): the `if data is None:`/`else:` lines between the two
    # returns are not visible in this copy.
    host, selector = splithost(url)
    newurl = 'https://' + host + selector
    proxy = self.proxies['https']
    urltype, proxyhost = splittype(proxy)
    proxyhost, proxyselector = splithost(proxyhost)
    # Strip any credentials already present in the proxy host.
    i = proxyhost.find('@') + 1
    proxyhost = proxyhost[i:]
    user, passwd = self.get_user_passwd(proxyhost, realm, i)
    if not (user or passwd): return None
    proxyhost = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + proxyhost
    self.proxies['https'] = 'https://' + proxyhost + proxyselector
    return self.open(newurl)
    return self.open(newurl, data)
def retry_http_basic_auth(self, url, realm, data=None):
    # Re-try the request with user:pass embedded in the URL authority.
    # NOTE(review): upstream's `host = host[i:]` credential strip and the
    # `if data is None:`/`else:` dispatch are not visible in this copy.
    host, selector = splithost(url)
    i = host.find('@') + 1
    user, passwd = self.get_user_passwd(host, realm, i)
    if not (user or passwd): return None
    host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
    newurl = 'http://' + host + selector
    return self.open(newurl)
    return self.open(newurl, data)
def retry_https_basic_auth(self, url, realm, data=None):
    # HTTPS twin of retry_http_basic_auth (see above).
    # NOTE(review): upstream's `host = host[i:]` credential strip and the
    # `if data is None:`/`else:` dispatch are not visible in this copy.
    host, selector = splithost(url)
    i = host.find('@') + 1
    user, passwd = self.get_user_passwd(host, realm, i)
    if not (user or passwd): return None
    host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
    newurl = 'https://' + host + selector
    return self.open(newurl)
    return self.open(newurl, data)
def get_user_passwd(self, host, realm, clear_cache=0):
    """Return (user, passwd) for realm@host, prompting and caching.

    NOTE(review): the `if clear_cache:`/`else:` keywords around the
    cache delete/return pair and the final `return user, passwd` are
    not visible in this copy.
    """
    key = realm + '@' + host.lower()
    if key in self.auth_cache:
        del self.auth_cache[key]
        return self.auth_cache[key]
    user, passwd = self.prompt_user_passwd(host, realm)
    # Only cache non-empty credentials.
    if user or passwd: self.auth_cache[key] = (user, passwd)
def prompt_user_passwd(self, host, realm):
    """Override this in a GUI environment!"""
    # Console fallback. NOTE(review): the `import getpass`, `try:`, the
    # closing arguments of both prompt calls, and the return statements
    # (including the KeyboardInterrupt path) are not visible here.
    user = raw_input("Enter username for %s at %s: " % (realm,
    passwd = getpass.getpass("Enter password for %s in %s at %s: " %
    except KeyboardInterrupt:
810 """Return the IP address of the magic hostname 'localhost'."""
812 if _localhost is None:
813 _localhost = socket.gethostbyname('localhost')
818 """Return the IP address of the current host."""
820 if _thishost is None:
821 _thishost = socket.gethostbyname(socket.gethostname())
826 """Return the set of errors raised by the FTP class."""
828 if _ftperrors is None:
830 _ftperrors = ftplib.all_errors
835 """Return an empty mimetools.Message object."""
837 if _noheaders is None:
840 from cStringIO import StringIO
842 from StringIO import StringIO
843 _noheaders = mimetools.Message(StringIO(), 0)
844 _noheaders.fp.close() # Recycle file descriptor
851 """Class used by open_ftp() for cache of open FTP connections."""
# NOTE(review): the signature's closing `persistent=True):` line and most
# attribute assignments (user/passwd/host/port/dirs/refcount and the
# initial self.init() call) are not visible in this copy of __init__.
def __init__(self, user, passwd, host, port, dirs,
             timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
    self.timeout = timeout
    # When false, the connection is torn down once the refcount drops.
    self.keepalive = persistent
869 self.ftp = ftplib.FTP()
870 self.ftp.connect(self.host, self.port, self.timeout)
871 self.ftp.login(self.user, self.passwd)
872 for dir in self.dirs:
def retrfile(self, file, type):
    """Fetch *file* (or a directory listing) over the cached connection.

    NOTE(review): elided in this copy -- the endtransfer/reconnect
    logic, the RETR/NLST/LIST command construction, the try: lines the
    visible excepts belong to, and the refcount increment. Verify
    against the full source before editing.
    """
    # 'd'/'D' requests a directory listing (ASCII); otherwise set the
    # requested FTP transfer type ('A' or 'I').
    if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
    else: cmd = 'TYPE ' + type; isdir = 0
    self.ftp.voidcmd(cmd)
    except ftplib.all_errors:
        self.ftp.voidcmd(cmd)
    if file and not isdir:
        # Try to retrieve as a file
        conn, retrlen = self.ftp.ntransfercmd(cmd)
        except ftplib.error_perm, reason:
            # 550 means "not a plain file" -- fall through to a listing.
            if str(reason)[:3] != '550':
                raise IOError, ('ftp error', reason), sys.exc_info()[2]
    # Set transfer mode to ASCII!
    self.ftp.voidcmd('TYPE A')
    # Try a directory listing. Verify that directory exists.
    except ftplib.error_perm, reason:
        raise IOError, ('ftp error', reason), sys.exc_info()[2]
    conn, retrlen = self.ftp.ntransfercmd(cmd)
    # Closing the returned file triggers file_close() via addclosehook.
    ftpobj = addclosehook(conn.makefile('rb'), self.file_close)
    # Pass back both a suitably decorated object and a retrieval length
    return (ftpobj, retrlen)
918 def endtransfer(self):
928 self.keepalive = False
929 if self.refcount <= 0:
932 def file_close(self):
935 if self.refcount <= 0 and not self.keepalive:
938 def real_close(self):
946 """Base class for addinfo and addclosehook."""
def __init__(self, fp):
    # NOTE(review): upstream stores `self.fp = fp` first and has an
    # `else:` before the lambda fallback; neither line is visible in
    # this copy.
    # Delegate the file-object protocol directly to the wrapped fp.
    self.read = self.fp.read
    self.readline = self.fp.readline
    if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines
    if hasattr(self.fp, "fileno"):
        self.fileno = self.fp.fileno
    self.fileno = lambda: None
    if hasattr(self.fp, "__iter__"):
        self.__iter__ = self.fp.__iter__
        # Python 2 iterator protocol uses next(), not __next__().
        if hasattr(self.fp, "next"):
            self.next = self.fp.next
963 return '<%s at %r whose fp = %r>' % (self.__class__.__name__,
969 self.readlines = None
971 if self.fp: self.fp.close()
class addclosehook(addbase):
    """Class to add a close hook to an open file."""

    def __init__(self, fp, closehook, *hookargs):
        addbase.__init__(self, fp)
        # closehook(*hookargs) runs once when the file is closed.
        self.closehook = closehook
        self.hookargs = hookargs

    # NOTE(review): the close() method's def line and its surrounding
    # statements (the call to addbase.close(), the hookargs reset) are
    # not visible in this copy; only its hook invocation remains.
    self.closehook(*self.hookargs)
    self.closehook = None
class addinfo(addbase):
    """class to add an info() method to an open file."""

    def __init__(self, fp, headers):
        # NOTE(review): the info() accessor returning self.headers is not
        # visible in this copy.
        addbase.__init__(self, fp)
        # Header object (a mimetools.Message elsewhere in this file).
        self.headers = headers
class addinfourl(addbase):
    """class to add info() and geturl() methods to an open file."""

    def __init__(self, fp, headers, url, code=None):
        # NOTE(review): upstream also stores self.url and self.code here,
        # and defines info()/getcode()/geturl(); those lines are not
        # visible in this copy.
        addbase.__init__(self, fp)
        self.headers = headers
1018 # Utilities to parse URLs (most of these return None for missing parts):
1019 # unwrap('<URL:type://host/path>') --> 'type://host/path'
1020 # splittype('type:opaquestring') --> 'type', 'opaquestring'
1021 # splithost('//host[:port]/path') --> 'host[:port]', '/path'
1022 # splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
1023 # splitpasswd('user:passwd') -> 'user', 'passwd'
1024 # splitport('host:port') --> 'host', 'port'
1025 # splitquery('/path?query') --> '/path', 'query'
1026 # splittag('/path#tag') --> '/path', 'tag'
1027 # splitattr('/path;attr1=value1;attr2=value2;...') ->
1028 # '/path', ['attr1=value1', 'attr2=value2', ...]
1029 # splitvalue('attr=value') --> 'attr', 'value'
1030 # unquote('abc%20def') -> 'abc def'
1031 # quote('abc def') -> 'abc%20def')
1040 return isinstance(x, unicode)
1043 """toBytes(u"URL") --> 'URL'."""
1044 # Most URL schemes require ASCII. If that changes, the conversion
1046 if _is_unicode(url):
1048 url = url.encode("ASCII")
1049 except UnicodeError:
1050 raise UnicodeError("URL " + repr(url) +
1051 " contains non-ASCII characters")
1055 """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
1057 if url[:1] == '<' and url[-1:] == '>':
1058 url = url[1:-1].strip()
1059 if url[:4] == 'URL:': url = url[4:].strip()
1064 """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
1066 if _typeprog is None:
1068 _typeprog = re.compile('^([^/:]+):')
1070 match = _typeprog.match(url)
1072 scheme = match.group(1)
1073 return scheme.lower(), url[len(scheme) + 1:]
1078 """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
1080 if _hostprog is None:
1082 _hostprog = re.compile('^//([^/?]*)(.*)$')
1084 match = _hostprog.match(url)
1086 host_port = match.group(1)
1087 path = match.group(2)
1088 if path and not path.startswith('/'):
1090 return host_port, path
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
    # Lazily-compiled module-level regex. NOTE(review): the
    # `global _userprog`, the lazy `import re` and the fallthrough
    # `return None, host` are not visible in this copy.
    if _userprog is None:
        _userprog = re.compile('^(.*)@(.*)$')
    match = _userprog.match(host)
    if match: return match.group(1, 2)
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'."""
    # Lazily-compiled module-level regex (re.S so the password may span
    # newlines). NOTE(review): the `global _passwdprog`, the lazy
    # `import re` and the `return user, None` fallthrough are not
    # visible in this copy.
    if _passwdprog is None:
        _passwdprog = re.compile('^([^:]*):(.*)$',re.S)
    match = _passwdprog.match(user)
    if match: return match.group(1, 2)
1117 # splittag('/path#tag') --> '/path', 'tag'
def splitport(host):
    """splitport('host:port') --> 'host', 'port'."""
    # Only matches an all-digit port. NOTE(review): the
    # `global _portprog`, the lazy `import re` and the
    # `return host, None` fallthrough are not visible in this copy.
    if _portprog is None:
        _portprog = re.compile('^(.*):([0-9]+)$')
    match = _portprog.match(host)
    if match: return match.group(1, 2)
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number."""
    # NOTE(review): the `global _nportprog`, the lazy `import re`, the
    # try/int(port)/except ValueError conversion and the matched-case
    # return are not visible in this copy; only the no-match fallthrough
    # `return host, defport` remains.
    if _nportprog is None:
        _nportprog = re.compile('^(.*):(.*)$')
    match = _nportprog.match(host)
    if match:
        host, port = match.group(1, 2)
        if not port: raise ValueError, "no digits"
    return host, defport
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'."""
    # Lazily-compiled module-level regex. NOTE(review): the
    # `global _queryprog`, the lazy `import re` and the
    # `return url, None` fallthrough are not visible in this copy.
    if _queryprog is None:
        _queryprog = re.compile('^(.*)\?([^?]*)$')
    match = _queryprog.match(url)
    if match: return match.group(1, 2)
1166 """splittag('/path#tag') --> '/path', 'tag'."""
1168 if _tagprog is None:
1170 _tagprog = re.compile('^(.*)#([^#]*)$')
1172 match = _tagprog.match(url)
1173 if match: return match.group(1, 2)
1177 """splitattr('/path;attr1=value1;attr2=value2;...') ->
1178 '/path', ['attr1=value1', 'attr2=value2', ...]."""
1179 words = url.split(';')
1180 return words[0], words[1:]
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'."""
    # Lazily-compiled module-level regex. NOTE(review): the
    # `global _valueprog`, the lazy `import re` and the
    # `return attr, None` fallthrough are not visible in this copy.
    if _valueprog is None:
        _valueprog = re.compile('^([^=]*)=(.*)$')
    match = _valueprog.match(attr)
    if match: return match.group(1, 2)
1194 # urlparse contains a duplicate of this method to avoid a circular import. If
1195 # you update this method, also update the copy in urlparse. This code
1196 # duplication does not exist in Python3.
1198 _hexdig = '0123456789ABCDEFabcdef'
1199 _hextochr = dict((a + b, chr(int(a + b, 16)))
1200 for a in _hexdig for b in _hexdig)
1203 """unquote('abc%20def') -> 'abc def'."""
1209 for item in res[1:]:
1211 s += _hextochr[item[:2]] + item[2:]
1214 except UnicodeDecodeError:
1215 s += unichr(int(item[:2], 16)) + item[2:]
def unquote_plus(s):
    """unquote('%7e/abc+def') -> '~/abc def'"""
    # '+' encodes a space in query strings. NOTE(review): the final
    # `return unquote(s)` is not visible in this copy.
    s = s.replace('+', ' ')
1223 always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
1224 'abcdefghijklmnopqrstuvwxyz'
1227 for i, c in zip(xrange(256), str(bytearray(xrange(256)))):
1228 _safe_map[c] = c if (i < 128 and c in always_safe) else '%{:02X}'.format(i)
def quote(s, safe='/'):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL. Thus, it will not encode '/'. This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.
    """
    # NOTE(review): the `if s is None:` guard before the TypeError, the
    # try/except KeyError around the _safe_quoters lookup, and the
    # `return s` fast path after the rstrip test are not visible here.
    raise TypeError('None object cannot be quoted')
    cachekey = (safe, always_safe)
    (quoter, safe) = _safe_quoters[cachekey]
    # Cache miss: build a per-safe-set quoting table once and memoize it.
    safe_map = _safe_map.copy()
    safe_map.update([(c, c) for c in safe])
    quoter = safe_map.__getitem__
    safe = always_safe + safe
    _safe_quoters[cachekey] = (quoter, safe)
    # Fast path: nothing outside the safe set needs quoting.
    if not s.rstrip(safe):
    return ''.join(map(quoter, s))
def quote_plus(s, safe=''):
    """Quote the query fragment of a URL; replacing ' ' with '+'"""
    if ' ' in s:
        # temporarily mark space as safe so quote() leaves it alone,
        # then turn the spaces into '+'
        s = quote(s, safe + ' ')
        return s.replace(' ', '+')
    return quote(s, safe)
1277 def urlencode(query, doseq=0):
1278 """Encode a sequence of two-element tuples or dictionary into a URL query string.
1280 If any values in the query arg are sequences and doseq is true, each
1281 sequence element is converted to a separate parameter.
1283 If the query arg is a sequence of two-element tuples, the order of the
1284 parameters in the output will match the order of parameters in the
1288 if hasattr(query,"items"):
1290 query = query.items()
1292 # it's a bother at times that strings and string-like objects are
1295 # non-sequence items should not work with len()
1296 # non-empty strings will fail this
1297 if len(query) and not isinstance(query[0], tuple):
1299 # zero-length sequences of all types will get here and succeed,
1300 # but that's a minor nit - since the original implementation
1301 # allowed empty dicts that type of behavior probably should be
1302 # preserved for consistency
1304 ty,va,tb = sys.exc_info()
1305 raise TypeError, "not a valid non-string sequence or mapping object", tb
1309 # preserve old behavior
1311 k = quote_plus(str(k))
1312 v = quote_plus(str(v))
1313 l.append(k + '=' + v)
1316 k = quote_plus(str(k))
1317 if isinstance(v, str):
1319 l.append(k + '=' + v)
1320 elif _is_unicode(v):
1321 # is there a reasonable way to convert to ASCII?
1322 # encode generates a string, but "replace" or "ignore"
1323 # lose information and "strict" can raise UnicodeError
1324 v = quote_plus(v.encode("ASCII","replace"))
1325 l.append(k + '=' + v)
1328 # is this a sufficient test for sequence-ness?
1332 v = quote_plus(str(v))
1333 l.append(k + '=' + v)
1335 # loop over the sequence
1337 l.append(k + '=' + quote_plus(str(elt)))
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    """
    proxies = {}
    for name, value in os.environ.items():
        # normalize so HTTP_PROXY and http_proxy are both recognized
        name = name.lower()
        if value and name[-6:] == '_proxy':
            proxies[name[:-6]] = value
    return proxies
def proxy_bypass_environment(host):
    """Test if proxies should not be used for a particular host.

    Checks the environment for a variable named no_proxy, which should
    be a list of DNS suffixes separated by commas, or '*' for all hosts.
    """
    no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '')
    # '*' is special case for always bypass
    if no_proxy == '*':
        return 1
    # strip port off host
    hostonly, port = splitport(host)
    # check if the host ends with any of the DNS suffixes
    no_proxy_list = [proxy.strip() for proxy in no_proxy.split(',')]
    for name in no_proxy_list:
        if name and (hostonly.endswith(name) or host.endswith(name)):
            return 1
    # otherwise, don't bypass
    return 0
if sys.platform == 'darwin':
    from _scproxy import _get_proxy_settings, _get_proxies

    def proxy_bypass_macosx_sysconf(host):
        """
        Return True iff this host shouldn't be accessed using a proxy

        This function uses the MacOSX framework SystemConfiguration
        to fetch the proxy information.
        """
        import re
        import socket
        from fnmatch import fnmatch

        hostonly, port = splitport(host)

        def ip2num(ipAddr):
            # Pack a dotted-quad (possibly short, e.g. '169.254') into a
            # 32-bit integer, zero-filling missing octets.
            parts = ipAddr.split('.')
            parts = map(int, parts)
            if len(parts) != 4:
                parts = (parts + [0, 0, 0, 0])[:4]
            return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]

        proxy_settings = _get_proxy_settings()

        # Check for simple host names:
        if '.' not in host:
            if proxy_settings['exclude_simple']:
                return True

        hostIP = None

        for value in proxy_settings.get('exceptions', ()):
            # Items in the list are strings like these: *.local, 169.254/16
            if not value: continue

            m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
            if m is not None:
                # Numeric exception: compare network prefixes.
                if hostIP is None:
                    try:
                        hostIP = socket.gethostbyname(hostonly)
                        hostIP = ip2num(hostIP)
                    except socket.error:
                        continue

                base = ip2num(m.group(1))
                mask = m.group(2)
                if mask is None:
                    # no explicit prefix length: infer from octet count
                    mask = 8 * (m.group(1).count('.') + 1)
                else:
                    mask = int(mask[1:])
                mask = 32 - mask

                if (hostIP >> mask) == (base >> mask):
                    return True

            elif fnmatch(host, value):
                return True

        return False

    def getproxies_macosx_sysconf():
        """Return a dictionary of scheme -> proxy server URL mappings.

        This function uses the MacOSX framework SystemConfiguration
        to fetch the proxy information.
        """
        return _get_proxies()

    def proxy_bypass(host):
        """Return True if host should be accessed without a proxy."""
        # Environment settings, when present, take precedence over the
        # system configuration.
        if getproxies_environment():
            return proxy_bypass_environment(host)
        else:
            return proxy_bypass_macosx_sysconf(host)

    def getproxies():
        # Environment proxies win over system-configured ones.
        return getproxies_environment() or getproxies_macosx_sysconf()
1457 elif os.name == 'nt':
1458 def getproxies_registry():
1459 """Return a dictionary of scheme -> proxy server URL mappings.
1461 Win32 uses the registry to store proxies.
1468 # Std module, so should be around - but you never know!
1471 internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
1472 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
1473 proxyEnable = _winreg.QueryValueEx(internetSettings,
1476 # Returned as Unicode but problems if not converted to ASCII
1477 proxyServer = str(_winreg.QueryValueEx(internetSettings,
1479 if '=' in proxyServer:
1480 # Per-protocol settings
1481 for p in proxyServer.split(';'):
1482 protocol, address = p.split('=', 1)
1483 # See if address has a type:// prefix
1485 if not re.match('^([^/:]+)://', address):
1486 address = '%s://%s' % (protocol, address)
1487 proxies[protocol] = address
1489 # Use one setting for all protocols
1490 if proxyServer[:5] == 'http:':
1491 proxies['http'] = proxyServer
1493 proxies['http'] = 'http://%s' % proxyServer
1494 proxies['https'] = 'https://%s' % proxyServer
1495 proxies['ftp'] = 'ftp://%s' % proxyServer
1496 internetSettings.Close()
1497 except (WindowsError, ValueError, TypeError):
1498 # Either registry key not found etc, or the value in an
1499 # unexpected format.
1500 # proxies already set up to be empty so nothing to do
1505 """Return a dictionary of scheme -> proxy server URL mappings.
1507 Returns settings gathered from the environment, if specified,
1511 return getproxies_environment() or getproxies_registry()
1513 def proxy_bypass_registry(host):
1518 # Std modules, so should be around - but you never know!
1521 internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
1522 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
1523 proxyEnable = _winreg.QueryValueEx(internetSettings,
1525 proxyOverride = str(_winreg.QueryValueEx(internetSettings,
1526 'ProxyOverride')[0])
1527 # ^^^^ Returned as Unicode but problems if not converted to ASCII
1528 except WindowsError:
1530 if not proxyEnable or not proxyOverride:
1532 # try to make a host list from name and IP address.
1533 rawHost, port = splitport(host)
1536 addr = socket.gethostbyname(rawHost)
1539 except socket.error:
1542 fqdn = socket.getfqdn(rawHost)
1545 except socket.error:
1547 # make a check value list from the registry entry: replace the
1548 # '<local>' string by the localhost entry and the corresponding
1550 proxyOverride = proxyOverride.split(';')
1551 # now check if we match one of the registry values.
1552 for test in proxyOverride:
1553 if test == '<local>':
1554 if '.' not in rawHost:
1556 test = test.replace(".", r"\.") # mask dots
1557 test = test.replace("*", r".*") # change glob sequence
1558 test = test.replace("?", r".") # change glob char
1560 # print "%s <--> %s" %( test, val )
1561 if re.match(test, val, re.I):
1565 def proxy_bypass(host):
1566 """Return a dictionary of scheme -> proxy server URL mappings.
1568 Returns settings gathered from the environment, if specified,
1572 if getproxies_environment():
1573 return proxy_bypass_environment(host)
1575 return proxy_bypass_registry(host)
1578 # By default use environment variables
1579 getproxies = getproxies_environment
1580 proxy_bypass = proxy_bypass_environment
1582 # Test and time quote() and unquote()
1585 for i in range(256): s = s + chr(i)
1596 print round(t1 - t0, 3), 'sec'
1599 def reporthook(blocknum, blocksize, totalsize):
1600 # Report during remote transfers
1601 print "Block number: %d, Block size: %d, Total size: %d" % (
1602 blocknum, blocksize, totalsize)