Lib/urllib2.py

   1 """An extensible library for opening URLs using a variety of protocols
   2
   3 The simplest way to use this module is to call the urlopen function,
   4 which accepts a string containing a URL or a Request object (described
   5 below).  It opens the URL and returns the results as a file-like
   6 object; the returned object has some extra methods described below.
   7
   8 The OpenerDirector manages a collection of Handler objects that do
   9 all the actual work.  Each Handler implements a particular protocol or
  10 option.  The OpenerDirector is a composite object that invokes the
  11 Handlers needed to open the requested URL.  For example, the
  12 HTTPHandler performs HTTP GET and POST requests and deals with
  13 non-error returns.  The HTTPRedirectHandler automatically deals with
  14 HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
  15 deals with digest authentication.
  16
  17 urlopen(url, data=None) -- Basic usage is the same as original
  18 urllib.  Pass the url and optionally data to post to an HTTP URL, and
  19 get a file-like object back.  One difference is that you can also pass
  20 a Request instance instead of URL.  Raises a URLError (subclass of
  21 IOError); for HTTP errors, raises an HTTPError, which can also be
  22 treated as a valid response.
  23
  24 build_opener -- Function that creates a new OpenerDirector instance.
  25 Will install the default handlers.  Accepts one or more Handlers as
  26 arguments, either instances or Handler classes that it will
  27 instantiate.  If one of the arguments is a subclass of the default
  28 handler, the argument will be installed instead of the default.
  29
  30 install_opener -- Installs a new opener as the default opener.
  31
  32 objects of interest:
  33
  34 OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages
  35 the Handler classes, while dealing with requests and responses.
  36
  37 Request -- An object that encapsulates the state of a request.  The
  38 state can be as simple as the URL.  It can also include extra HTTP
  39 headers, e.g. a User-Agent.
  40
  41 BaseHandler -- Base class for handlers registered with an OpenerDirector.
  42
  43 exceptions:
  44 URLError -- A subclass of IOError, individual protocols have their own
  45 specific subclass.
  46
  47 HTTPError -- Also a valid HTTP response, so you can treat an HTTP error
  48 as an exceptional event or valid response.
  49
  50 internals:
  51 BaseHandler and parent
  52 _call_chain conventions
  53
  54 Example usage:
  55
  56 import urllib2
  57
  58 # set up authentication info
  59 authinfo = urllib2.HTTPBasicAuthHandler()
  60 authinfo.add_password(realm='PDQ Application',
  61                       uri='https://mahler:8092/site-updates.py',
  62                       user='klem',
  63                       passwd='geheim$parole')
  64
  65 proxy_support = urllib2.ProxyHandler({"http" : "http://ahad-haam:3128"})
  66
  67 # build a new opener that adds authentication and caching FTP handlers
  68 opener = urllib2.build_opener(proxy_support, authinfo, urllib2.CacheFTPHandler)
  69
  70 # install it
  71 urllib2.install_opener(opener)
  72
  73 f = urllib2.urlopen('http://www.python.org/')
  74
  75
  76 """
  77
  78 # XXX issues:
  79 # If an authentication error handler tries to perform
  80 # authentication but fails for some reason, how should the error be
  81 # signalled?  The client needs to know the HTTP error code.  But if
  82 # the handler knows that the problem was, e.g., that it didn't know
  83 # the hash algo that was requested in the challenge, it would be good
  84 # to pass that information along to the client, too.
  85 # ftp errors aren't handled cleanly
  86 # check digest against correct (i.e. non-apache) implementation
  87
  88 # Possible extensions:
  89 # complex proxies  XXX not sure what exactly was meant by this
  90 # abstract factory for opener
  91
  92 import base64
  93 import hashlib
  94 import httplib
  95 import mimetools
  96 import os
  97 import posixpath
  98 import random
  99 import re
 100 import socket
 101 import sys
 102 import time
 103 import urlparse
 104 import bisect
 105
 106 try:
 107     from cStringIO import StringIO
 108 except ImportError:
 109     from StringIO import StringIO
 110
 111 from urllib import (unwrap, unquote, splittype, splithost, quote,
 112      addinfourl, splitport, splittag,
 113      splitattr, ftpwrapper, splituser, splitpasswd, splitvalue)
 114
 115 # support for FileHandler, proxies via environment variables
 116 from urllib import localhost, url2pathname, getproxies, proxy_bypass
 117
 118 # used in User-Agent header sent
 119 __version__ = sys.version[:3]
 120
 121 _opener = None
 122 def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
 123     global _opener
 124     if _opener is None:
 125         _opener = build_opener()
 126     return _opener.open(url, data, timeout)
 127
 128 def install_opener(opener):
 129     global _opener
 130     _opener = opener
 131
 132 # do these error classes make sense?
 133 # make sure all of the IOError stuff is overridden.  we just want to be
 134 # subtypes.
 135
 136 class URLError(IOError):
 137     # URLError is a sub-type of IOError, but it doesn't share any of
 138     # the implementation.  need to override __init__ and __str__.
 139     # It sets self.args for compatibility with other EnvironmentError
 140     # subclasses, but args doesn't have the typical format with errno in
 141     # slot 0 and strerror in slot 1.  This may be better than nothing.
 142     def __init__(self, reason):
 143         self.args = reason,
 144         self.reason = reason
 145
 146     def __str__(self):
 147         return '<urlopen error %s>' % self.reason
 148
 149 class HTTPError(URLError, addinfourl):
 150     """Raised when HTTP error occurs, but also acts like non-error return"""
 151     __super_init = addinfourl.__init__
 152
 153     def __init__(self, url, code, msg, hdrs, fp):
 154         self.code = code
 155         self.msg = msg
 156         self.hdrs = hdrs
 157         self.fp = fp
 158         self.filename = url
 159         # The addinfourl classes depend on fp being a valid file
 160         # object.  In some cases, the HTTPError may not have a valid
 161         # file object.  If this happens, the simplest workaround is to
 162         # not initialize the base classes.
 163         if fp is not None:
 164             self.__super_init(fp, hdrs, url, code)
 165
 166     def __str__(self):
 167         return 'HTTP Error %s: %s' % (self.code, self.msg)
 168
 169     # since URLError specifies a .reason attribute, HTTPError should also
 170     #  provide this attribute. See issue13211 fo discussion.
 171     @property
 172     def reason(self):
 173         return self.msg
 174
 175 # copied from cookielib.py
 176 _cut_port_re = re.compile(r":\d+$")
 177 def request_host(request):
 178     """Return request-host, as defined by RFC 2965.
 179
 180     Variation from RFC: returned value is lowercased, for convenient
 181     comparison.
 182
 183     """
 184     url = request.get_full_url()
 185     host = urlparse.urlparse(url)[1]
 186     if host == "":
 187         host = request.get_header("Host", "")
 188
 189     # remove port, if present
 190     host = _cut_port_re.sub("", host, 1)
 191     return host.lower()
 192
 193 class Request:
 194
 195     def __init__(self, url, data=None, headers={},
 196                  origin_req_host=None, unverifiable=False):
 197         # unwrap('<URL:type://host/path>') --> 'type://host/path'
 198         self.__original = unwrap(url)
 199         self.__original, self.__fragment = splittag(self.__original)
 200         self.type = None
 201         # self.__r_type is what's left after doing the splittype
 202         self.host = None
 203         self.port = None
 204         self._tunnel_host = None
 205         self.data = data
 206         self.headers = {}
 207         for key, value in headers.items():
 208             self.add_header(key, value)
 209         self.unredirected_hdrs = {}
 210         if origin_req_host is None:
 211             origin_req_host = request_host(self)
 212         self.origin_req_host = origin_req_host
 213         self.unverifiable = unverifiable
 214
 215     def __getattr__(self, attr):
 216         # XXX this is a fallback mechanism to guard against these
 217         # methods getting called in a non-standard order.  this may be
 218         # too complicated and/or unnecessary.
 219         # XXX should the __r_XXX attributes be public?
 220         if attr[:12] == '_Request__r_':
 221             name = attr[12:]
 222             if hasattr(Request, 'get_' + name):
 223                 getattr(self, 'get_' + name)()
 224                 return getattr(self, attr)
 225         raise AttributeError, attr
 226
 227     def get_method(self):
 228         if self.has_data():
 229             return "POST"
 230         else:
 231             return "GET"
 232
 233     # XXX these helper methods are lame
 234
 235     def add_data(self, data):
 236         self.data = data
 237
 238     def has_data(self):
 239         return self.data is not None
 240
 241     def get_data(self):
 242         return self.data
 243
 244     def get_full_url(self):
 245         if self.__fragment:
 246             return '%s#%s' % (self.__original, self.__fragment)
 247         else:
 248             return self.__original
 249
 250     def get_type(self):
 251         if self.type is None:
 252             self.type, self.__r_type = splittype(self.__original)
 253             if self.type is None:
 254                 raise ValueError, "unknown url type: %s" % self.__original
 255         return self.type
 256
 257     def get_host(self):
 258         if self.host is None:
 259             self.host, self.__r_host = splithost(self.__r_type)
 260             if self.host:
 261                 self.host = unquote(self.host)
 262         return self.host
 263
 264     def get_selector(self):
 265         return self.__r_host
 266
 267     def set_proxy(self, host, type):
 268         if self.type == 'https' and not self._tunnel_host:
 269             self._tunnel_host = self.host
 270         else:
 271             self.type = type
 272             self.__r_host = self.__original
 273
 274         self.host = host
 275
 276     def has_proxy(self):
 277         return self.__r_host == self.__original
 278
 279     def get_origin_req_host(self):
 280         return self.origin_req_host
 281
 282     def is_unverifiable(self):
 283         return self.unverifiable
 284
 285     def add_header(self, key, val):
 286         # useful for something like authentication
 287         self.headers[key.capitalize()] = val
 288
 289     def add_unredirected_header(self, key, val):
 290         # will not be added to a redirected request
 291         self.unredirected_hdrs[key.capitalize()] = val
 292
 293     def has_header(self, header_name):
 294         return (header_name in self.headers or
 295                 header_name in self.unredirected_hdrs)
 296
 297     def get_header(self, header_name, default=None):
 298         return self.headers.get(
 299             header_name,
 300             self.unredirected_hdrs.get(header_name, default))
 301
 302     def header_items(self):
 303         hdrs = self.unredirected_hdrs.copy()
 304         hdrs.update(self.headers)
 305         return hdrs.items()
 306
 307 class OpenerDirector:
 308     def __init__(self):
 309         client_version = "Python-urllib/%s" % __version__
 310         self.addheaders = [('User-agent', client_version)]
 311         # self.handlers is retained only for backward compatibility
 312         self.handlers = []
 313         # manage the individual handlers
 314         self.handle_open = {}
 315         self.handle_error = {}
 316         self.process_response = {}
 317         self.process_request = {}
 318
 319     def add_handler(self, handler):
 320         if not hasattr(handler, "add_parent"):
 321             raise TypeError("expected BaseHandler instance, got %r" %
 322                             type(handler))
 323
 324         added = False
 325         for meth in dir(handler):
 326             if meth in ["redirect_request", "do_open", "proxy_open"]:
 327                 # oops, coincidental match
 328                 continue
 329
 330             i = meth.find("_")
 331             protocol = meth[:i]
 332             condition = meth[i+1:]
 333
 334             if condition.startswith("error"):
 335                 j = condition.find("_") + i + 1
 336                 kind = meth[j+1:]
 337                 try:
 338                     kind = int(kind)
 339                 except ValueError:
 340                     pass
 341                 lookup = self.handle_error.get(protocol, {})
 342                 self.handle_error[protocol] = lookup
 343             elif condition == "open":
 344                 kind = protocol
 345                 lookup = self.handle_open
 346             elif condition == "response":
 347                 kind = protocol
 348                 lookup = self.process_response
 349             elif condition == "request":
 350                 kind = protocol
 351                 lookup = self.process_request
 352             else:
 353                 continue
 354
 355             handlers = lookup.setdefault(kind, [])
 356             if handlers:
 357                 bisect.insort(handlers, handler)
 358             else:
 359                 handlers.append(handler)
 360             added = True
 361
 362         if added:
 363             bisect.insort(self.handlers, handler)
 364             handler.add_parent(self)
 365
 366     def close(self):
 367         # Only exists for backwards compatibility.
 368         pass
 369
 370     def _call_chain(self, chain, kind, meth_name, *args):
 371         # Handlers raise an exception if no one else should try to handle
 372         # the request, or return None if they can't but another handler
 373         # could.  Otherwise, they return the response.
 374         handlers = chain.get(kind, ())
 375         for handler in handlers:
 376             func = getattr(handler, meth_name)
 377
 378             result = func(*args)
 379             if result is not None:
 380                 return result
 381
 382     def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
 383         # accept a URL or a Request object
 384         if isinstance(fullurl, basestring):
 385             req = Request(fullurl, data)
 386         else:
 387             req = fullurl
 388             if data is not None:
 389                 req.add_data(data)
 390
 391         req.timeout = timeout
 392         protocol = req.get_type()
 393
 394         # pre-process request
 395         meth_name = protocol+"_request"
 396         for processor in self.process_request.get(protocol, []):
 397             meth = getattr(processor, meth_name)
 398             req = meth(req)
 399
 400         response = self._open(req, data)
 401
 402         # post-process response
 403         meth_name = protocol+"_response"
 404         for processor in self.process_response.get(protocol, []):
 405             meth = getattr(processor, meth_name)
 406             response = meth(req, response)
 407
 408         return response
 409
 410     def _open(self, req, data=None):
 411         result = self._call_chain(self.handle_open, 'default',
 412                                   'default_open', req)
 413         if result:
 414             return result
 415
 416         protocol = req.get_type()
 417         result = self._call_chain(self.handle_open, protocol, protocol +
 418                                   '_open', req)
 419         if result:
 420             return result
 421
 422         return self._call_chain(self.handle_open, 'unknown',
 423                                 'unknown_open', req)
 424
 425     def error(self, proto, *args):
 426         if proto in ('http', 'https'):
 427             # XXX http[s] protocols are special-cased
 428             dict = self.handle_error['http'] # https is not different than http
 429             proto = args[2]  # YUCK!
 430             meth_name = 'http_error_%s' % proto
 431             http_err = 1
 432             orig_args = args
 433         else:
 434             dict = self.handle_error
 435             meth_name = proto + '_error'
 436             http_err = 0
 437         args = (dict, proto, meth_name) + args
 438         result = self._call_chain(*args)
 439         if result:
 440             return result
 441
 442         if http_err:
 443             args = (dict, 'default', 'http_error_default') + orig_args
 444             return self._call_chain(*args)
 445
 446 # XXX probably also want an abstract factory that knows when it makes
 447 # sense to skip a superclass in favor of a subclass and when it might
 448 # make sense to include both
 449
 450 def build_opener(*handlers):
 451     """Create an opener object from a list of handlers.
 452
 453     The opener will use several default handlers, including support
 454     for HTTP, FTP and when applicable, HTTPS.
 455
 456     If any of the handlers passed as arguments are subclasses of the
 457     default handlers, the default handlers will not be used.
 458     """
 459     import types
 460     def isclass(obj):
 461         return isinstance(obj, (types.ClassType, type))
 462
 463     opener = OpenerDirector()
 464     default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
 465                        HTTPDefaultErrorHandler, HTTPRedirectHandler,
 466                        FTPHandler, FileHandler, HTTPErrorProcessor]
 467     if hasattr(httplib, 'HTTPS'):
 468         default_classes.append(HTTPSHandler)
 469     skip = set()
 470     for klass in default_classes:
 471         for check in handlers:
 472             if isclass(check):
 473                 if issubclass(check, klass):
 474                     skip.add(klass)
 475             elif isinstance(check, klass):
 476                 skip.add(klass)
 477     for klass in skip:
 478         default_classes.remove(klass)
 479
 480     for klass in default_classes:
 481         opener.add_handler(klass())
 482
 483     for h in handlers:
 484         if isclass(h):
 485             h = h()
 486         opener.add_handler(h)
 487     return opener
 488
 489 class BaseHandler:
 490     handler_order = 500
 491
 492     def add_parent(self, parent):
 493         self.parent = parent
 494
 495     def close(self):
 496         # Only exists for backwards compatibility
 497         pass
 498
 499     def __lt__(self, other):
 500         if not hasattr(other, "handler_order"):
 501             # Try to preserve the old behavior of having custom classes
 502             # inserted after default ones (works only for custom user
 503             # classes which are not aware of handler_order).
 504             return True
 505         return self.handler_order < other.handler_order
 506
 507
 508 class HTTPErrorProcessor(BaseHandler):
 509     """Process HTTP error responses."""
 510     handler_order = 1000  # after all other processing
 511
 512     def http_response(self, request, response):
 513         code, msg, hdrs = response.code, response.msg, response.info()
 514
 515         # According to RFC 2616, "2xx" code indicates that the client's
 516         # request was successfully received, understood, and accepted.
 517         if not (200 <= code < 300):
 518             response = self.parent.error(
 519                 'http', request, response, code, msg, hdrs)
 520
 521         return response
 522
 523     https_response = http_response
 524
 525 class HTTPDefaultErrorHandler(BaseHandler):
 526     def http_error_default(self, req, fp, code, msg, hdrs):
 527         raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
 528
 529 class HTTPRedirectHandler(BaseHandler):
 530     # maximum number of redirections to any single URL
 531     # this is needed because of the state that cookies introduce
 532     max_repeats = 4
 533     # maximum total number of redirections (regardless of URL) before
 534     # assuming we're in a loop
 535     max_redirections = 10
 536
 537     def redirect_request(self, req, fp, code, msg, headers, newurl):
 538         """Return a Request or None in response to a redirect.
 539
 540         This is called by the http_error_30x methods when a
 541         redirection response is received.  If a redirection should
 542         take place, return a new Request to allow http_error_30x to
 543         perform the redirect.  Otherwise, raise HTTPError if no-one
 544         else should try to handle this url.  Return None if you can't
 545         but another Handler might.
 546         """
 547         m = req.get_method()
 548         if (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
 549             or code in (301, 302, 303) and m == "POST"):
 550             # Strictly (according to RFC 2616), 301 or 302 in response
 551             # to a POST MUST NOT cause a redirection without confirmation
 552             # from the user (of urllib2, in this case).  In practice,
 553             # essentially all clients do redirect in this case, so we
 554             # do the same.
 555             # be conciliant with URIs containing a space
 556             newurl = newurl.replace(' ', '%20')
 557             newheaders = dict((k,v) for k,v in req.headers.items()
 558                               if k.lower() not in ("content-length", "content-type")
 559                              )
 560             return Request(newurl,
 561                            headers=newheaders,
 562                            origin_req_host=req.get_origin_req_host(),
 563                            unverifiable=True)
 564         else:
 565             raise HTTPError(req.get_full_url(), code, msg, headers, fp)
 566
 567     # Implementation note: To avoid the server sending us into an
 568     # infinite loop, the request object needs to track what URLs we
 569     # have already seen.  Do this by adding a handler-specific
 570     # attribute to the Request object.
 571     def http_error_302(self, req, fp, code, msg, headers):
 572         # Some servers (incorrectly) return multiple Location headers
 573         # (so probably same goes for URI).  Use first header.
 574         if 'location' in headers:
 575             newurl = headers.getheaders('location')[0]
 576         elif 'uri' in headers:
 577             newurl = headers.getheaders('uri')[0]
 578         else:
 579             return
 580
 581         # fix a possible malformed URL
 582         urlparts = urlparse.urlparse(newurl)
 583         if not urlparts.path:
 584             urlparts = list(urlparts)
 585             urlparts[2] = "/"
 586         newurl = urlparse.urlunparse(urlparts)
 587
 588         newurl = urlparse.urljoin(req.get_full_url(), newurl)
 589
 590         # For security reasons we do not allow redirects to protocols
 591         # other than HTTP, HTTPS or FTP.
 592         newurl_lower = newurl.lower()
 593         if not (newurl_lower.startswith('http://') or
 594                 newurl_lower.startswith('https://') or
 595                 newurl_lower.startswith('ftp://')):
 596             raise HTTPError(newurl, code,
 597                             msg + " - Redirection to url '%s' is not allowed" %
 598                             newurl,
 599                             headers, fp)
 600
 601         # XXX Probably want to forget about the state of the current
 602         # request, although that might interact poorly with other
 603         # handlers that also use handler-specific request attributes
 604         new = self.redirect_request(req, fp, code, msg, headers, newurl)
 605         if new is None:
 606             return
 607
 608         # loop detection
 609         # .redirect_dict has a key url if url was previously visited.
 610         if hasattr(req, 'redirect_dict'):
 611             visited = new.redirect_dict = req.redirect_dict
 612             if (visited.get(newurl, 0) >= self.max_repeats or
 613                 len(visited) >= self.max_redirections):
 614                 raise HTTPError(req.get_full_url(), code,
 615                                 self.inf_msg + msg, headers, fp)
 616         else:
 617             visited = new.redirect_dict = req.redirect_dict = {}
 618         visited[newurl] = visited.get(newurl, 0) + 1
 619
 620         # Don't close the fp until we are sure that we won't use it
 621         # with HTTPError.
 622         fp.read()
 623         fp.close()
 624
 625         return self.parent.open(new, timeout=req.timeout)
 626
 627     http_error_301 = http_error_303 = http_error_307 = http_error_302
 628
 629     inf_msg = "The HTTP server returned a redirect error that would " \
 630               "lead to an infinite loop.\n" \
 631               "The last 30x error message was:\n"
 632
 633
 634 def _parse_proxy(proxy):
 635     """Return (scheme, user, password, host/port) given a URL or an authority.
 636
 637     If a URL is supplied, it must have an authority (host:port) component.
 638     According to RFC 3986, having an authority component means the URL must
 639     have two slashes after the scheme:
 640
 641     >>> _parse_proxy('file:/ftp.example.com/')
 642     Traceback (most recent call last):
 643     ValueError: proxy URL with no authority: 'file:/ftp.example.com/'
 644
 645     The first three items of the returned tuple may be None.
 646
 647     Examples of authority parsing:
 648
 649     >>> _parse_proxy('proxy.example.com')
 650     (None, None, None, 'proxy.example.com')
 651     >>> _parse_proxy('proxy.example.com:3128')
 652     (None, None, None, 'proxy.example.com:3128')
 653
 654     The authority component may optionally include userinfo (assumed to be
 655     username:password):
 656
 657     >>> _parse_proxy('joe:password@proxy.example.com')
 658     (None, 'joe', 'password', 'proxy.example.com')
 659     >>> _parse_proxy('joe:password@proxy.example.com:3128')
 660     (None, 'joe', 'password', 'proxy.example.com:3128')
 661
 662     Same examples, but with URLs instead:
 663
 664     >>> _parse_proxy('http://proxy.example.com/')
 665     ('http', None, None, 'proxy.example.com')
 666     >>> _parse_proxy('http://proxy.example.com:3128/')
 667     ('http', None, None, 'proxy.example.com:3128')
 668     >>> _parse_proxy('http://joe:password@proxy.example.com/')
 669     ('http', 'joe', 'password', 'proxy.example.com')
 670     >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
 671     ('http', 'joe', 'password', 'proxy.example.com:3128')
 672
 673     Everything after the authority is ignored:
 674
 675     >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
 676     ('ftp', 'joe', 'password', 'proxy.example.com')
 677
 678     Test for no trailing '/' case:
 679
 680     >>> _parse_proxy('http://joe:password@proxy.example.com')
 681     ('http', 'joe', 'password', 'proxy.example.com')
 682
 683     """
 684     scheme, r_scheme = splittype(proxy)
 685     if not r_scheme.startswith("/"):
 686         # authority
 687         scheme = None
 688         authority = proxy
 689     else:
 690         # URL
 691         if not r_scheme.startswith("//"):
 692             raise ValueError("proxy URL with no authority: %r" % proxy)
 693         # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
 694         # and 3.3.), path is empty or starts with '/'
 695         end = r_scheme.find("/", 2)
 696         if end == -1:
 697             end = None
 698         authority = r_scheme[2:end]
 699     userinfo, hostport = splituser(authority)
 700     if userinfo is not None:
 701         user, password = splitpasswd(userinfo)
 702     else:
 703         user = password = None
 704     return scheme, user, password, hostport
 705
 706 class ProxyHandler(BaseHandler):
 707     # Proxies must be in front
 708     handler_order = 100
 709
 710     def __init__(self, proxies=None):
 711         if proxies is None:
 712             proxies = getproxies()
 713         assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
 714         self.proxies = proxies
 715         for type, url in proxies.items():
 716             setattr(self, '%s_open' % type,
 717                     lambda r, proxy=url, type=type, meth=self.proxy_open: \
 718                     meth(r, proxy, type))
 719
 720     def proxy_open(self, req, proxy, type):
 721         orig_type = req.get_type()
 722         proxy_type, user, password, hostport = _parse_proxy(proxy)
 723
 724         if proxy_type is None:
 725             proxy_type = orig_type
 726
 727         if req.host and proxy_bypass(req.host):
 728             return None
 729
 730         if user and password:
 731             user_pass = '%s:%s' % (unquote(user), unquote(password))
 732             creds = base64.b64encode(user_pass).strip()
 733             req.add_header('Proxy-authorization', 'Basic ' + creds)
 734         hostport = unquote(hostport)
 735         req.set_proxy(hostport, proxy_type)
 736
 737         if orig_type == proxy_type or orig_type == 'https':
 738             # let other handlers take care of it
 739             return None
 740         else:
 741             # need to start over, because the other handlers don't
 742             # grok the proxy's URL type
 743             # e.g. if we have a constructor arg proxies like so:
 744             # {'http': 'ftp://proxy.example.com'}, we may end up turning
 745             # a request for http://acme.example.com/a into one for
 746             # ftp://proxy.example.com/a
 747             return self.parent.open(req, timeout=req.timeout)
 748
 749 class HTTPPasswordMgr:
 750
 751     def __init__(self):
 752         self.passwd = {}
 753
 754     def add_password(self, realm, uri, user, passwd):
 755         # uri could be a single URI or a sequence
 756         if isinstance(uri, basestring):
 757             uri = [uri]
 758         if not realm in self.passwd:
 759             self.passwd[realm] = {}
 760         for default_port in True, False:
 761             reduced_uri = tuple(
 762                 [self.reduce_uri(u, default_port) for u in uri])
 763             self.passwd[realm][reduced_uri] = (user, passwd)
 764
 765     def find_user_password(self, realm, authuri):
 766         domains = self.passwd.get(realm, {})
 767         for default_port in True, False:
 768             reduced_authuri = self.reduce_uri(authuri, default_port)
 769             for uris, authinfo in domains.iteritems():
 770                 for uri in uris:
 771                     if self.is_suburi(uri, reduced_authuri):
 772                         return authinfo
 773         return None, None
 774
 775     def reduce_uri(self, uri, default_port=True):
 776         """Accept authority or URI and extract only the authority and path."""
 777         # note HTTP URLs do not have a userinfo component
 778         parts = urlparse.urlsplit(uri)
 779         if parts[1]:
 780             # URI
 781             scheme = parts[0]
 782             authority = parts[1]
 783             path = parts[2] or '/'
 784         else:
 785             # host or host:port
 786             scheme = None
 787             authority = uri
 788             path = '/'
 789         host, port = splitport(authority)
 790         if default_port and port is None and scheme is not None:
 791             dport = {"http": 80,
 792                      "https": 443,
 793                      }.get(scheme)
 794             if dport is not None:
 795                 authority = "%s:%d" % (host, dport)
 796         return authority, path
 797
 798     def is_suburi(self, base, test):
 799         """Check if test is below base in a URI tree
 800
 801         Both args must be URIs in reduced form.
 802         """
 803         if base == test:
 804             return True
 805         if base[0] != test[0]:
 806             return False
 807         common = posixpath.commonprefix((base[1], test[1]))
 808         if len(common) == len(base[1]):
 809             return True
 810         return False
 811
 812
 813 class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
 814
 815     def find_user_password(self, realm, authuri):
 816         user, password = HTTPPasswordMgr.find_user_password(self, realm,
 817                                                             authuri)
 818         if user is not None:
 819             return user, password
 820         return HTTPPasswordMgr.find_user_password(self, None, authuri)
 821
 822
 823 class AbstractBasicAuthHandler:
 824
 825     # XXX this allows for multiple auth-schemes, but will stupidly pick
 826     # the last one with a realm specified.
 827
 828     # allow for double- and single-quoted realm values
 829     # (single quotes are a violation of the RFC, but appear in the wild)
 830     rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+'
 831                     'realm=(["\'])(.*?)\\2', re.I)
 832
 833     # XXX could pre-emptively send auth info already accepted (RFC 2617,
 834     # end of section 2, and section 1.2 immediately after "credentials"
 835     # production).
 836
 837     def __init__(self, password_mgr=None):
 838         if password_mgr is None:
 839             password_mgr = HTTPPasswordMgr()
 840         self.passwd = password_mgr
 841         self.add_password = self.passwd.add_password
 842         self.retried = 0
 843
 844     def reset_retry_count(self):
 845         self.retried = 0
 846
 847     def http_error_auth_reqed(self, authreq, host, req, headers):
 848         # host may be an authority (without userinfo) or a URL with an
 849         # authority
 850         # XXX could be multiple headers
 851         authreq = headers.get(authreq, None)
 852
 853         if self.retried > 5:
 854             # retry sending the username:password 5 times before failing.
 855             raise HTTPError(req.get_full_url(), 401, "basic auth failed",
 856                             headers, None)
 857         else:
 858             self.retried += 1
 859
 860         if authreq:
 861             mo = AbstractBasicAuthHandler.rx.search(authreq)
 862             if mo:
 863                 scheme, quote, realm = mo.groups()
 864                 if scheme.lower() == 'basic':
 865                     response = self.retry_http_basic_auth(host, req, realm)
 866                     if response and response.code != 401:
 867                         self.retried = 0
 868                     return response
 869
 870     def retry_http_basic_auth(self, host, req, realm):
 871         user, pw = self.passwd.find_user_password(realm, host)
 872         if pw is not None:
 873             raw = "%s:%s" % (user, pw)
 874             auth = 'Basic %s' % base64.b64encode(raw).strip()
 875             if req.headers.get(self.auth_header, None) == auth:
 876                 return None
 877             req.add_unredirected_header(self.auth_header, auth)
 878             return self.parent.open(req, timeout=req.timeout)
 879         else:
 880             return None
 881
 882
 883 class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
 884
 885     auth_header = 'Authorization'
 886
 887     def http_error_401(self, req, fp, code, msg, headers):
 888         url = req.get_full_url()
 889         response = self.http_error_auth_reqed('www-authenticate',
 890                                               url, req, headers)
 891         self.reset_retry_count()
 892         return response
 893
 894
 895 class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
 896
 897     auth_header = 'Proxy-authorization'
 898
 899     def http_error_407(self, req, fp, code, msg, headers):
 900         # http_error_auth_reqed requires that there is no userinfo component in
 901         # authority.  Assume there isn't one, since urllib2 does not (and
 902         # should not, RFC 3986 s. 3.2.1) support requests for URLs containing
 903         # userinfo.
 904         authority = req.get_host()
 905         response = self.http_error_auth_reqed('proxy-authenticate',
 906                                           authority, req, headers)
 907         self.reset_retry_count()
 908         return response
 909
 910
 911 def randombytes(n):
 912     """Return n random bytes."""
 913     # Use /dev/urandom if it is available.  Fall back to random module
 914     # if not.  It might be worthwhile to extend this function to use
 915     # other platform-specific mechanisms for getting random bytes.
 916     if os.path.exists("/dev/urandom"):
 917         f = open("/dev/urandom")
 918         s = f.read(n)
 919         f.close()
 920         return s
 921     else:
 922         L = [chr(random.randrange(0, 256)) for i in range(n)]
 923         return "".join(L)
 924
 925 class AbstractDigestAuthHandler:
 926     # Digest authentication is specified in RFC 2617.
 927
 928     # XXX The client does not inspect the Authentication-Info header
 929     # in a successful response.
 930
 931     # XXX It should be possible to test this implementation against
 932     # a mock server that just generates a static set of challenges.
 933
 934     # XXX qop="auth-int" supports is shaky
 935
 936     def __init__(self, passwd=None):
 937         if passwd is None:
 938             passwd = HTTPPasswordMgr()
 939         self.passwd = passwd
 940         self.add_password = self.passwd.add_password
 941         self.retried = 0
 942         self.nonce_count = 0
 943         self.last_nonce = None
 944
 945     def reset_retry_count(self):
 946         self.retried = 0
 947
 948     def http_error_auth_reqed(self, auth_header, host, req, headers):
 949         authreq = headers.get(auth_header, None)
 950         if self.retried > 5:
 951             # Don't fail endlessly - if we failed once, we'll probably
 952             # fail a second time. Hm. Unless the Password Manager is
 953             # prompting for the information. Crap. This isn't great
 954             # but it's better than the current 'repeat until recursion
 955             # depth exceeded' approach <wink>
 956             raise HTTPError(req.get_full_url(), 401, "digest auth failed",
 957                             headers, None)
 958         else:
 959             self.retried += 1
 960         if authreq:
 961             scheme = authreq.split()[0]
 962             if scheme.lower() == 'digest':
 963                 return self.retry_http_digest_auth(req, authreq)
 964
 965     def retry_http_digest_auth(self, req, auth):
 966         token, challenge = auth.split(' ', 1)
 967         chal = parse_keqv_list(parse_http_list(challenge))
 968         auth = self.get_authorization(req, chal)
 969         if auth:
 970             auth_val = 'Digest %s' % auth
 971             if req.headers.get(self.auth_header, None) == auth_val:
 972                 return None
 973             req.add_unredirected_header(self.auth_header, auth_val)
 974             resp = self.parent.open(req, timeout=req.timeout)
 975             return resp
 976
 977     def get_cnonce(self, nonce):
 978         # The cnonce-value is an opaque
 979         # quoted string value provided by the client and used by both client
 980         # and server to avoid chosen plaintext attacks, to provide mutual
 981         # authentication, and to provide some message integrity protection.
 982         # This isn't a fabulous effort, but it's probably Good Enough.
 983         dig = hashlib.sha1("%s:%s:%s:%s" % (self.nonce_count, nonce, time.ctime(),
 984                                             randombytes(8))).hexdigest()
 985         return dig[:16]
 986
 987     def get_authorization(self, req, chal):
 988         try:
 989             realm = chal['realm']
 990             nonce = chal['nonce']
 991             qop = chal.get('qop')
 992             algorithm = chal.get('algorithm', 'MD5')
 993             # mod_digest doesn't send an opaque, even though it isn't
 994             # supposed to be optional
 995             opaque = chal.get('opaque', None)
 996         except KeyError:
 997             return None
 998
 999         H, KD = self.get_algorithm_impls(algorithm)
1000         if H is None:
1001             return None
1002
1003         user, pw = self.passwd.find_user_password(realm, req.get_full_url())
1004         if user is None:
1005             return None
1006
1007         # XXX not implemented yet
1008         if req.has_data():
1009             entdig = self.get_entity_digest(req.get_data(), chal)
1010         else:
1011             entdig = None
1012
1013         A1 = "%s:%s:%s" % (user, realm, pw)
1014         A2 = "%s:%s" % (req.get_method(),
1015                         # XXX selector: what about proxies and full urls
1016                         req.get_selector())
1017         if qop == 'auth':
1018             if nonce == self.last_nonce:
1019                 self.nonce_count += 1
1020             else:
1021                 self.nonce_count = 1
1022                 self.last_nonce = nonce
1023
1024             ncvalue = '%08x' % self.nonce_count
1025             cnonce = self.get_cnonce(nonce)
1026             noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
1027             respdig = KD(H(A1), noncebit)
1028         elif qop is None:
1029             respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
1030         else:
1031             # XXX handle auth-int.
1032             raise URLError("qop '%s' is not supported." % qop)
1033
1034         # XXX should the partial digests be encoded too?
1035
1036         base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
1037                'response="%s"' % (user, realm, nonce, req.get_selector(),
1038                                   respdig)
1039         if opaque:
1040             base += ', opaque="%s"' % opaque
1041         if entdig:
1042             base += ', digest="%s"' % entdig
1043         base += ', algorithm="%s"' % algorithm
1044         if qop:
1045             base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
1046         return base
1047
1048     def get_algorithm_impls(self, algorithm):
1049         # algorithm should be case-insensitive according to RFC2617
1050         algorithm = algorithm.upper()
1051         # lambdas assume digest modules are imported at the top level
1052         if algorithm == 'MD5':
1053             H = lambda x: hashlib.md5(x).hexdigest()
1054         elif algorithm == 'SHA':
1055             H = lambda x: hashlib.sha1(x).hexdigest()
1056         # XXX MD5-sess
1057         KD = lambda s, d: H("%s:%s" % (s, d))
1058         return H, KD
1059
1060     def get_entity_digest(self, data, chal):
1061         # XXX not implemented yet
1062         return None
1063
1064
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        """Handle a 401 by retrying req with a Digest authorization.

        Returns the result of http_error_auth_reqed() for the
        'www-authenticate' header and resets the retry counter afterwards.
        """
        # Only the network location part of the URL is passed on as host.
        netloc = urlparse.urlparse(req.get_full_url())[1]
        outcome = self.http_error_auth_reqed('www-authenticate',
                                             netloc, req, headers)
        self.reset_retry_count()
        return outcome
1081
1082
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Answer 407 responses using Digest authentication."""

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        """Handle a 407 by retrying req with a Digest authorization.

        Returns the result of http_error_auth_reqed() for the
        'proxy-authenticate' header and resets the retry counter afterwards.
        """
        outcome = self.http_error_auth_reqed('proxy-authenticate',
                                             req.get_host(), req, headers)
        self.reset_retry_count()
        return outcome
1094
class AbstractHTTPHandler(BaseHandler):
    """Common request preparation and transport code for HTTP(S) handlers."""

    def __init__(self, debuglevel=0):
        """Remember the debug level passed to every connection opened."""
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        """Change the debug level used for subsequently opened connections."""
        self._debuglevel = level

    def do_request_(self, request):
        """Fill in the headers a request needs before it is sent.

        For requests carrying data, default Content-type and
        Content-length headers are added.  A Host header is set (taken
        from the selector when the request goes through a proxy), followed
        by any of the opener's default headers not already present.

        Returns the request.  Raises URLError if it names no host.
        """
        host = request.get_host()
        if not host:
            raise URLError('no host given')

        if request.has_data():  # POST
            data = request.get_data()
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if not request.has_header('Content-length'):
                request.add_unredirected_header(
                    'Content-length', '%d' % len(data))

        sel_host = host
        if request.has_proxy():
            # Via a proxy the selector is a full URL; the Host header must
            # name the host found in it, not the proxy.
            scheme, sel = splittype(request.get_selector())
            sel_host, sel_path = splithost(sel)

        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req):
        """Return an addinfourl object for the request, using http_class.

        http_class must implement the HTTPConnection API from httplib.
        The addinfourl return value is a file-like object.  It also
        has methods and attributes including:
            - info(): return a mimetools.Message object for the headers
            - geturl(): return the original request URL
            - code: HTTP status code

        Raises URLError if the request names no host or sending it fails
        with a socket error.  The connection is closed before any
        exception leaves this method.
        """
        host = req.get_host()
        if not host:
            raise URLError('no host given')

        h = http_class(host, timeout=req.timeout) # will parse host:port
        h.set_debuglevel(self._debuglevel)

        # Unredirected headers take precedence over regular ones.
        headers = dict(req.unredirected_hdrs)
        headers.update(dict((k, v) for k, v in req.headers.items()
                            if k not in headers))

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        headers = dict(
            (name.title(), val) for name, val in headers.items())

        if req._tunnel_host:
            tunnel_headers = {}
            proxy_auth_hdr = "Proxy-Authorization"
            if proxy_auth_hdr in headers:
                tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
                # Proxy-Authorization should not be sent to origin
                # server.
                del headers[proxy_auth_hdr]
            h.set_tunnel(req._tunnel_host, headers=tunnel_headers)

        try:
            h.request(req.get_method(), req.get_selector(), req.data, headers)
        except socket.error as err: # XXX what error?
            h.close()
            raise URLError(err)

        try:
            try:
                r = h.getresponse(buffering=True)
            except TypeError: # buffering kw not supported
                r = h.getresponse()
        except:
            # Reading the response failed: release the connection instead
            # of leaking it, then let the original exception propagate.
            h.close()
            raise

        # Pick apart the HTTPResponse object to get the addinfourl
        # object initialized properly.

        # Wrap the HTTPResponse object in socket's file object adapter
        # for Windows.  That adapter calls recv(), so delegate recv()
        # to read().  This weird wrapping allows the returned object to
        # have readline() and readlines() methods.

        # XXX It might be better to extract the read buffering code
        # out of socket._fileobject() and into a base class.

        r.recv = r.read
        fp = socket._fileobject(r, close=True)

        resp = addinfourl(fp, r.msg, req.get_full_url())
        resp.code = r.status
        resp.msg = r.reason
        return resp
1202
1203
class HTTPHandler(AbstractHTTPHandler):
    """Open http: URLs over a plain httplib connection."""

    # Requests are prepared by the shared pre-processing step.
    http_request = AbstractHTTPHandler.do_request_

    def http_open(self, req):
        """Send req using httplib.HTTPConnection and return the response."""
        connection_class = httplib.HTTPConnection
        return self.do_open(connection_class, req)
1210
# HTTPS support is optional: define the handler only when httplib has it.
if hasattr(httplib, 'HTTPS'):
    class HTTPSHandler(AbstractHTTPHandler):
        """Open https: URLs over an httplib HTTPS connection."""

        # Requests are prepared by the shared pre-processing step.
        https_request = AbstractHTTPHandler.do_request_

        def https_open(self, req):
            """Send req using httplib.HTTPSConnection and return the response."""
            connection_class = httplib.HTTPSConnection
            return self.do_open(connection_class, req)
1218
class HTTPCookieProcessor(BaseHandler):
    """Attach cookies to outgoing requests and collect them from responses."""

    def __init__(self, cookiejar=None):
        """Use cookiejar for storage; a new cookielib.CookieJar by default."""
        import cookielib
        self.cookiejar = (cookielib.CookieJar() if cookiejar is None
                          else cookiejar)

    def http_request(self, request):
        """Add the jar's Cookie header to request and return request."""
        jar = self.cookiejar
        jar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        """Store cookies found in response in the jar; return response."""
        jar = self.cookiejar
        jar.extract_cookies(response, request)
        return response

    # HTTPS traffic is processed exactly like HTTP traffic.
    https_request = http_request
    https_response = http_response
1236
class UnknownHandler(BaseHandler):
    """Handler whose unknown_open() rejects the request."""

    def unknown_open(self, req):
        """Always raise URLError naming the request's URL type."""
        raise URLError('unknown url type: %s' % req.get_type())
1241
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated.

    l is an iterable of "key=value" strings.  Returns a dict mapping each
    key to its value; a value enclosed in double quotes has the quotes
    removed.  An empty value ("key=") is stored as ''.

    Raises ValueError if an element contains no '='.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Slices rather than indexes, so an empty value does not raise
        # IndexError.
        if v[:1] == '"' and v[-1:] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
1251
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.  A non-quoted string could have quotes in the
    middle.  Neither commas nor quotes count if they are escaped.
    Only double-quotes count, not single-quotes.

    Returns the list of elements with surrounding whitespace stripped.
    """
    items = []
    buf = []            # characters of the element being collected
    in_quotes = False
    escaped = False
    for ch in s:
        if escaped:
            # Character following a backslash inside quotes: keep verbatim.
            buf.append(ch)
            escaped = False
        elif in_quotes:
            if ch == '\\':
                # The backslash itself is dropped.
                escaped = True
            else:
                if ch == '"':
                    in_quotes = False
                buf.append(ch)
        elif ch == ',':
            # Unquoted comma: the current element is complete.
            items.append(''.join(buf))
            buf = []
        else:
            if ch == '"':
                in_quotes = True
            buf.append(ch)

    # append last part
    if buf:
        items.append(''.join(buf))

    return [item.strip() for item in items]
1294
def _safe_gethostbyname(host):
    """Return the result of socket.gethostbyname(host), or None.

    None is returned when the lookup fails with socket.gaierror.
    """
    try:
        address = socket.gethostbyname(host)
    except socket.gaierror:
        address = None
    return address
1300
class FileHandler(BaseHandler):
    """Open file: URLs, handing URLs that name another host over to FTP."""

    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        """Open req locally, or re-dispatch it as an ftp request.

        A selector of the form //host/... whose host is set and is not
        'localhost' is retyped to 'ftp' and passed back to the opener.
        """
        selector = req.get_selector()
        names_other_host = (selector[:2] == '//' and selector[2:3] != '/'
                            and req.host and req.host != 'localhost')
        if not names_other_host:
            return self.open_local_file(req)
        req.type = 'ftp'
        return self.parent.open(req)

    # names for the localhost
    names = None
    def get_names(self):
        """Return the tuple of addresses regarded as the local host.

        The result is computed once and cached on the FileHandler class.
        """
        if FileHandler.names is not None:
            return FileHandler.names
        try:
            resolved = tuple(
                socket.gethostbyname_ex('localhost')[2] +
                socket.gethostbyname_ex(socket.gethostname())[2])
        except socket.gaierror:
            resolved = (socket.gethostbyname('localhost'),)
        FileHandler.names = resolved
        return resolved

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Return an addinfourl object for the local file named by req.

        The response carries Content-type, Content-length and
        Last-modified headers derived from the file.  Raises URLError if
        the file cannot be stat'ed or the URL's host is not this machine.
        """
        import email.utils
        import mimetypes
        host = req.get_host()
        filename = req.get_selector()
        localfile = url2pathname(filename)
        try:
            info = os.stat(localfile)
            last_modified = email.utils.formatdate(info.st_mtime, usegmt=True)
            content_type = mimetypes.guess_type(filename)[0] or 'text/plain'
            header_text = (
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (content_type, info.st_size, last_modified))
            headers = mimetools.Message(StringIO(header_text))
            port = None
            if host:
                host, port = splitport(host)
            # Local means: no host at all, or a port-less host that
            # resolves to one of this machine's addresses.
            is_local = (not host or
                        (not port and
                         _safe_gethostbyname(host) in self.get_names()))
            if is_local:
                if host:
                    origurl = 'file://' + host + filename
                else:
                    origurl = 'file://' + filename
                return addinfourl(open(localfile, 'rb'), headers, origurl)
        except OSError as msg:
            # urllib2 users shouldn't expect OSErrors coming from urlopen()
            raise URLError(msg)
        raise URLError('file not on local host')
1352
1353 class FTPHandler(BaseHandler):
1354     def ftp_open(self, req):
1355         import ftplib
1356         import mimetypes
1357         host = req.get_host()
1358         if not host:
1359             raise URLError('ftp error: no host given')
1360         host, port = splitport(host)
1361         if port is None:
1362             port = ftplib.FTP_PORT
1363         else:
1364             port = int(port)
1365
1366         # username/password handling
1367         user, host = splituser(host)
1368         if user:
1369             user, passwd = splitpasswd(user)
1370         else:
1371             passwd = None
1372         host = unquote(host)
1373         user = user or ''
1374         passwd = passwd or ''
1375
1376         try:
1377             host = socket.gethostbyname(host)
1378         except socket.error, msg:
1379             raise URLError(msg)
1380         path, attrs = splitattr(req.get_selector())
1381         dirs = path.split('/')
1382         dirs = map(unquote, dirs)
1383         dirs, file = dirs[:-1], dirs[-1]
1384         if dirs and not dirs[0]:
1385             dirs = dirs[1:]
1386         try:
1387             fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
1388             type = file and 'I' or 'D'
1389             for attr in attrs:
1390                 attr, value = splitvalue(attr)
1391                 if attr.lower() == 'type' and \
1392                    value in ('a', 'A', 'i', 'I', 'd', 'D'):
1393                     type = value.upper()
1394             fp, retrlen = fw.retrfile(file, type)
1395             headers = ""
1396             mtype = mimetypes.guess_type(req.get_full_url())[0]
1397             if mtype:
1398                 headers += "Content-type: %s\n" % mtype
1399             if retrlen is not None and retrlen >= 0:
1400                 headers += "Content-length: %d\n" % retrlen
1401             sf = StringIO(headers)
1402             headers = mimetools.Message(sf)
1403             return addinfourl(fp, headers, req.get_full_url())
1404         except ftplib.all_errors, msg:
1405             raise URLError, ('ftp error: %s' % msg), sys.exc_info()[2]
1406
1407     def connect_ftp(self, user, passwd, host, port, dirs, timeout):
1408         fw = ftpwrapper(user, passwd, host, port, dirs, timeout,
1409                         persistent=False)
1410 ##        fw.ftp.set_debuglevel(1)
1411         return fw
1412
class CacheFTPHandler(FTPHandler):
    """FTPHandler that keeps connections open and reuses them.

    Cached connections expire ``delay`` seconds after their last use, and
    the oldest one is dropped when the cache reaches ``max_conns`` entries.
    """
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        self.cache = {}         # connection key -> ftpwrapper
        self.timeout = {}       # connection key -> expiry time
        self.soonest = 0        # earliest expiry time among cached entries
        self.delay = 60         # seconds a connection stays cached
        self.max_conns = 16

    def setTimeout(self, t):
        """Set the number of seconds an idle connection stays cached."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the cache size at which the oldest connection is dropped."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return the cached connection for the target, creating it if needed.

        Using a connection renews its expiry time.  The cache is pruned
        before the connection is returned.
        """
        key = user, host, port, '/'.join(dirs), timeout
        if key not in self.cache:
            self.cache[key] = ftpwrapper(user, passwd, host, port, dirs,
                                         timeout)
        self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def _next_expiry(self):
        # Earliest expiry time still cached, or 0 (the initial value of
        # self.soonest) when the cache is empty.
        if self.timeout:
            return min(self.timeout.values())
        return 0

    def _evict(self, key):
        # Close the connection stored under key and forget it.
        self.cache[key].close()
        del self.cache[key]
        del self.timeout[key]

    def check_cache(self):
        """Drop expired connections and enforce the size limit.

        Every connection removed from the cache is closed.  Calling this
        on an empty cache is harmless.
        """
        # first check for old ones
        now = time.time()
        if self.soonest <= now:
            # Iterate over a copy: entries are deleted inside the loop.
            for key, expiry in list(self.timeout.items()):
                if expiry < now:
                    self._evict(key)
        self.soonest = self._next_expiry()

        # then check the size
        if len(self.cache) == self.max_conns:
            for key, expiry in list(self.timeout.items()):
                if expiry == self.soonest:
                    self._evict(key)
                    break
            self.soonest = self._next_expiry()

    def clear_cache(self):
        """Close and forget every cached connection."""
        for conn in self.cache.values():
            conn.close()
        self.cache.clear()
        self.timeout.clear()