1 # -*- test-case-name: twisted.web.test.test_webclient -*-
2 # Copyright (c) Twisted Matrix Laboratories.
3 # See LICENSE for details.
10 from urlparse import urlunparse
11 from urllib import splithost, splittype
14 from zope.interface import implements
16 from twisted.python import log
17 from twisted.python.failure import Failure
18 from twisted.web import http
19 from twisted.internet import defer, protocol, task, reactor
20 from twisted.internet.interfaces import IProtocol
21 from twisted.internet.endpoints import TCP4ClientEndpoint, SSL4ClientEndpoint
22 from twisted.python import failure
23 from twisted.python.util import InsensitiveDict
24 from twisted.python.components import proxyForInterface
25 from twisted.web import error
26 from twisted.web.iweb import UNKNOWN_LENGTH, IBodyProducer, IResponse
27 from twisted.web.http_headers import Headers
28 from twisted.python.compat import set
class PartialDownloadError(error.Error):
    """
    Page was only partially downloaded, we got disconnected in middle.

    @ivar response: All of the response body which was downloaded.
    """
class HTTPPageGetter(http.HTTPClient):
    """
    Gets a resource via HTTP, then quits.

    Typically used with L{HTTPClientFactory}.  Note that this class does not,
    by itself, do anything with the response.  If you want to download a
    resource into a file, use L{HTTPPageDownloader} instead.

    @ivar _completelyDone: A boolean indicating whether any further requests
        are necessary after this one completes in order to provide a result to
        C{self.factory.deferred}.  If it is C{False}, then a redirect is going
        to be followed.  Otherwise, this protocol's connection is the last one
        before firing the result Deferred.  This is used to make sure the
        result Deferred is only fired after the connection is cleaned up.
    """
    quietLoss = 0
    followRedirect = True
    failed = 0

    _completelyDone = True

    # Headers we compute ourselves in connectionMade and therefore must not
    # be blindly copied from self.factory.headers.
    _specialHeaders = set(('host', 'user-agent', 'cookie', 'content-length'))

    def connectionMade(self):
        method = getattr(self.factory, 'method', 'GET')
        self.sendCommand(method, self.factory.path)
        if self.factory.scheme == 'http' and self.factory.port != 80:
            host = '%s:%s' % (self.factory.host, self.factory.port)
        elif self.factory.scheme == 'https' and self.factory.port != 443:
            host = '%s:%s' % (self.factory.host, self.factory.port)
        else:
            host = self.factory.host
        self.sendHeader('Host', self.factory.headers.get("host", host))
        self.sendHeader('User-Agent', self.factory.agent)
        data = getattr(self.factory, 'postdata', None)
        if data is not None:
            self.sendHeader("Content-Length", str(len(data)))

        cookieData = []
        for (key, value) in self.factory.headers.items():
            if key.lower() not in self._specialHeaders:
                # we calculated it on our own
                self.sendHeader(key, value)
            if key.lower() == 'cookie':
                cookieData.append(value)
        for cookie, cookval in self.factory.cookies.items():
            cookieData.append('%s=%s' % (cookie, cookval))
        if cookieData:
            self.sendHeader('Cookie', '; '.join(cookieData))
        self.endHeaders()
        self.headers = {}

        if data is not None:
            self.transport.write(data)

    def handleHeader(self, key, value):
        """
        Called every time a header is received.  Stores the header information
        as key-value pairs in the C{headers} attribute.

        @type key: C{str}
        @param key: An HTTP header field name.

        @type value: C{str}
        @param value: An HTTP header field value.
        """
        key = key.lower()
        l = self.headers.setdefault(key, [])
        l.append(value)

    def handleStatus(self, version, status, message):
        self.version, self.status, self.message = version, status, message
        self.factory.gotStatus(version, status, message)

    def handleEndHeaders(self):
        self.factory.gotHeaders(self.headers)
        m = getattr(self, 'handleStatus_' + self.status,
                    self.handleStatusDefault)
        m()

    def handleStatus_200(self):
        # A successful response requires no special handling.
        pass

    handleStatus_201 = lambda self: self.handleStatus_200()
    handleStatus_202 = lambda self: self.handleStatus_200()

    def handleStatusDefault(self):
        self.failed = 1

    def handleStatus_301(self):
        l = self.headers.get('location')
        if not l:
            self.handleStatusDefault()
            return
        url = l[0]
        if self.followRedirect:
            scheme, host, port, path = \
                _parse(url, defaultPort=self.transport.getPeer().port)

            self.factory._redirectCount += 1
            if self.factory._redirectCount >= self.factory.redirectLimit:
                err = error.InfiniteRedirection(
                    self.status,
                    'Infinite redirection detected',
                    location=url)
                self.factory.noPage(failure.Failure(err))
                self.quietLoss = True
                self.transport.loseConnection()
                return

            self._completelyDone = False
            self.factory.setURL(url)

            if self.factory.scheme == 'https':
                from twisted.internet import ssl
                contextFactory = ssl.ClientContextFactory()
                reactor.connectSSL(self.factory.host, self.factory.port,
                                   self.factory, contextFactory)
            else:
                reactor.connectTCP(self.factory.host, self.factory.port,
                                   self.factory)
        else:
            self.handleStatusDefault()
            self.factory.noPage(
                failure.Failure(
                    error.PageRedirect(
                        self.status, self.message, location=url)))
        self.quietLoss = True
        self.transport.loseConnection()

    def handleStatus_302(self):
        if self.afterFoundGet:
            self.handleStatus_303()
        else:
            self.handleStatus_301()

    def handleStatus_303(self):
        self.factory.method = 'GET'
        self.handleStatus_301()

    def connectionLost(self, reason):
        """
        When the connection used to issue the HTTP request is closed, notify
        the factory if we have not already, so it can produce a result.
        """
        if not self.quietLoss:
            http.HTTPClient.connectionLost(self, reason)
            self.factory.noPage(reason)
        if self._completelyDone:
            # Only if we think we're completely done do we tell the factory
            # that we're "disconnected".  This way when we're following
            # redirects, only the last protocol used will fire the
            # _disconnectedDeferred.
            self.factory._disconnectedDeferred.callback(None)

    def handleResponse(self, response):
        if self.quietLoss:
            return
        if self.failed:
            self.factory.noPage(
                failure.Failure(
                    error.Error(
                        self.status, self.message, response)))
        if self.factory.method == 'HEAD':
            # Callback with empty string, since there is never a response
            # body for HEAD requests.
            self.factory.page('')
        elif self.length != None and self.length != 0:
            self.factory.noPage(failure.Failure(
                PartialDownloadError(self.status, self.message, response)))
        else:
            self.factory.page(response)
        # server might be stupid and not close connection. admittedly
        # the fact we do only one request per connection is also
        # stupid...
        self.transport.loseConnection()

    def timeout(self):
        self.quietLoss = True
        self.transport.loseConnection()
        self.factory.noPage(defer.TimeoutError(
            "Getting %s took longer than %s seconds." % (
                self.factory.url, self.factory.timeout)))
class HTTPPageDownloader(HTTPPageGetter):
    """
    A L{HTTPPageGetter} variant which hands the response body to its factory
    incrementally (for download-to-file) instead of buffering it.
    """
    transmittingPage = 0

    def handleStatus_200(self, partialContent=0):
        HTTPPageGetter.handleStatus_200(self)
        self.transmittingPage = 1
        self.factory.pageStart(partialContent)

    def handleStatus_206(self):
        self.handleStatus_200(partialContent=1)

    def handleResponsePart(self, data):
        if self.transmittingPage:
            self.factory.pagePart(data)

    def handleResponseEnd(self):
        if self.length:
            # We were disconnected with bytes still outstanding.
            self.transmittingPage = 0
            self.factory.noPage(
                failure.Failure(
                    PartialDownloadError(self.status)))
        if self.transmittingPage:
            self.factory.pageEnd()
            self.transmittingPage = 0
        if self.failed:
            self.factory.noPage(
                failure.Failure(
                    error.Error(
                        self.status, self.message, None)))
            self.transport.loseConnection()
class HTTPClientFactory(protocol.ClientFactory):
    """Download a given URL.

    @type deferred: Deferred
    @ivar deferred: A Deferred that will fire when the content has
          been retrieved. Once this is fired, the ivars `status', `version',
          and `message' will be set.

    @type status: str
    @ivar status: The status of the response.

    @type version: str
    @ivar version: The version of the response.

    @type message: str
    @ivar message: The text message returned with the status.

    @type response_headers: dict
    @ivar response_headers: The headers that were specified in the
          response from the server.

    @type method: str
    @ivar method: The HTTP method to use in the request.  This should be one
          of OPTIONS, GET, HEAD, POST, PUT, DELETE, TRACE, or CONNECT (case
          matters).  Other values may be specified if the server being
          contacted supports them.

    @type redirectLimit: int
    @ivar redirectLimit: The maximum number of HTTP redirects that can occur
          before it is assumed that the redirection is endless.

    @type afterFoundGet: C{bool}
    @ivar afterFoundGet: Deviate from the HTTP 1.1 RFC by handling redirects
        the same way as most web browsers; if the request method is POST and a
        302 status is encountered, the redirect is followed with a GET method.

    @type _redirectCount: int
    @ivar _redirectCount: The current number of HTTP redirects encountered.

    @ivar _disconnectedDeferred: A L{Deferred} which only fires after the last
        connection associated with the request (redirects may cause multiple
        connections to be required) has closed.  The result Deferred will only
        fire after this Deferred, so that callers can be assured that there
        are no more event sources in the reactor once they get the result.
    """

    protocol = HTTPPageGetter

    url = None
    scheme = None
    host = ''
    port = None
    path = None

    def __init__(self, url, method='GET', postdata=None, headers=None,
                 agent="Twisted PageGetter", timeout=0, cookies=None,
                 followRedirect=True, redirectLimit=20,
                 afterFoundGet=False):
        self.followRedirect = followRedirect
        self.redirectLimit = redirectLimit
        self._redirectCount = 0
        self.timeout = timeout
        self.agent = agent
        self.afterFoundGet = afterFoundGet
        if cookies is None:
            cookies = {}
        self.cookies = cookies
        if headers is not None:
            self.headers = InsensitiveDict(headers)
        else:
            self.headers = InsensitiveDict()
        if postdata is not None:
            self.headers.setdefault('Content-Length', len(postdata))
            # just in case a broken http/1.1 decides to keep connection alive
            self.headers.setdefault("connection", "close")
        self.postdata = postdata
        self.method = method

        self.setURL(url)

        self.waiting = 1
        self._disconnectedDeferred = defer.Deferred()
        self.deferred = defer.Deferred()
        # Make sure the first callback on the result Deferred pauses the
        # callback chain until the request connection is closed.
        self.deferred.addBoth(self._waitForDisconnect)
        self.response_headers = None

    def _waitForDisconnect(self, passthrough):
        """
        Chain onto the _disconnectedDeferred, preserving C{passthrough}, so
        that the result is only available after the associated connection has
        been closed.
        """
        self._disconnectedDeferred.addCallback(lambda ignored: passthrough)
        return self._disconnectedDeferred

    def __repr__(self):
        return "<%s: %s>" % (self.__class__.__name__, self.url)

    def setURL(self, url):
        self.url = url
        scheme, host, port, path = _parse(url)
        if scheme and host:
            self.scheme = scheme
            self.host = host
            self.port = port
        self.path = path

    def buildProtocol(self, addr):
        p = protocol.ClientFactory.buildProtocol(self, addr)
        p.followRedirect = self.followRedirect
        p.afterFoundGet = self.afterFoundGet
        if self.timeout:
            timeoutCall = reactor.callLater(self.timeout, p.timeout)
            self.deferred.addBoth(self._cancelTimeout, timeoutCall)
        return p

    def _cancelTimeout(self, result, timeoutCall):
        if timeoutCall.active():
            timeoutCall.cancel()
        return result

    def gotHeaders(self, headers):
        self.response_headers = headers
        if headers.has_key('set-cookie'):
            for cookie in headers['set-cookie']:
                cookparts = cookie.split(';')
                # Only the first part is the name=value pair; the rest are
                # attributes such as path or expiry.
                cook = cookparts[0].lstrip()
                k, v = cook.split('=', 1)
                self.cookies[k.lstrip()] = v.lstrip()

    def gotStatus(self, version, status, message):
        self.version, self.status, self.message = version, status, message

    def page(self, page):
        if self.waiting:
            self.waiting = 0
            self.deferred.callback(page)

    def noPage(self, reason):
        if self.waiting:
            self.waiting = 0
            self.deferred.errback(reason)

    def clientConnectionFailed(self, _, reason):
        """
        When a connection attempt fails, the request cannot be issued.  If no
        result has yet been provided to the result Deferred, provide the
        connection failure reason as an error result.
        """
        if self.waiting:
            self.waiting = 0
            # If the connection attempt failed, there is nothing more to
            # disconnect, so just fire that Deferred now.
            self._disconnectedDeferred.callback(None)
            self.deferred.errback(reason)
class HTTPDownloader(HTTPClientFactory):
    """Download to a file."""

    protocol = HTTPPageDownloader
    value = None

    def __init__(self, url, fileOrName,
                 method='GET', postdata=None, headers=None,
                 agent="Twisted client", supportPartial=0,
                 timeout=0, cookies=None, followRedirect=1,
                 redirectLimit=20, afterFoundGet=False):
        self.requestedPartial = 0
        if isinstance(fileOrName, types.StringTypes):
            self.fileName = fileOrName
            self.file = None
            if supportPartial and os.path.exists(self.fileName):
                fileLength = os.path.getsize(self.fileName)
                if fileLength:
                    self.requestedPartial = fileLength
                    if headers == None:
                        headers = {}
                    headers["range"] = "bytes=%d-" % fileLength
        else:
            self.file = fileOrName
        HTTPClientFactory.__init__(
            self, url, method=method, postdata=postdata, headers=headers,
            agent=agent, timeout=timeout, cookies=cookies,
            followRedirect=followRedirect, redirectLimit=redirectLimit,
            afterFoundGet=afterFoundGet)

    def gotHeaders(self, headers):
        HTTPClientFactory.gotHeaders(self, headers)
        if self.requestedPartial:
            contentRange = headers.get("content-range", None)
            if not contentRange:
                # server doesn't support partial requests, oh well
                self.requestedPartial = 0
                return
            start, end, realLength = http.parseContentRange(contentRange[0])
            if start != self.requestedPartial:
                # server is acting weirdly
                self.requestedPartial = 0

    def openFile(self, partialContent):
        if partialContent:
            file = open(self.fileName, 'rb+')
            # Append to the existing partial download.
            file.seek(0, 2)
        else:
            file = open(self.fileName, 'wb')
        return file

    def pageStart(self, partialContent):
        """Called on page download start.

        @param partialContent: tells us if the download is partial download we requested.
        """
        if partialContent and not self.requestedPartial:
            raise ValueError(
                "we shouldn't get partial content response if we didn't want it!")
        if self.waiting:
            try:
                if not self.file:
                    self.file = self.openFile(partialContent)
            except IOError:
                #raise
                self.deferred.errback(failure.Failure())

    def pagePart(self, data):
        if not self.file:
            return
        try:
            self.file.write(data)
        except IOError:
            #raise
            self.file = None
            self.deferred.errback(failure.Failure())

    def noPage(self, reason):
        """
        Close the storage file and errback the waiting L{Deferred} with the
        given reason.
        """
        if self.waiting:
            self.waiting = 0
            if self.file:
                try:
                    self.file.close()
                except:
                    log.err(None, "Error closing HTTPDownloader file")
            self.deferred.errback(reason)

    def pageEnd(self):
        self.waiting = 0
        if not self.file:
            return
        try:
            self.file.close()
        except IOError:
            self.deferred.errback(failure.Failure())
            return
        self.deferred.callback(self.value)
531 At some point this should be replaced with a better URL implementation.
533 def __new__(self, scheme, host, port, path):
534 return tuple.__new__(_URL, (scheme, host, port, path))
537 def __init__(self, scheme, host, port, path):
def _parse(url, defaultPort=None):
    """
    Split the given URL into the scheme, host, port, and path.

    @type url: C{str}
    @param url: An URL to parse.

    @type defaultPort: C{int} or C{None}
    @param defaultPort: An alternate value to use as the port if the URL does
        not include one.

    @return: A four-tuple of the scheme, host, port, and path of the URL.  All
        of these are C{str} instances except for port, which is an C{int}.
    """
    url = url.strip()
    parsed = http.urlparse(url)
    scheme = parsed[0]
    path = urlunparse(('', '') + parsed[2:])

    # Ensure that path is a string.
    path = str(path)

    if defaultPort is None:
        if scheme == 'https':
            defaultPort = 443
        else:
            defaultPort = 80

    host, port = parsed[1], defaultPort
    if ':' in host:
        host, port = host.split(':')
        try:
            port = int(port)
        except ValueError:
            port = defaultPort

    if path == '':
        path = '/'

    return _URL(scheme, host, port, path)
def _makeGetterFactory(url, factoryFactory, contextFactory=None,
                       *args, **kwargs):
    """
    Create and connect an HTTP page getting factory.

    Any additional positional or keyword arguments are used when calling
    C{factoryFactory}.

    @param factoryFactory: Factory factory that is called with C{url}, C{args}
        and C{kwargs} to produce the getter

    @param contextFactory: Context factory to use when creating a secure
        connection, defaulting to C{None}

    @return: The factory created by C{factoryFactory}
    """
    scheme, host, port, path = _parse(url)
    factory = factoryFactory(url, *args, **kwargs)
    if scheme == 'https':
        from twisted.internet import ssl
        if contextFactory is None:
            contextFactory = ssl.ClientContextFactory()
        reactor.connectSSL(host, port, factory, contextFactory)
    else:
        reactor.connectTCP(host, port, factory)
    return factory
def getPage(url, contextFactory=None, *args, **kwargs):
    """
    Download a web page as a string.

    Download a page.  Return a deferred, which will callback with a
    page (as a string) or errback with a description of the error.

    See L{HTTPClientFactory} to see what extra arguments can be passed.
    """
    return _makeGetterFactory(
        url,
        HTTPClientFactory,
        contextFactory=contextFactory,
        *args, **kwargs).deferred
def downloadPage(url, file, contextFactory=None, *args, **kwargs):
    """
    Download a web page to a file.

    @param file: path to file on filesystem, or file-like object.

    See HTTPDownloader to see what extra args can be passed.
    """
    factoryFactory = lambda url, *a, **kw: HTTPDownloader(url, file, *a, **kw)
    return _makeGetterFactory(
        url,
        factoryFactory,
        contextFactory=contextFactory,
        *args, **kwargs).deferred
643 # The code which follows is based on the new HTTP client implementation. It
644 # should be significantly better than anything above, though it is not yet
645 # feature equivalent.
647 from twisted.web.error import SchemeNotSupported
648 from twisted.web._newclient import Request, Response, HTTP11ClientProtocol
649 from twisted.web._newclient import ResponseDone, ResponseFailed
650 from twisted.web._newclient import RequestNotSent, RequestTransmissionFailed
651 from twisted.web._newclient import ResponseNeverReceived
try:
    from twisted.internet.ssl import ClientContextFactory
except ImportError:
    class WebClientContextFactory(object):
        """
        A web context factory which doesn't work because the necessary SSL
        support is missing.
        """
        def getContext(self, hostname, port):
            raise NotImplementedError("SSL support unavailable")
else:
    class WebClientContextFactory(ClientContextFactory):
        """
        A web context factory which ignores the hostname and port and does no
        certificate verification.
        """
        def getContext(self, hostname, port):
            return ClientContextFactory.getContext(self)
674 class _WebToNormalContextFactory(object):
676 Adapt a web context factory to a normal context factory.
678 @ivar _webContext: A web context factory which accepts a hostname and port
679 number to its C{getContext} method.
681 @ivar _hostname: The hostname which will be passed to
682 C{_webContext.getContext}.
684 @ivar _port: The port number which will be passed to
685 C{_webContext.getContext}.
687 def __init__(self, webContext, hostname, port):
688 self._webContext = webContext
689 self._hostname = hostname
693 def getContext(self):
695 Called the wrapped web context factory's C{getContext} method with a
696 hostname and port number and return the resulting context object.
698 return self._webContext.getContext(self._hostname, self._port)
class FileBodyProducer(object):
    """
    L{FileBodyProducer} produces bytes from an input file object incrementally
    and writes them to a consumer.

    Since file-like objects cannot be read from in an event-driven manner,
    L{FileBodyProducer} uses a L{Cooperator} instance to schedule reads from
    the file.  This process is also paused and resumed based on notifications
    from the L{IConsumer} provider being written to.

    The file is closed after it has been read, or if the producer is stopped
    early.

    @ivar _inputFile: Any file-like object, bytes read from which will be
        written to a consumer.

    @ivar _cooperate: A method like L{Cooperator.cooperate} which is used to
        schedule all reads.

    @ivar _readSize: The number of bytes to read from C{_inputFile} at a time.
    """
    implements(IBodyProducer)

    # Python 2.4 doesn't have these symbolic constants
    _SEEK_SET = getattr(os, 'SEEK_SET', 0)
    _SEEK_END = getattr(os, 'SEEK_END', 2)

    def __init__(self, inputFile, cooperator=task, readSize=2 ** 16):
        self._inputFile = inputFile
        self._cooperate = cooperator.cooperate
        self._readSize = readSize
        self.length = self._determineLength(inputFile)

    def _determineLength(self, fObj):
        """
        Determine how many bytes can be read out of C{fObj} (assuming it is
        not modified from this point on).  If the determination cannot be
        made, return C{UNKNOWN_LENGTH}.
        """
        try:
            seek = fObj.seek
            tell = fObj.tell
        except AttributeError:
            return UNKNOWN_LENGTH
        originalPosition = tell()
        seek(0, self._SEEK_END)
        end = tell()
        seek(originalPosition, self._SEEK_SET)
        return end - originalPosition

    def stopProducing(self):
        """
        Permanently stop writing bytes from the file to the consumer by
        stopping the underlying L{CooperativeTask}.
        """
        self._inputFile.close()
        self._task.stop()

    def startProducing(self, consumer):
        """
        Start a cooperative task which will read bytes from the input file and
        write them to C{consumer}.  Return a L{Deferred} which fires after all
        bytes have been written.

        @param consumer: Any L{IConsumer} provider
        """
        self._task = self._cooperate(self._writeloop(consumer))
        d = self._task.whenDone()
        def maybeStopped(reason):
            # IBodyProducer.startProducing's Deferred isn't supposed to fire
            # if stopProducing is called.
            reason.trap(task.TaskStopped)
            return defer.Deferred()
        d.addCallbacks(lambda ignored: None, maybeStopped)
        return d

    def _writeloop(self, consumer):
        """
        Return an iterator which reads one chunk of bytes from the input file
        and writes them to the consumer for each time it is iterated.
        """
        while True:
            bytes = self._inputFile.read(self._readSize)
            if not bytes:
                self._inputFile.close()
                break
            consumer.write(bytes)
            yield None

    def pauseProducing(self):
        """
        Temporarily suspend copying bytes from the input file to the consumer
        by pausing the L{CooperativeTask} which drives that activity.
        """
        self._task.pause()

    def resumeProducing(self):
        """
        Undo the effects of a previous C{pauseProducing} and resume copying
        bytes to the consumer by resuming the L{CooperativeTask} which drives
        that activity.
        """
        self._task.resume()
class _HTTP11ClientFactory(protocol.Factory):
    """
    A factory for L{HTTP11ClientProtocol}, used by L{HTTPConnectionPool}.

    @ivar _quiescentCallback: The quiescent callback to be passed to protocol
        instances, used to return them to the connection pool.
    """
    def __init__(self, quiescentCallback):
        self._quiescentCallback = quiescentCallback

    def buildProtocol(self, addr):
        return HTTP11ClientProtocol(self._quiescentCallback)
832 class _RetryingHTTP11ClientProtocol(object):
834 A wrapper for L{HTTP11ClientProtocol} that automatically retries requests.
836 @ivar _clientProtocol: The underlying L{HTTP11ClientProtocol}.
838 @ivar _newConnection: A callable that creates a new connection for a
842 def __init__(self, clientProtocol, newConnection):
843 self._clientProtocol = clientProtocol
844 self._newConnection = newConnection
847 def _shouldRetry(self, method, exception, bodyProducer):
849 Indicate whether request should be retried.
851 Only returns C{True} if method is idempotent, no response was
852 received, and no body was sent. The latter requirement may be relaxed
853 in the future, and PUT added to approved method list.
855 if method not in ("GET", "HEAD", "OPTIONS", "DELETE", "TRACE"):
857 if not isinstance(exception, (RequestNotSent, RequestTransmissionFailed,
858 ResponseNeverReceived)):
860 if bodyProducer is not None:
865 def request(self, request):
867 Do a request, and retry once (with a new connection) it it fails in
870 @param request: A L{Request} instance that will be requested using the
873 d = self._clientProtocol.request(request)
876 if self._shouldRetry(request.method, reason.value,
877 request.bodyProducer):
878 return self._newConnection().addCallback(
879 lambda connection: connection.request(request))
class HTTPConnectionPool(object):
    """
    A pool of persistent HTTP connections.

    Features:
     - Cached connections will eventually time out.
     - Limits on maximum number of persistent connections.

    Connections are stored using keys, which should be chosen such that any
    connections stored under a given key can be used interchangeably.

    Failed requests done using previously cached connections will be retried
    once if they use an idempotent method (e.g. GET), in case the HTTP server
    timed them out.

    @ivar persistent: Boolean indicating whether connections should be
        persistent.  Connections are persistent by default.

    @ivar maxPersistentPerHost: The maximum number of cached persistent
        connections for a C{host:port} destination.
    @type maxPersistentPerHost: C{int}

    @ivar cachedConnectionTimeout: Number of seconds a cached persistent
        connection will stay open before disconnecting.

    @ivar retryAutomatically: C{boolean} indicating whether idempotent
        requests should be retried once if no response was received.

    @ivar _factory: The factory used to connect to the proxy.

    @ivar _connections: Map (scheme, host, port) to lists of
        L{HTTP11ClientProtocol} instances.

    @ivar _timeouts: Map L{HTTP11ClientProtocol} instances to a
        C{IDelayedCall} instance of their timeout.
    """
    _factory = _HTTP11ClientFactory
    maxPersistentPerHost = 2
    cachedConnectionTimeout = 240
    retryAutomatically = True

    def __init__(self, reactor, persistent=True):
        self._reactor = reactor
        self.persistent = persistent
        self._connections = {}
        self._timeouts = {}

    def getConnection(self, key, endpoint):
        """
        Retrieve a connection, either new or cached, to be used for a HTTP
        request.

        If a cached connection is returned, it will not be used for other
        requests until it is put back (which will happen automatically), since
        we do not support pipelined requests.  If no cached connection is
        available, the passed in endpoint is used to create the connection.

        If the connection doesn't disconnect at the end of its request, it
        will be returned to this pool automatically.  As such, only a single
        request should be sent using the returned connection.

        @param key: A unique key identifying connections that can be used
            interchangeably.

        @param endpoint: An endpoint that can be used to open a new connection
            if no cached connection is available.

        @return: A C{Deferred} that will fire with a L{HTTP11ClientProtocol}
            (or a wrapper) that can be used to send a single HTTP request.
        """
        # Try to get cached version:
        connections = self._connections.get(key)
        while connections:
            connection = connections.pop(0)
            # Cancel the timeout; from now on we are responsible for this
            # connection's lifetime:
            self._timeouts[connection].cancel()
            del self._timeouts[connection]
            if connection.state == "QUIESCENT":
                if self.retryAutomatically:
                    newConnection = lambda: self._newConnection(key, endpoint)
                    connection = _RetryingHTTP11ClientProtocol(
                        connection, newConnection)
                return defer.succeed(connection)

        return self._newConnection(key, endpoint)

    def _newConnection(self, key, endpoint):
        """
        Create a new connection.

        This implements the new connection code path for L{getConnection}.
        """
        def quiescentCallback(protocol):
            self._putConnection(key, protocol)
        factory = self._factory(quiescentCallback)
        return endpoint.connect(factory)

    def _removeConnection(self, key, connection):
        """
        Remove a connection from the cache and disconnect it.
        """
        connection.transport.loseConnection()
        self._connections[key].remove(connection)
        del self._timeouts[connection]

    def _putConnection(self, key, connection):
        """
        Return a persistent connection to the pool.  This will be called by
        L{HTTP11ClientProtocol} when the connection becomes quiescent.
        """
        if connection.state != "QUIESCENT":
            # Log with traceback for debugging purposes:
            try:
                raise RuntimeError(
                    "BUG: Non-quiescent protocol added to connection pool.")
            except:
                log.err()
            return
        connections = self._connections.setdefault(key, [])
        if len(connections) == self.maxPersistentPerHost:
            # Drop the oldest cached connection to stay within the limit.
            dropped = connections.pop(0)
            dropped.transport.loseConnection()
            self._timeouts[dropped].cancel()
            del self._timeouts[dropped]
        connections.append(connection)
        cid = self._reactor.callLater(self.cachedConnectionTimeout,
                                      self._removeConnection,
                                      key, connection)
        self._timeouts[connection] = cid

    def closeCachedConnections(self):
        """
        Close all persistent connections and remove them from the pool.

        @return: L{defer.Deferred} that fires when all connections have been
            closed.
        """
        results = []
        for protocols in self._connections.itervalues():
            for p in protocols:
                results.append(p.abort())
        self._connections = {}
        for dc in self._timeouts.values():
            dc.cancel()
        self._timeouts = {}
        return defer.gatherResults(results).addCallback(lambda ign: None)
1044 class _AgentBase(object):
1046 Base class offering common facilities for L{Agent}-type classes.
1048 @ivar _reactor: The C{IReactorTime} implementation which will be used by
1049 the pool, and perhaps by subclasses as well.
1051 @ivar _pool: The L{HTTPConnectionPool} used to manage HTTP connections.
1054 def __init__(self, reactor, pool):
1056 pool = HTTPConnectionPool(reactor, False)
1057 self._reactor = reactor
1061 def _computeHostValue(self, scheme, host, port):
1063 Compute the string to use for the value of the I{Host} header, based on
1064 the given scheme, host name, and port number.
1066 if (scheme, port) in (('http', 80), ('https', 443)):
1068 return '%s:%d' % (host, port)
1071 def _requestWithEndpoint(self, key, endpoint, method, parsedURI,
1072 headers, bodyProducer, requestPath):
1074 Issue a new request, given the endpoint and the path sent as part of
1077 # Create minimal headers, if necessary:
1080 if not headers.hasHeader('host'):
1081 headers = headers.copy()
1082 headers.addRawHeader(
1083 'host', self._computeHostValue(parsedURI.scheme, parsedURI.host,
1086 d = self._pool.getConnection(key, endpoint)
1087 def cbConnected(proto):
1088 return proto.request(
1089 Request(method, requestPath, headers, bodyProducer,
1090 persistent=self._pool.persistent))
1091 d.addCallback(cbConnected)
class Agent(_AgentBase):
    """
    L{Agent} is a very basic HTTP client.  It supports I{HTTP} and I{HTTPS}
    scheme URIs (but performs no certificate checking by default).

    @param pool: A L{HTTPConnectionPool} instance, or C{None}, in which case a
        non-persistent L{HTTPConnectionPool} instance will be created.

    @ivar _contextFactory: A web context factory which will be used to create
        SSL context objects for any SSL connections the agent needs to make.

    @ivar _connectTimeout: If not C{None}, the timeout passed to C{connectTCP}
        or C{connectSSL} for specifying the connection timeout.

    @ivar _bindAddress: If not C{None}, the address passed to C{connectTCP} or
        C{connectSSL} for specifying the local address to bind to.
    """

    def __init__(self, reactor, contextFactory=WebClientContextFactory(),
                 connectTimeout=None, bindAddress=None,
                 pool=None):
        _AgentBase.__init__(self, reactor, pool)
        self._contextFactory = contextFactory
        self._connectTimeout = connectTimeout
        self._bindAddress = bindAddress

    def _wrapContextFactory(self, host, port):
        """
        Create and return a normal context factory wrapped around
        C{self._contextFactory} in such a way that C{self._contextFactory}
        will have the host and port information passed to it.

        @param host: A C{str} giving the hostname which will be connected to
            in order to issue a request.

        @param port: An C{int} giving the port number the connection will be
            on.

        @return: A context factory suitable to be passed to
            C{reactor.connectSSL}.
        """
        return _WebToNormalContextFactory(self._contextFactory, host, port)

    def _getEndpoint(self, scheme, host, port):
        """
        Get an endpoint for the given host and port, using a transport
        selected based on scheme.

        @param scheme: A string like C{'http'} or C{'https'} (the only two
            supported values) to use to determine how to establish the
            connection.

        @param host: A C{str} giving the hostname which will be connected to
            in order to issue a request.

        @param port: An C{int} giving the port number the connection will be
            on.

        @return: An endpoint which can be used to connect to given address.
        """
        kwargs = {}
        if self._connectTimeout is not None:
            kwargs['timeout'] = self._connectTimeout
        kwargs['bindAddress'] = self._bindAddress
        if scheme == 'http':
            return TCP4ClientEndpoint(self._reactor, host, port, **kwargs)
        elif scheme == 'https':
            return SSL4ClientEndpoint(self._reactor, host, port,
                                      self._wrapContextFactory(host, port),
                                      **kwargs)
        else:
            raise SchemeNotSupported("Unsupported scheme: %r" % (scheme,))

    def request(self, method, uri, headers=None, bodyProducer=None):
        """
        Issue a new request.

        @param method: The request method to send.
        @type method: C{str}

        @param uri: The request URI send.
        @type uri: C{str}

        @param headers: The request headers to send.  If no I{Host} header is
            included, one will be added based on the request URI.
        @type headers: L{Headers}

        @param bodyProducer: An object which will produce the request body or,
            if the request body is to be empty, L{None}.
        @type bodyProducer: L{IBodyProducer} provider

        @return: A L{Deferred} which fires with the result of the request (a
            L{twisted.web.iweb.IResponse} provider), or fails if there is a
            problem setting up a connection over which to issue the request.
            It may also fail with L{SchemeNotSupported} if the scheme of the
            given URI is not supported.
        @rtype: L{Deferred}
        """
        try:
            parsedURI = _parse(uri)
            endpoint = self._getEndpoint(parsedURI.scheme, parsedURI.host,
                                         parsedURI.port)
        except SchemeNotSupported:
            return defer.fail(Failure())
        key = (parsedURI.scheme, parsedURI.host, parsedURI.port)
        return self._requestWithEndpoint(key, endpoint, method, parsedURI,
                                         headers, bodyProducer,
                                         parsedURI.path)
class ProxyAgent(_AgentBase):
    """
    An HTTP agent able to cross HTTP proxies.

    @ivar _proxyEndpoint: The endpoint used to connect to the proxy.
    """

    def __init__(self, endpoint, reactor=None, pool=None):
        if reactor is None:
            # Fall back to the global reactor only when none was supplied.
            from twisted.internet import reactor
        _AgentBase.__init__(self, reactor, pool)
        self._proxyEndpoint = endpoint

    def request(self, method, uri, headers=None, bodyProducer=None):
        """
        Issue a new request via the configured proxy.
        """
        # Every connection goes to the same destination (the proxy itself),
        # so all of them share one cache key:
        proxyKey = ("http-proxy", self._proxyEndpoint)

        # To support proxying HTTPS via CONNECT, we will use key
        # ("http-proxy-CONNECT", scheme, host, port), and an endpoint that
        # wraps _proxyEndpoint with an additional callback to do the CONNECT.
        return self._requestWithEndpoint(proxyKey, self._proxyEndpoint,
                                         method, _parse(uri), headers,
                                         bodyProducer, uri)
1244 class _FakeUrllib2Request(object):
1246 A fake C{urllib2.Request} object for C{cookielib} to work with.
1248 @see: U{http://docs.python.org/library/urllib2.html#request-objects}
1251 @ivar uri: Request URI.
1253 @type headers: L{twisted.web.http_headers.Headers}
1254 @ivar headers: Request headers.
1257 @ivar type: The scheme of the URI.
1260 @ivar host: The host[:port] of the URI.
1264 def __init__(self, uri):
1266 self.headers = Headers()
1267 self.type, rest = splittype(self.uri)
1268 self.host, rest = splithost(rest)
1271 def has_header(self, header):
1272 return self.headers.hasHeader(header)
1275 def add_unredirected_header(self, name, value):
1276 self.headers.addRawHeader(name, value)
1279 def get_full_url(self):
1283 def get_header(self, name, default=None):
1284 headers = self.headers.getRawHeaders(name, default)
1285 if headers is not None:
1298 def is_unverifiable(self):
1299 # In theory this shouldn't be hardcoded.
1304 class _FakeUrllib2Response(object):
1306 A fake C{urllib2.Response} object for C{cookielib} to work with.
1308 @type response: C{twisted.web.iweb.IResponse}
1309 @ivar response: Underlying Twisted Web response.
1313 def __init__(self, response):
1314 self.response = response
1318 class _Meta(object):
1319 def getheaders(zelf, name):
1320 return self.response.headers.getRawHeaders(name, [])
class CookieAgent(object):
    """
    L{CookieAgent} extends the basic L{Agent} to add RFC-compliant
    handling of HTTP cookies.  Cookies are written to and extracted
    from a C{cookielib.CookieJar} instance.

    The same cookie jar instance will be used for any requests through this
    agent, mutating it whenever a I{Set-Cookie} header appears in a response.

    @type _agent: L{twisted.web.client.Agent}
    @ivar _agent: Underlying Twisted Web agent to issue requests through.

    @type cookieJar: C{cookielib.CookieJar}
    @ivar cookieJar: Initialized cookie jar to read cookies from and store
        cookies to.
    """
    def __init__(self, agent, cookieJar):
        self._agent = agent
        self.cookieJar = cookieJar

    def request(self, method, uri, headers=None, bodyProducer=None):
        """
        Issue a new request to the wrapped L{Agent}.

        Send a I{Cookie} header if a cookie for C{uri} is stored in
        L{CookieAgent.cookieJar}. Cookies are automatically extracted and
        stored from requests.

        If a C{'cookie'} header appears in C{headers} it will override the
        automatic cookie header obtained from the cookie jar.

        @see: L{Agent.request}
        """
        if headers is None:
            headers = Headers()
        fakeRequest = _FakeUrllib2Request(uri)
        # An explicitly supplied cookie header disables the automatic
        # request cookies from the jar.
        if not headers.hasHeader('cookie'):
            self.cookieJar.add_cookie_header(fakeRequest)
            jarCookieValue = fakeRequest.get_header('Cookie', None)
            if jarCookieValue is not None:
                headers = headers.copy()
                headers.addRawHeader('cookie', jarCookieValue)

        responseDeferred = self._agent.request(
            method, uri, headers, bodyProducer)
        responseDeferred.addCallback(self._extractCookies, fakeRequest)
        return responseDeferred

    def _extractCookies(self, response, request):
        """
        Extract response cookies and store them in the cookie jar.

        @type response: L{twisted.web.iweb.IResponse}
        @param response: Twisted Web response.

        @param request: A urllib2 compatible request object.
        """
        resp = _FakeUrllib2Response(response)
        self.cookieJar.extract_cookies(resp, request)
        return response
class GzipDecoder(proxyForInterface(IResponse)):
    """
    A wrapper for a L{Response} instance which handles gzip'ed body.

    @ivar original: The original L{Response} object.

    @since: 11.1
    """

    def __init__(self, response):
        self.original = response
        # The decompressed size cannot be known up front.
        self.length = UNKNOWN_LENGTH

    def deliverBody(self, protocol):
        """
        Override C{deliverBody} to wrap the given C{protocol} with
        L{_GzipProtocol}.
        """
        decompressingProtocol = _GzipProtocol(protocol, self.original)
        self.original.deliverBody(decompressingProtocol)
class _GzipProtocol(proxyForInterface(IProtocol)):
    """
    A L{Protocol} implementation which wraps another one, transparently
    decompressing received data.

    @ivar _zlibDecompress: A zlib decompress object used to decompress the
        data stream.

    @ivar _response: A reference to the original response, in case of errors.

    @since: 11.1
    """

    def __init__(self, protocol, response):
        self.original = protocol
        self._response = response
        # wbits of 16 + MAX_WBITS makes zlib expect a gzip header/trailer.
        self._zlibDecompress = zlib.decompressobj(16 + zlib.MAX_WBITS)

    def dataReceived(self, data):
        """
        Decompress C{data} with the zlib decompressor, forwarding the raw data
        to the original protocol.
        """
        try:
            decompressed = self._zlibDecompress.decompress(data)
        except zlib.error:
            raise ResponseFailed([failure.Failure()], self._response)
        if decompressed:
            self.original.dataReceived(decompressed)

    def connectionLost(self, reason):
        """
        Forward the connection lost event, flushing remaining data from the
        decompressor if any.
        """
        try:
            remaining = self._zlibDecompress.flush()
        except zlib.error:
            raise ResponseFailed([reason, failure.Failure()], self._response)
        if remaining:
            self.original.dataReceived(remaining)
        self.original.connectionLost(reason)
1463 class ContentDecoderAgent(object):
1465 An L{Agent} wrapper to handle encoded content.
1467 It takes care of declaring the support for content in the
1468 I{Accept-Encoding} header, and automatically decompresses the received data
1469 if it's effectively using compression.
1471 @param decoders: A list or tuple of (name, decoder) objects. The name
1472 declares which decoding the decoder supports, and the decoder must
1473 return a response object when called/instantiated. For example,
1474 C{(('gzip', GzipDecoder))}. The order determines how the decoders are
1475 going to be advertized to the server.
1480 def __init__(self, agent, decoders):
1482 self._decoders = dict(decoders)
1483 self._supported = ','.join([decoder[0] for decoder in decoders])
1486 def request(self, method, uri, headers=None, bodyProducer=None):
1488 Send a client request which declares supporting compressed content.
1490 @see: L{Agent.request}.
1495 headers = headers.copy()
1496 headers.addRawHeader('accept-encoding', self._supported)
1497 deferred = self._agent.request(method, uri, headers, bodyProducer)
1498 return deferred.addCallback(self._handleResponse)
1501 def _handleResponse(self, response):
1503 Check if the response is encoded, and wrap it to handle decompression.
1505 contentEncodingHeaders = response.headers.getRawHeaders(
1506 'content-encoding', [])
1507 contentEncodingHeaders = ','.join(contentEncodingHeaders).split(',')
1508 while contentEncodingHeaders:
1509 name = contentEncodingHeaders.pop().strip()
1510 decoder = self._decoders.get(name)
1511 if decoder is not None:
1512 response = decoder(response)
1515 contentEncodingHeaders.append(name)
1517 if contentEncodingHeaders:
1518 response.headers.setRawHeaders(
1519 'content-encoding', [','.join(contentEncodingHeaders)])
1521 response.headers.removeHeader('content-encoding')
class RedirectAgent(object):
    """
    An L{Agent} wrapper which handles HTTP redirects.

    The implementation is rather strict: 301 and 302 behaves like 307, not
    redirecting automatically on methods different from C{GET} and C{HEAD}.

    @param redirectLimit: The maximum number of times the agent is allowed to
        follow redirects before failing with a L{error.InfiniteRedirection}.
    """

    def __init__(self, agent, redirectLimit=20):
        self._agent = agent
        self._redirectLimit = redirectLimit

    def request(self, method, uri, headers=None, bodyProducer=None):
        """
        Send a client request following HTTP redirects.

        @see: L{Agent.request}.
        """
        deferred = self._agent.request(method, uri, headers, bodyProducer)
        return deferred.addCallback(
            self._handleResponse, method, uri, headers, 0)

    def _handleRedirect(self, response, method, uri, headers, redirectCount):
        """
        Handle a redirect response, checking the number of redirects already
        followed, and extracting the location header fields.
        """
        if redirectCount >= self._redirectLimit:
            limitError = error.InfiniteRedirection(
                response.code,
                'Infinite redirection detected',
                location=uri)
            raise ResponseFailed([failure.Failure(limitError)], response)
        locationHeaders = response.headers.getRawHeaders('location', [])
        if not locationHeaders:
            locationError = error.RedirectWithNoLocation(
                response.code, 'No location header field', uri)
            raise ResponseFailed([failure.Failure(locationError)], response)
        # Follow the first Location value; responses to the new request go
        # back through _handleResponse with an incremented counter.
        nextDeferred = self._agent.request(
            method, locationHeaders[0], headers)
        return nextDeferred.addCallback(
            self._handleResponse, method, uri, headers, redirectCount + 1)

    def _handleResponse(self, response, method, uri, headers, redirectCount):
        """
        Handle the response, making another request if it indicates a redirect.
        """
        strictRedirectCodes = (http.MOVED_PERMANENTLY, http.FOUND,
                               http.TEMPORARY_REDIRECT)
        if response.code in strictRedirectCodes:
            # 301/302/307: only safe methods are redirected automatically.
            if method not in ('GET', 'HEAD'):
                redirectError = error.PageRedirect(
                    response.code, location=uri)
                raise ResponseFailed(
                    [failure.Failure(redirectError)], response)
            return self._handleRedirect(response, method, uri, headers,
                                        redirectCount)
        if response.code == http.SEE_OTHER:
            # 303 always switches the follow-up request to GET.
            return self._handleRedirect(response, 'GET', uri, headers,
                                        redirectCount)
        return response
1596 'PartialDownloadError', 'HTTPPageGetter', 'HTTPPageDownloader',
1597 'HTTPClientFactory', 'HTTPDownloader', 'getPage', 'downloadPage',
1598 'ResponseDone', 'Response', 'ResponseFailed', 'Agent', 'CookieAgent',
1599 'ProxyAgent', 'ContentDecoderAgent', 'GzipDecoder', 'RedirectAgent',
1600 'HTTPConnectionPool']