2 # Copyright 2012 Google Inc. All Rights Reserved.
4 # Licensed under the Apache License, Version 2.0 (the "License");
5 # you may not use this file except in compliance with the License.
6 # You may obtain a copy of the License at
8 # http://www.apache.org/licenses/LICENSE-2.0
10 # Unless required by applicable law or agreed to in writing, software
11 # distributed under the License is distributed on an "AS IS" BASIS,
12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 # See the License for the specific language governing permissions and
14 # limitations under the License.
16 """Retrieve web resources over http."""
25 import platformsettings
26 import script_injector
# PIL isn't always available, but we still want to be able to run without
# the image scrambling functionality in this case.
# NOTE(review): the guarded `import Image` (set to None on ImportError) that
# this comment refers to is not visible in this excerpt; _ScrambleImages
# asserts on `Image` at runtime.

# Platform-preferred high-resolution timer, used below to measure connect,
# header, and per-chunk delays.
TIMER = platformsettings.timer
class HttpClientException(Exception):
  """Base class for all exceptions in httpclient.

  Catching this type lets callers handle httpclient-specific failures
  without catching unrelated exceptions.
  """
def _InjectScripts(response, inject_script):
  """Injects |inject_script| immediately after <head> or <html>.

  Copies |response| if it is modified.

  Args:
    response: an ArchivedHttpResponse
    inject_script: JavaScript string (e.g. "Math.random = function(){...}")
  Returns:
    an ArchivedHttpResponse
  """
  # isinstance (not `type(...) == tuple`) is the idiomatic type check; a
  # tuple here indicates a malformed archive entry, so log it defensively.
  if isinstance(response, tuple):
    logging.warning('tuple response: %s', response)
  content_type = response.get_header('content-type')
  if content_type and content_type.startswith('text/html'):
    text = response.get_data_as_text()
    text, already_injected = script_injector.InjectScript(
        text, 'text/html', inject_script)
    if not already_injected:
      # Deep-copy so the cached archive entry is not mutated in place.
      response = copy.deepcopy(response)
      response.set_data(text)
  return response
def _ScrambleImages(response):
  """If the |response| is an image, attempt to scramble it.

  The pixels are shuffled randomly so replayed pages keep roughly the same
  payload without exposing the original image content.
  Copies |response| if it is modified.

  Args:
    response: an ArchivedHttpResponse
  Returns:
    an ArchivedHttpResponse
  """
  assert Image, '--scramble_images requires the PIL module to be installed.'

  content_type = response.get_header('content-type')
  if content_type and content_type.startswith('image/'):
    try:
      image_data = response.response_data[0]
      # NOTE(review): the result of decode() is discarded, so this line is a
      # no-op except that non-base64 data raises here and aborts scrambling
      # via the except below — confirm whether response_data is base64.
      image_data.decode(encoding='base64')
      im = Image.open(StringIO.StringIO(image_data))

      pixel_data = list(im.getdata())
      random.shuffle(pixel_data)

      scrambled_image = im.copy()
      scrambled_image.putdata(pixel_data)

      output_image_io = StringIO.StringIO()
      scrambled_image.save(output_image_io, im.format)
      output_image_data = output_image_io.getvalue()
      # NOTE(review): result discarded — same concern as the decode above.
      output_image_data.encode(encoding='base64')

      # Deep-copy so the archived original is not mutated in place.
      response = copy.deepcopy(response)
      response.set_data(output_image_data)
    except Exception:
      # Best-effort: any decode/PIL failure leaves the response unmodified.
      pass

  return response
class DetailedHTTPResponse(httplib.HTTPResponse):
  """Preserve details relevant to replaying responses.

  WARNING: This code uses attributes and methods of HTTPResponse
  that are not part of the public interface.
  """

  def read_chunks(self):
    """Return the response body content and timing data.

    The returned chunks have the chunk size and CRLFs stripped off.
    If the response was compressed, the returned data is still compressed.

    Returns:
      (chunks, delays)
        chunks:
          [response_body]                  # non-chunked responses
          [chunk_1, chunk_2, ...]          # chunked responses
        delays:
          [0]                              # non-chunked responses
          [chunk_1_first_byte_delay, ...]  # chunked responses
      The delay for the first body item should be recorded by the caller.
    """
    chunks = []
    delays = []
    if not self.chunked:
      chunks.append(self.read())
      delays.append(0)
    else:
      start = TIMER()
      while True:
        line = self.fp.readline()
        chunk_size = self._read_chunk_size(line)
        if chunk_size is None:
          raise httplib.IncompleteRead(''.join(chunks))
        if chunk_size == 0:
          break
        delays.append(TIMER() - start)
        chunks.append(self._safe_read(chunk_size))
        self._safe_read(2)  # skip the CRLF at the end of the chunk
        start = TIMER()

      # Ignore any trailers.
      while True:
        line = self.fp.readline()
        if not line or line == '\r\n':
          break
    return chunks, delays

  @classmethod
  def _read_chunk_size(cls, line):
    """Parse a chunk-size line; return the size or None if malformed."""
    chunk_extensions_pos = line.find(';')
    if chunk_extensions_pos != -1:
      # Bug fix: the original sliced with undefined name `extention_pos`,
      # raising NameError on any chunk-size line carrying extensions.
      line = line[:chunk_extensions_pos]  # strip chunk-extensions
    try:
      chunk_size = int(line, 16)
    except ValueError:
      return None
    return chunk_size
class DetailedHTTPConnection(httplib.HTTPConnection):
  """Preserve details relevant to replaying connections."""
  # Make getresponse() produce our subclass so per-chunk timing is recorded.
  response_class = DetailedHTTPResponse
# Inherits read_chunks() timing behavior unchanged; exists so the HTTPS
# connection class below can name its own response_class.
class DetailedHTTPSResponse(DetailedHTTPResponse):
  """Preserve details relevant to replaying SSL responses."""
class DetailedHTTPSConnection(httplib.HTTPSConnection):
  """Preserve details relevant to replaying SSL connections."""
  # Make getresponse() produce the SSL response subclass with chunk timing.
  response_class = DetailedHTTPSResponse
class RealHttpFetch(object):
  """Fetches requests over the real network, recording timing details."""

  def __init__(self, real_dns_lookup):
    """Initialize RealHttpFetch.

    Args:
      real_dns_lookup: a function that resolves a host to an IP.
    """
    self._real_dns_lookup = real_dns_lookup

  @staticmethod
  def _GetHeaderNameValue(header):
    """Parse the header line and return a name/value tuple.

    Args:
      header: a string for a header such as "Content-Length: 314".
    Returns:
      A tuple (header_name, header_value) on success or None if the header
      is not in expected format. header_name is in lowercase.
    """
    i = header.find(':')
    if i > 0:
      return (header[:i].lower(), header[i+1:].strip())
    return None

  @staticmethod
  def _ToTuples(headers):
    """Parse headers and save them to a list of tuples.

    This method takes HttpResponse.msg.headers as input and converts it
    to a list of (header_name, header_value) tuples.
    HttpResponse.msg.headers is a list of strings where each string
    represents either a header or a continuation line of a header.
    1. a normal header consists of two parts which are separated by colon :
       "header_name:header_value..."
    2. a continuation line is a string starting with whitespace
       "[whitespace]continued_header_value..."
    If a header is not in good shape or an unexpected continuation line is
    seen, it will be ignored.

    Should avoid using response.getheaders() directly
    because response.getheaders() can't handle multiple headers
    with the same name properly. Instead, parse the
    response.msg.headers using this method to get all headers.

    Args:
      headers: an instance of HttpResponse.msg.headers.
    Returns:
      A list of tuples which looks like:
      [(header_name, header_value), (header_name2, header_value2)...]
    """
    all_headers = []
    for line in headers:
      if line[0] in '\t ':
        # Continuation line: fold into the most recently seen header.
        if not all_headers:
          logging.warning(
              'Unexpected response header continuation line [%s]', line)
          continue
        name, value = all_headers.pop()
        value += '\n ' + line.strip()
      else:
        name_value = RealHttpFetch._GetHeaderNameValue(line)
        if not name_value:
          logging.warning(
              'Response header in wrong format [%s]', line)
          continue
        name, value = name_value
      all_headers.append((name, value))
    return all_headers

  @staticmethod
  def _get_request_host_port(request):
    """Split |request.host| into (host, port); port is None if absent."""
    host_parts = request.host.split(':')
    port = int(host_parts[1]) if len(host_parts) == 2 else None
    return host_parts[0], port

  def _get_system_proxy(self, is_ssl):
    """Return the system proxy (or None) appropriate for |is_ssl|."""
    return platformsettings.get_system_proxy(is_ssl)

  def _get_connection(self, request_host, request_port, is_ssl):
    """Return a detailed connection object for host/port pair.

    If a system proxy is defined (see platformsettings.py), it will be used.

    Args:
      request_host: a host string (e.g. "www.example.com").
      request_port: a port integer (e.g. 8080) or None (for the default port).
      is_ssl: True if HTTPS connection is needed.
    Returns:
      A DetailedHTTPSConnection or DetailedHTTPConnection instance,
      or None if the host name cannot be resolved.
    """
    connection_host = request_host
    connection_port = request_port
    system_proxy = self._get_system_proxy(is_ssl)
    if system_proxy:
      connection_host = system_proxy.host
      connection_port = system_proxy.port

    # Use an IP address because WPR may override DNS settings.
    connection_ip = self._real_dns_lookup(connection_host)
    if not connection_ip:
      logging.critical('Unable to find host ip for name: %s', connection_host)
      return None

    if is_ssl:
      connection = DetailedHTTPSConnection(connection_ip, connection_port)
      if system_proxy:
        # Bug fix: set_tunnel's signature is (host, port[, headers]); the
        # original erroneously passed |self| as the tunnel host.
        connection.set_tunnel(request_host, request_port)
    else:
      connection = DetailedHTTPConnection(connection_ip, connection_port)
    return connection

  def __call__(self, request):
    """Fetch an HTTP request.

    Args:
      request: an ArchivedHttpRequest
    Returns:
      an ArchivedHttpResponse, or None if the fetch repeatedly failed.
    """
    logging.debug('RealHttpFetch: %s %s', request.host, request.full_path)
    request_host, request_port = self._get_request_host_port(request)
    retries = 3
    while True:
      try:
        connection = self._get_connection(
            request_host, request_port, request.is_ssl)
        connect_start = TIMER()
        connection.connect()
        connect_delay = int((TIMER() - connect_start) * 1000)
        start = TIMER()
        connection.request(
            request.command,
            request.full_path,
            request.request_body,
            request.headers)
        response = connection.getresponse()
        headers_delay = int((TIMER() - start) * 1000)

        chunks, chunk_delays = response.read_chunks()
        delays = {
            'connect': connect_delay,
            'headers': headers_delay,
            'data': chunk_delays
            }
        archived_http_response = httparchive.ArchivedHttpResponse(
            response.version,
            response.status,
            response.reason,
            RealHttpFetch._ToTuples(response.msg.headers),
            chunks,
            delays)
        return archived_http_response
      except Exception as e:
        if retries:
          retries -= 1
          logging.warning('Retrying fetch %s: %s', request, e)
          continue
        logging.critical('Could not fetch %s: %s', request, e)
        return None
class RecordHttpArchiveFetch(object):
  """Make real HTTP fetches and save responses in the given HttpArchive."""

  def __init__(self, http_archive, real_dns_lookup, inject_script,
               cache_misses=None):
    """Initialize RecordHttpArchiveFetch.

    Args:
      http_archive: an instance of a HttpArchive
      real_dns_lookup: a function that resolves a host to an IP.
      inject_script: script string to inject in all pages
      cache_misses: instance of CacheMissArchive
    """
    self.http_archive = http_archive
    self.real_http_fetch = RealHttpFetch(real_dns_lookup)
    self.inject_script = inject_script
    self.cache_misses = cache_misses

  def __call__(self, request):
    """Fetch the request and return the response.

    Args:
      request: an ArchivedHttpRequest.
    Returns:
      an ArchivedHttpResponse, or None if the real fetch failed.
    """
    if self.cache_misses:
      self.cache_misses.record_request(
          request, is_record_mode=True, is_cache_miss=False)

    # If request is already in the archive, return the archived response.
    if request in self.http_archive:
      logging.debug('Repeated request found: %s', request)
      response = self.http_archive[request]
    else:
      response = self.real_http_fetch(request)
      if response is None:
        return None
      self.http_archive[request] = response
    if self.inject_script:
      response = _InjectScripts(response, self.inject_script)
    logging.debug('Recorded: %s', request)
    return response
class ReplayHttpArchiveFetch(object):
  """Serve responses from the given HttpArchive."""

  def __init__(self, http_archive, real_dns_lookup, inject_script,
               use_diff_on_unknown_requests=False, cache_misses=None,
               use_closest_match=False, scramble_images=False):
    """Initialize ReplayHttpArchiveFetch.

    Args:
      http_archive: an instance of a HttpArchive
      real_dns_lookup: a function that resolves a host to an IP.
      inject_script: script string to inject in all pages
      use_diff_on_unknown_requests: If True, log unknown requests
        with a diff to requests that look similar.
      cache_misses: Instance of CacheMissArchive.
        Callback updates archive on cache misses
      use_closest_match: If True, on replay mode, serve the closest match
        in the archive instead of giving a 404.
      scramble_images: If True, scramble image responses before serving.
    """
    self.http_archive = http_archive
    self.inject_script = inject_script
    self.use_diff_on_unknown_requests = use_diff_on_unknown_requests
    self.cache_misses = cache_misses
    self.use_closest_match = use_closest_match
    self.scramble_images = scramble_images
    self.real_http_fetch = RealHttpFetch(real_dns_lookup)

  def __call__(self, request):
    """Fetch the request and return the response.

    Args:
      request: an instance of an ArchivedHttpRequest.
    Returns:
      Instance of ArchivedHttpResponse (if found) or None
    """
    # Requests aimed at the local machine bypass the archive entirely.
    if request.host.startswith('127.0.0.1:'):
      return self.real_http_fetch(request)

    response = self.http_archive.get(request)

    if self.use_closest_match and not response:
      closest_request = self.http_archive.find_closest_request(
          request, use_path=True)
      if closest_request:
        response = self.http_archive.get(closest_request)
        if response:
          logging.info('Request not found: %s\nUsing closest match: %s',
                       request, closest_request)

    if self.cache_misses:
      self.cache_misses.record_request(
          request, is_record_mode=False, is_cache_miss=not response)

    if not response:
      reason = str(request)
      if self.use_diff_on_unknown_requests:
        diff = self.http_archive.diff(request)
        if diff:
          reason += (
              "\nNearest request diff "
              "('-' for archived request, '+' for current request):\n%s" % diff)
      logging.warning('Could not replay: %s', reason)
      return None

    if self.inject_script:
      response = _InjectScripts(response, self.inject_script)
    if self.scramble_images:
      response = _ScrambleImages(response)
    return response
class ControllableHttpArchiveFetch(object):
  """Controllable fetch function that can swap between record and replay."""

  def __init__(self, http_archive, real_dns_lookup,
               inject_script, use_diff_on_unknown_requests,
               use_record_mode, cache_misses, use_closest_match,
               scramble_images):
    """Initialize HttpArchiveFetch.

    Args:
      http_archive: an instance of a HttpArchive
      real_dns_lookup: a function that resolves a host to an IP.
      inject_script: script string to inject in all pages.
      use_diff_on_unknown_requests: If True, log unknown requests
        with a diff to requests that look similar.
      use_record_mode: If True, start the server in record mode.
      cache_misses: Instance of CacheMissArchive.
      use_closest_match: If True, on replay mode, serve the closest match
        in the archive instead of giving a 404.
      scramble_images: If True, scramble image responses when replaying.
    """
    self.http_archive = http_archive
    self.record_fetch = RecordHttpArchiveFetch(
        http_archive, real_dns_lookup, inject_script,
        cache_misses)
    self.replay_fetch = ReplayHttpArchiveFetch(
        http_archive, real_dns_lookup, inject_script,
        use_diff_on_unknown_requests, cache_misses,
        use_closest_match, scramble_images)
    # Select the starting mode; this also initializes self.fetch and
    # self.is_record_mode.
    if use_record_mode:
      self.SetRecordMode()
    else:
      self.SetReplayMode()

  def SetRecordMode(self):
    """Switch to record mode: fetches go to the real network."""
    self.fetch = self.record_fetch
    self.is_record_mode = True

  def SetReplayMode(self):
    """Switch to replay mode: fetches are served from the archive."""
    self.fetch = self.replay_fetch
    self.is_record_mode = False

  def __call__(self, *args, **kwargs):
    """Forward calls to Replay/Record fetch functions depending on mode."""
    return self.fetch(*args, **kwargs)