2 # Copyright 2012 Google Inc. All Rights Reserved.
4 # Licensed under the Apache License, Version 2.0 (the "License");
5 # you may not use this file except in compliance with the License.
6 # You may obtain a copy of the License at
8 # http://www.apache.org/licenses/LICENSE-2.0
10 # Unless required by applicable law or agreed to in writing, software
11 # distributed under the License is distributed on an "AS IS" BASIS,
12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 # See the License for the specific language governing permissions and
14 # limitations under the License.
16 """Retrieve web resources over http."""
25 import platformsettings
26 import script_injector
# PIL isn't always available, but we still want to be able to run without
# the image scrambling functionality in this case.
# NOTE(review): the guarded `import Image` (set to None on ImportError) that
# this comment refers to is not visible in this excerpt; _ScrambleImages
# asserts on `Image` at runtime.

# Platform-preferred high-resolution timer, used below to measure connect,
# header, and per-chunk delays.
TIMER = platformsettings.timer
class HttpClientException(Exception):
  """Base class for all exceptions in httpclient.

  Catching this type lets callers handle httpclient-specific failures
  without catching unrelated exceptions.
  """
def _InjectScripts(response, inject_script):
  """Injects |inject_script| immediately after <head> or <html>.

  Copies |response| if it is modified.

  Args:
    response: an ArchivedHttpResponse
    inject_script: JavaScript string (e.g. "Math.random = function(){...}")
  Returns:
    an ArchivedHttpResponse
  """
  # isinstance (not `type(...) == tuple`) is the idiomatic type check; a
  # tuple here indicates a malformed archive entry, so log it defensively.
  if isinstance(response, tuple):
    logging.warning('tuple response: %s', response)
  content_type = response.get_header('content-type')
  if content_type and content_type.startswith('text/html'):
    text = response.get_data_as_text()
    text, already_injected = script_injector.InjectScript(
        text, 'text/html', inject_script)
    if not already_injected:
      # Deep-copy so the cached archive entry is not mutated in place.
      response = copy.deepcopy(response)
      response.set_data(text)
  return response
def _ScrambleImages(response):
  """If the |response| is an image, attempt to scramble it.

  The pixels are shuffled randomly so replayed pages keep roughly the same
  payload without exposing the original image content.
  Copies |response| if it is modified.

  Args:
    response: an ArchivedHttpResponse
  Returns:
    an ArchivedHttpResponse
  """
  assert Image, '--scramble_images requires the PIL module to be installed.'

  content_type = response.get_header('content-type')
  if content_type and content_type.startswith('image/'):
    try:
      image_data = response.response_data[0]
      # NOTE(review): the result of decode() is discarded, so this line is a
      # no-op except that non-base64 data raises here and aborts scrambling
      # via the except below — confirm whether response_data is base64.
      image_data.decode(encoding='base64')
      im = Image.open(StringIO.StringIO(image_data))

      pixel_data = list(im.getdata())
      random.shuffle(pixel_data)

      scrambled_image = im.copy()
      scrambled_image.putdata(pixel_data)

      output_image_io = StringIO.StringIO()
      scrambled_image.save(output_image_io, im.format)
      output_image_data = output_image_io.getvalue()
      # NOTE(review): result discarded — same concern as the decode above.
      output_image_data.encode(encoding='base64')

      # Deep-copy so the archived original is not mutated in place.
      response = copy.deepcopy(response)
      response.set_data(output_image_data)
    except Exception:
      # Best-effort: any decode/PIL failure leaves the response unmodified.
      pass

  return response
class DetailedHTTPResponse(httplib.HTTPResponse):
  """Preserve details relevant to replaying responses.

  WARNING: This code uses attributes and methods of HTTPResponse
  that are not part of the public interface.
  """

  def read_chunks(self):
    """Return the response body content and timing data.

    The returned chunks have the chunk size and CRLFs stripped off.
    If the response was compressed, the returned data is still compressed.

    Returns:
      (chunks, delays)
        chunks:
          [response_body]                  # non-chunked responses
          [chunk_1, chunk_2, ...]          # chunked responses
        delays:
          [0]                              # non-chunked responses
          [chunk_1_first_byte_delay, ...]  # chunked responses
      The delay for the first body item should be recorded by the caller.
    """
    chunks = []
    delays = []
    if not self.chunked:
      chunks.append(self.read())
      delays.append(0)
    else:
      start = TIMER()
      while True:
        line = self.fp.readline()
        chunk_size = self._read_chunk_size(line)
        if chunk_size is None:
          raise httplib.IncompleteRead(''.join(chunks))
        if chunk_size == 0:
          break
        delays.append(TIMER() - start)
        chunks.append(self._safe_read(chunk_size))
        self._safe_read(2)  # skip the CRLF at the end of the chunk
        start = TIMER()

      # Ignore any trailers.
      while True:
        line = self.fp.readline()
        if not line or line == '\r\n':
          break
    return chunks, delays

  @classmethod
  def _read_chunk_size(cls, line):
    """Parse a chunk-size line; return the size or None if malformed."""
    chunk_extensions_pos = line.find(';')
    if chunk_extensions_pos != -1:
      # Bug fix: the original sliced with undefined name `extention_pos`,
      # raising NameError on any chunk-size line carrying extensions.
      line = line[:chunk_extensions_pos]  # strip chunk-extensions
    try:
      chunk_size = int(line, 16)
    except ValueError:
      return None
    return chunk_size
class DetailedHTTPConnection(httplib.HTTPConnection):
  """Preserve details relevant to replaying connections."""
  # Make getresponse() produce our subclass so per-chunk timing is recorded.
  response_class = DetailedHTTPResponse
# Inherits read_chunks() timing behavior unchanged; exists so the HTTPS
# connection class below can name its own response_class.
class DetailedHTTPSResponse(DetailedHTTPResponse):
  """Preserve details relevant to replaying SSL responses."""
class DetailedHTTPSConnection(httplib.HTTPSConnection):
  """Preserve details relevant to replaying SSL connections."""
  # Make getresponse() produce the SSL response subclass with chunk timing.
  response_class = DetailedHTTPSResponse
class RealHttpFetch(object):
  """Fetches requests over the real network, recording timing details."""

  def __init__(self, real_dns_lookup):
    """Initialize RealHttpFetch.

    Args:
      real_dns_lookup: a function that resolves a host to an IP.
    """
    self._real_dns_lookup = real_dns_lookup

  @staticmethod
  def _GetHeaderNameValue(header):
    """Parse the header line and return a name/value tuple.

    Args:
      header: a string for a header such as "Content-Length: 314".
    Returns:
      A tuple (header_name, header_value) on success or None if the header
      is not in expected format. header_name is in lowercase.
    """
    i = header.find(':')
    if i > 0:
      return (header[:i].lower(), header[i+1:].strip())
    return None

  @staticmethod
  def _ToTuples(headers):
    """Parse headers and save them to a list of tuples.

    This method takes HttpResponse.msg.headers as input and converts it
    to a list of (header_name, header_value) tuples.
    HttpResponse.msg.headers is a list of strings where each string
    represents either a header or a continuation line of a header.
    1. a normal header consists of two parts which are separated by colon :
       "header_name:header_value..."
    2. a continuation line is a string starting with whitespace
       "[whitespace]continued_header_value..."
    If a header is not in good shape or an unexpected continuation line is
    seen, it will be ignored.

    Should avoid using response.getheaders() directly
    because response.getheaders() can't handle multiple headers
    with the same name properly. Instead, parse the
    response.msg.headers using this method to get all headers.

    Args:
      headers: an instance of HttpResponse.msg.headers.
    Returns:
      A list of tuples which looks like:
      [(header_name, header_value), (header_name2, header_value2)...]
    """
    all_headers = []
    for line in headers:
      if line[0] in '\t ':
        # Continuation line: fold into the most recently seen header.
        if not all_headers:
          logging.warning(
              'Unexpected response header continuation line [%s]', line)
          continue
        name, value = all_headers.pop()
        value += '\n ' + line.strip()
      else:
        name_value = RealHttpFetch._GetHeaderNameValue(line)
        if not name_value:
          logging.warning(
              'Response header in wrong format [%s]', line)
          continue
        name, value = name_value
      all_headers.append((name, value))
    return all_headers

  @staticmethod
  def _get_request_host_port(request):
    """Split |request.host| into (host, port); port is None if absent."""
    host_parts = request.host.split(':')
    port = int(host_parts[1]) if len(host_parts) == 2 else None
    return host_parts[0], port

  def _get_system_proxy(self, is_ssl):
    """Return the system proxy (or None) appropriate for |is_ssl|."""
    return platformsettings.get_system_proxy(is_ssl)

  def _get_connection(self, request_host, request_port, is_ssl):
    """Return a detailed connection object for host/port pair.

    If a system proxy is defined (see platformsettings.py), it will be used.

    Args:
      request_host: a host string (e.g. "www.example.com").
      request_port: a port integer (e.g. 8080) or None (for the default port).
      is_ssl: True if HTTPS connection is needed.
    Returns:
      A DetailedHTTPSConnection or DetailedHTTPConnection instance,
      or None if the host name cannot be resolved.
    """
    connection_host = request_host
    connection_port = request_port
    system_proxy = self._get_system_proxy(is_ssl)
    if system_proxy:
      connection_host = system_proxy.host
      connection_port = system_proxy.port

    # Use an IP address because WPR may override DNS settings.
    connection_ip = self._real_dns_lookup(connection_host)
    if not connection_ip:
      logging.critical('Unable to find host ip for name: %s', connection_host)
      return None

    if is_ssl:
      connection = DetailedHTTPSConnection(connection_ip, connection_port)
      if system_proxy:
        # Bug fix: set_tunnel's signature is (host, port[, headers]); the
        # original erroneously passed |self| as the tunnel host.
        connection.set_tunnel(request_host, request_port)
    else:
      connection = DetailedHTTPConnection(connection_ip, connection_port)
    return connection

  def __call__(self, request):
    """Fetch an HTTP request.

    Args:
      request: an ArchivedHttpRequest
    Returns:
      an ArchivedHttpResponse, or None if the fetch repeatedly failed.
    """
    logging.debug('RealHttpFetch: %s %s', request.host, request.full_path)
    request_host, request_port = self._get_request_host_port(request)
    retries = 3
    while True:
      try:
        connection = self._get_connection(
            request_host, request_port, request.is_ssl)
        connect_start = TIMER()
        connection.connect()
        connect_delay = int((TIMER() - connect_start) * 1000)
        start = TIMER()
        connection.request(
            request.command,
            request.full_path,
            request.request_body,
            request.headers)
        response = connection.getresponse()
        headers_delay = int((TIMER() - start) * 1000)

        chunks, chunk_delays = response.read_chunks()
        delays = {
            'connect': connect_delay,
            'headers': headers_delay,
            'data': chunk_delays
            }
        archived_http_response = httparchive.ArchivedHttpResponse(
            response.version,
            response.status,
            response.reason,
            RealHttpFetch._ToTuples(response.msg.headers),
            chunks,
            delays)
        return archived_http_response
      except Exception as e:
        if retries:
          retries -= 1
          logging.warning('Retrying fetch %s: %s', request, e)
          continue
        logging.critical('Could not fetch %s: %s', request, e)
        return None
class RecordHttpArchiveFetch(object):
  """Make real HTTP fetches and save responses in the given HttpArchive."""

  def __init__(self, http_archive, real_dns_lookup, inject_script,
               cache_misses=None):
    """Initialize RecordHttpArchiveFetch.

    Args:
      http_archive: an instance of a HttpArchive
      real_dns_lookup: a function that resolves a host to an IP.
      inject_script: script string to inject in all pages
      cache_misses: instance of CacheMissArchive
    """
    self.http_archive = http_archive
    self.real_http_fetch = RealHttpFetch(real_dns_lookup)
    self.inject_script = inject_script
    self.cache_misses = cache_misses

  def __call__(self, request):
    """Fetch the request and return the response.

    Args:
      request: an ArchivedHttpRequest.
    Returns:
      an ArchivedHttpResponse, or None if the real fetch failed.
    """
    if self.cache_misses:
      self.cache_misses.record_request(
          request, is_record_mode=True, is_cache_miss=False)

    # If request is already in the archive, return the archived response.
    if request in self.http_archive:
      logging.debug('Repeated request found: %s', request)
      response = self.http_archive[request]
    else:
      response = self.real_http_fetch(request)
      if response is None:
        return None
      self.http_archive[request] = response
    if self.inject_script:
      response = _InjectScripts(response, self.inject_script)
    logging.debug('Recorded: %s', request)
    return response
class ReplayHttpArchiveFetch(object):
  """Serve responses from the given HttpArchive."""

  def __init__(self, http_archive, real_dns_lookup, inject_script,
               use_diff_on_unknown_requests=False, cache_misses=None,
               use_closest_match=False, scramble_images=False):
    """Initialize ReplayHttpArchiveFetch.

    Args:
      http_archive: an instance of a HttpArchive
      real_dns_lookup: a function that resolves a host to an IP.
      inject_script: script string to inject in all pages
      use_diff_on_unknown_requests: If True, log unknown requests
        with a diff to requests that look similar.
      cache_misses: Instance of CacheMissArchive.
        Callback updates archive on cache misses
      use_closest_match: If True, on replay mode, serve the closest match
        in the archive instead of giving a 404.
      scramble_images: If True, scramble image responses before serving.
    """
    self.http_archive = http_archive
    self.inject_script = inject_script
    self.use_diff_on_unknown_requests = use_diff_on_unknown_requests
    self.cache_misses = cache_misses
    self.use_closest_match = use_closest_match
    self.scramble_images = scramble_images
    self.real_http_fetch = RealHttpFetch(real_dns_lookup)

  def __call__(self, request):
    """Fetch the request and return the response.

    Args:
      request: an instance of an ArchivedHttpRequest.
    Returns:
      Instance of ArchivedHttpResponse (if found) or None
    """
    # Requests aimed at the local machine bypass the archive entirely.
    if request.host.startswith('127.0.0.1:'):
      return self.real_http_fetch(request)

    response = self.http_archive.get(request)

    if self.use_closest_match and not response:
      closest_request = self.http_archive.find_closest_request(
          request, use_path=True)
      if closest_request:
        response = self.http_archive.get(closest_request)
        if response:
          logging.info('Request not found: %s\nUsing closest match: %s',
                       request, closest_request)

    if self.cache_misses:
      self.cache_misses.record_request(
          request, is_record_mode=False, is_cache_miss=not response)

    if not response:
      reason = str(request)
      if self.use_diff_on_unknown_requests:
        diff = self.http_archive.diff(request)
        if diff:
          reason += (
              "\nNearest request diff "
              "('-' for archived request, '+' for current request):\n%s" % diff)
      logging.warning('Could not replay: %s', reason)
      return None

    if self.inject_script:
      response = _InjectScripts(response, self.inject_script)
    if self.scramble_images:
      response = _ScrambleImages(response)
    return response
class ControllableHttpArchiveFetch(object):
  """Controllable fetch function that can swap between record and replay."""

  def __init__(self, http_archive, real_dns_lookup,
               inject_script, use_diff_on_unknown_requests,
               use_record_mode, cache_misses, use_closest_match,
               scramble_images):
    """Initialize HttpArchiveFetch.

    Args:
      http_archive: an instance of a HttpArchive
      real_dns_lookup: a function that resolves a host to an IP.
      inject_script: script string to inject in all pages.
      use_diff_on_unknown_requests: If True, log unknown requests
        with a diff to requests that look similar.
      use_record_mode: If True, start the server in record mode.
      cache_misses: Instance of CacheMissArchive.
      use_closest_match: If True, on replay mode, serve the closest match
        in the archive instead of giving a 404.
      scramble_images: If True, scramble image responses when replaying.
    """
    self.http_archive = http_archive
    self.record_fetch = RecordHttpArchiveFetch(
        http_archive, real_dns_lookup, inject_script,
        cache_misses)
    self.replay_fetch = ReplayHttpArchiveFetch(
        http_archive, real_dns_lookup, inject_script,
        use_diff_on_unknown_requests, cache_misses,
        use_closest_match, scramble_images)
    # Select the starting mode; this also initializes self.fetch and
    # self.is_record_mode.
    if use_record_mode:
      self.SetRecordMode()
    else:
      self.SetReplayMode()

  def SetRecordMode(self):
    """Switch to record mode: fetches go to the real network."""
    self.fetch = self.record_fetch
    self.is_record_mode = True

  def SetReplayMode(self):
    """Switch to replay mode: fetches are served from the archive."""
    self.fetch = self.replay_fetch
    self.is_record_mode = False

  def __call__(self, *args, **kwargs):
    """Forward calls to Replay/Record fetch functions depending on mode."""
    return self.fetch(*args, **kwargs)