src/chrome/common/extensions/docs/server2/link_error_detector.py

   1 # Copyright 2013 The Chromium Authors. All rights reserved.
   2 # Use of this source code is governed by a BSD-style license that can be
   3 # found in the LICENSE file.
   4
   5 from collections import defaultdict, deque, namedtuple
   6 from HTMLParser import HTMLParser, HTMLParseError
   7 from itertools import groupby
   8 from operator import itemgetter
   9 import posixpath
  10 from urlparse import urlsplit
  11
  12 from file_system_util import CreateURLsFromPaths
  13 from path_util import AssertIsDirectory
  14
  15
  16 Page = namedtuple('Page', 'status, links, anchors, anchor_refs')
  17
  18
  19 def _SplitAnchor(url):
  20   components = urlsplit(url)
  21   return components.path, components.fragment
  22
  23
  24 def _Process(path, renderer):
  25   '''Render the page at |path| using a |renderer| and process the contents of
  26   that page. Returns a |Page| namedtuple with fields for the http status code
  27   of the page render, the href of all the links that occurred on the page, all
  28   of the anchors on the page (ids and names), and all links that contain an
  29   anchor component.
  30
  31   If a non-html page is properly rendered, a |Page| with status code 200 and
  32   all other fields empty is returned.
  33   '''
  34   parser = _ContentParser()
  35   response = renderer(path)
  36
  37   if response.status != 200:
  38     return Page(response.status, (), (), ())
  39   if not path.endswith('.html'):
  40     return Page(200, (), (), ())
  41
  42   try:
  43     parser.feed(str(response.content))
  44   except HTMLParseError:
  45     return Page(200, (), (), ())
  46
  47   links, anchors = parser.links, parser.anchors
  48   if '/' in path:
  49     base, _ = path.rsplit('/', 1)
  50   else:
  51     base = ''
  52   edges = []
  53   anchor_refs = []
  54
  55   # Convert relative links to absolute links and categorize links as edges
  56   # or anchor_refs.
  57   for link in links:
  58     # Files like experimental_history.html are refered to with the URL
  59     # experimental.history.html.
  60     head, last = link.rsplit('/', 1) if '/' in link else ('', link)
  61     last, anchor = _SplitAnchor(last)
  62
  63     if last.endswith('.html') and last.count('.') > 1:
  64       last = last.replace('.', '_', last.count('.') - 1)
  65       link = posixpath.join(head, last)
  66       if anchor:
  67         link = '%s#%s' % (link, anchor)
  68
  69     if link.startswith('#'):
  70       anchor_refs.append(link)
  71     else:
  72       if link.startswith('/'):
  73         link = link[1:]
  74       else:
  75         link = posixpath.normpath('%s/%s' % (base, link))
  76
  77       if '#' in link:
  78         anchor_refs.append(link)
  79       else:
  80         edges.append(link)
  81
  82   return Page(200, edges, anchors, anchor_refs)
  83
  84
  85 class _ContentParser(HTMLParser):
  86   '''Parse an html file pulling out all links and anchor_refs, where an
  87   anchor_ref is a link that contains an anchor.
  88   '''
  89
  90   def __init__(self):
  91     HTMLParser.__init__(self)
  92     self.links = []
  93     self.anchors = set()
  94
  95   def handle_starttag(self, tag, raw_attrs):
  96     attrs = dict(raw_attrs)
  97
  98     if tag == 'a':
  99       # Handle special cases for href's that: start with a space, contain
 100       # just a '.' (period), contain python templating code, are an absolute
 101       # url, are a zip file, or execute javascript on the page.
 102       href = attrs.get('href', '').strip()
 103       if href and not href == '.' and not '{{' in href:
 104         if not urlsplit(href).scheme in ('http', 'https'):
 105           if not href.endswith('.zip') and not 'javascript:' in href:
 106             self.links.append(href)
 107
 108     if attrs.get('id'):
 109       self.anchors.add(attrs['id'])
 110     if attrs.get('name'):
 111       self.anchors.add(attrs['name'])
 112
 113
 114 class LinkErrorDetector(object):
 115   '''Finds link errors on the doc server. This includes broken links, those with
 116   a target page that 404s or contain an anchor that doesn't exist, or pages that
 117   have no links to them.
 118   '''
 119
 120   def __init__(self, file_system, renderer, public_path, root_pages):
 121     '''Creates a new broken link detector. |renderer| is a callable that takes
 122     a path and returns a full html page. |public_path| is the path to public
 123     template files. All URLs in |root_pages| are used as the starting nodes for
 124     the orphaned page search.
 125     '''
 126     AssertIsDirectory(public_path)
 127     self._file_system = file_system
 128     self._renderer = renderer
 129     self._public_path = public_path
 130     self._pages = defaultdict(lambda: Page(404, (), (), ()))
 131     self._root_pages = frozenset(root_pages)
 132     self._always_detached = frozenset((
 133         'apps/404.html',
 134         'extensions/404.html',
 135         'apps/private_apis.html',
 136         'extensions/private_apis.html'))
 137     self._redirection_whitelist = frozenset(('extensions/', 'apps/'))
 138
 139     self._RenderAllPages()
 140
 141   def _RenderAllPages(self):
 142     '''Traverses the public templates directory rendering each URL and
 143     processing the resultant html to pull out all links and anchors.
 144     '''
 145     top_level_directories = (
 146       ('docs/templates/public/', ''),
 147       ('docs/static/', 'static/'),
 148       ('docs/examples/', 'extensions/examples/'),
 149     )
 150
 151     for dirpath, urlprefix in top_level_directories:
 152       files = CreateURLsFromPaths(self._file_system, dirpath, urlprefix)
 153       for url, path in files:
 154         self._pages[url] = _Process(url, self._renderer)
 155
 156         if self._pages[url].status != 200:
 157           print(url, ', a url derived from the path', dirpath +
 158               ', resulted in a', self._pages[url].status)
 159
 160   def _FollowRedirections(self, starting_url, limit=4):
 161     '''Follow redirection until a non-redirectable page is reached. Start at
 162     |starting_url| which must return a 301 or 302 status code.
 163
 164     Return a tuple of: the status of rendering |staring_url|, the final url,
 165     and a list of the pages reached including |starting_url|. If no redirection
 166     occurred, returns (None, None, None).
 167     '''
 168     pages_reached = [starting_url]
 169     redirect_link = None
 170     target_page = self._renderer(starting_url)
 171     original_status = status = target_page.status
 172     count = 0
 173
 174     while status in (301, 302):
 175       if count > limit:
 176         return None, None, None
 177       redirect_link = target_page.headers.get('Location')
 178       target_page = self._renderer(redirect_link)
 179       status = target_page.status
 180       pages_reached.append(redirect_link)
 181       count += 1
 182
 183     if redirect_link is None:
 184       return None, None, None
 185
 186     return original_status, redirect_link, pages_reached
 187
 188   def _CategorizeBrokenLinks(self, url, page, pages):
 189     '''Find all broken links on a page and create appropriate notes describing
 190     why tehy are broken (broken anchor, target redirects, etc). |page| is the
 191     current page being checked and is the result of rendering |url|. |pages|
 192     is a callable that takes a path and returns a Page.
 193     '''
 194     broken_links = []
 195
 196     for link in page.links + page.anchor_refs:
 197       components = urlsplit(link)
 198       fragment = components.fragment
 199
 200       if components.path == '':
 201         if fragment == 'top' or fragment == '':
 202           continue
 203         if not fragment in page.anchors:
 204           broken_links.append((200, url, link, 'target anchor not found'))
 205       else:
 206         # Render the target page
 207         target_page = pages(components.path)
 208
 209         if target_page.status != 200:
 210           if components.path in self._redirection_whitelist:
 211             continue
 212
 213           status, relink, _ = self._FollowRedirections(components.path)
 214           if relink:
 215             broken_links.append((
 216                 status,
 217                 url,
 218                 link,
 219                 'redirects to %s' % relink))
 220           else:
 221             broken_links.append((
 222                 target_page.status, url, link, 'target page not found'))
 223
 224         elif fragment:
 225           if not fragment in target_page.anchors:
 226             broken_links.append((
 227                 target_page.status, url, link, 'target anchor not found'))
 228
 229     return broken_links
 230
 231   def GetBrokenLinks(self):
 232     '''Find all broken links. A broken link is a link that leads to a page
 233     that does not exist (404s), redirects to another page (301 or 302), or
 234     has an anchor whose target does not exist.
 235
 236     Returns a list of tuples of four elements: status, url, target_page,
 237     notes.
 238     '''
 239     broken_links = []
 240
 241     for url in self._pages.keys():
 242       page = self._pages[url]
 243       if page.status != 200:
 244         continue
 245       broken_links.extend(self._CategorizeBrokenLinks(
 246           url, page, lambda x: self._pages[x]))
 247
 248     return broken_links
 249
 250   def GetOrphanedPages(self):
 251     '''Crawls the server find all pages that are connected to the pages at
 252     |seed_url|s. Return the links that are valid on the server but are not in
 253     part of the connected component containing the |root_pages|. These pages
 254     are orphans and cannot be reached simply by clicking through the server.
 255     '''
 256     pages_to_check = deque(self._root_pages.union(self._always_detached))
 257     found = set(self._root_pages) | self._always_detached
 258
 259     while pages_to_check:
 260       item = pages_to_check.popleft()
 261       target_page = self._pages[item]
 262
 263       if target_page.status != 200:
 264         redirected_page = self._FollowRedirections(item)[1]
 265         if not redirected_page is None:
 266           target_page = self._pages[redirected_page]
 267
 268       for link in target_page.links:
 269         if link not in found:
 270           found.add(link)
 271           pages_to_check.append(link)
 272
 273     all_urls = set(
 274         [url for url, page in self._pages.iteritems() if page.status == 200])
 275
 276     return [url for url in all_urls - found if url.endswith('.html')]
 277
 278
 279 def StringifyBrokenLinks(broken_links):
 280   '''Prints out broken links in a more readable format.
 281   '''
 282   def fixed_width(string, width):
 283     return "%s%s" % (string, (width - len(string)) * ' ')
 284
 285   first_col_width = max(len(link[1]) for link in broken_links)
 286   second_col_width = max(len(link[2]) for link in broken_links)
 287   target = itemgetter(2)
 288   output = []
 289
 290   def pretty_print(link, col_offset=0):
 291     return "%s -> %s %s" % (
 292         fixed_width(link[1], first_col_width - col_offset),
 293         fixed_width(link[2], second_col_width),
 294         link[3])
 295
 296   for target, links in groupby(sorted(broken_links, key=target), target):
 297     links = list(links)
 298     # Compress messages
 299     if len(links) > 50 and not links[0][2].startswith('#'):
 300       message = "Found %d broken links (" % len(links)
 301       output.append("%s%s)" % (message, pretty_print(links[0], len(message))))
 302     else:
 303       for link in links:
 304         output.append(pretty_print(link))
 305
 306   return '\n'.join(output)