src/lxml/html/clean.py

   1 # cython: language_level=2
   2
   3 """A cleanup tool for HTML.
   4
   5 Removes unwanted tags and content.  See the `Cleaner` class for
   6 details.
   7 """
   8
   9 from __future__ import absolute_import
  10
  11 import re
  12 import copy
  13 try:
  14     from urlparse import urlsplit
  15     from urllib import unquote_plus
  16 except ImportError:
  17     # Python 3
  18     from urllib.parse import urlsplit, unquote_plus
  19 from lxml import etree
  20 from lxml.html import defs
  21 from lxml.html import fromstring, XHTML_NAMESPACE
  22 from lxml.html import xhtml_to_html, _transform_result
  23
  24 try:
  25     unichr
  26 except NameError:
  27     # Python 3
  28     unichr = chr
  29 try:
  30     unicode
  31 except NameError:
  32     # Python 3
  33     unicode = str
  34 try:
  35     basestring
  36 except NameError:
  37     basestring = (str, bytes)
  38
  39
  40 __all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
  41            'word_break', 'word_break_html']
  42
  43 # Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl
  44 #   Particularly the CSS cleaning; most of the tag cleaning is integrated now
  45 # I have multiple kinds of schemes searched; but should schemes be
  46 #   whitelisted instead?
  47 # max height?
  48 # remove images?  Also in CSS?  background attribute?
  49 # Some way to whitelist object, iframe, etc (e.g., if you want to
  50 #   allow *just* embedded YouTube movies)
  51 # Log what was deleted and why?
  52 # style="behavior: ..." might be bad in IE?
  53 # Should we have something for just <meta http-equiv>?  That's the worst of the
  54 #   metas.
  55 # UTF-7 detections?  Example:
  56 #     <HEAD><META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-7"> </HEAD>+ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4-
  57 #   you don't always have to have the charset set, if the page has no charset
  58 #   and there's UTF7-like code in it.
  59 # Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php
  60
  61
  62 # This is an IE-specific construct you can have in a stylesheet to
  63 # run some Javascript:
  64 _css_javascript_re = re.compile(
  65     r'expression\s*\(.*?\)', re.S|re.I)
  66
  67 # Do I have to worry about @\nimport?
  68 _css_import_re = re.compile(
  69     r'@\s*import', re.I)
  70
  71 # All kinds of schemes besides just javascript: that can cause
  72 # execution:
  73 _is_image_dataurl = re.compile(
  74     r'^data:image/.+;base64', re.I).search
  75 _is_possibly_malicious_scheme = re.compile(
  76     r'(?:javascript|jscript|livescript|vbscript|data|about|mocha):',
  77     re.I).search
  78 def _is_javascript_scheme(s):
  79     if _is_image_dataurl(s):
  80         return None
  81     return _is_possibly_malicious_scheme(s)
  82
  83 _substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub
  84 # FIXME: should data: be blocked?
  85
  86 # FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx
  87 _conditional_comment_re = re.compile(
  88     r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)
  89
  90 _find_styled_elements = etree.XPath(
  91     "descendant-or-self::*[@style]")
  92
  93 _find_external_links = etree.XPath(
  94     ("descendant-or-self::a  [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |"
  95      "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"),
  96     namespaces={'x':XHTML_NAMESPACE})
  97
  98
  99 class Cleaner(object):
 100     """
 101     Instances cleans the document of each of the possible offending
 102     elements.  The cleaning is controlled by attributes; you can
 103     override attributes in a subclass, or set them in the constructor.
 104
 105     ``scripts``:
 106         Removes any ``<script>`` tags.
 107
 108     ``javascript``:
 109         Removes any Javascript, like an ``onclick`` attribute. Also removes stylesheets
 110         as they could contain Javascript.
 111
 112     ``comments``:
 113         Removes any comments.
 114
 115     ``style``:
 116         Removes any style tags.
 117
 118     ``inline_style``
 119         Removes any style attributes.  Defaults to the value of the ``style`` option.
 120
 121     ``links``:
 122         Removes any ``<link>`` tags
 123
 124     ``meta``:
 125         Removes any ``<meta>`` tags
 126
 127     ``page_structure``:
 128         Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.
 129
 130     ``processing_instructions``:
 131         Removes any processing instructions.
 132
 133     ``embedded``:
 134         Removes any embedded objects (flash, iframes)
 135
 136     ``frames``:
 137         Removes any frame-related tags
 138
 139     ``forms``:
 140         Removes any form tags
 141
 142     ``annoying_tags``:
 143         Tags that aren't *wrong*, but are annoying.  ``<blink>`` and ``<marquee>``
 144
 145     ``remove_tags``:
 146         A list of tags to remove.  Only the tags will be removed,
 147         their content will get pulled up into the parent tag.
 148
 149     ``kill_tags``:
 150         A list of tags to kill.  Killing also removes the tag's content,
 151         i.e. the whole subtree, not just the tag itself.
 152
 153     ``allow_tags``:
 154         A list of tags to include (default include all).
 155
 156     ``remove_unknown_tags``:
 157         Remove any tags that aren't standard parts of HTML.
 158
 159     ``safe_attrs_only``:
 160         If true, only include 'safe' attributes (specifically the list
 161         from the feedparser HTML sanitisation web site).
 162
 163     ``safe_attrs``:
 164         A set of attribute names to override the default list of attributes
 165         considered 'safe' (when safe_attrs_only=True).
 166
 167     ``add_nofollow``:
 168         If true, then any <a> tags will have ``rel="nofollow"`` added to them.
 169
 170     ``host_whitelist``:
 171         A list or set of hosts that you can use for embedded content
 172         (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
 173         You can also implement/override the method
 174         ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
 175         implement more complex rules for what can be embedded.
 176         Anything that passes this test will be shown, regardless of
 177         the value of (for instance) ``embedded``.
 178
 179         Note that this parameter might not work as intended if you do not
 180         make the links absolute before doing the cleaning.
 181
 182         Note that you may also need to set ``whitelist_tags``.
 183
 184     ``whitelist_tags``:
 185         A set of tags that can be included with ``host_whitelist``.
 186         The default is ``iframe`` and ``embed``; you may wish to
 187         include other tags like ``script``, or you may want to
 188         implement ``allow_embedded_url`` for more control.  Set to None to
 189         include all tags.
 190
 191     This modifies the document *in place*.
 192     """
 193
 194     scripts = True
 195     javascript = True
 196     comments = True
 197     style = False
 198     inline_style = None
 199     links = True
 200     meta = True
 201     page_structure = True
 202     processing_instructions = True
 203     embedded = True
 204     frames = True
 205     forms = True
 206     annoying_tags = True
 207     remove_tags = None
 208     allow_tags = None
 209     kill_tags = None
 210     remove_unknown_tags = True
 211     safe_attrs_only = True
 212     safe_attrs = defs.safe_attrs
 213     add_nofollow = False
 214     host_whitelist = ()
 215     whitelist_tags = {'iframe', 'embed'}
 216
 217     def __init__(self, **kw):
 218         for name, value in kw.items():
 219             if not hasattr(self, name):
 220                 raise TypeError(
 221                     "Unknown parameter: %s=%r" % (name, value))
 222             setattr(self, name, value)
 223         if self.inline_style is None and 'inline_style' not in kw:
 224             self.inline_style = self.style
 225
 226     # Used to lookup the primary URL for a given tag that is up for
 227     # removal:
 228     _tag_link_attrs = dict(
 229         script='src',
 230         link='href',
 231         # From: http://java.sun.com/j2se/1.4.2/docs/guide/misc/applet.html
 232         # From what I can tell, both attributes can contain a link:
 233         applet=['code', 'object'],
 234         iframe='src',
 235         embed='src',
 236         layer='src',
 237         # FIXME: there doesn't really seem like a general way to figure out what
 238         # links an <object> tag uses; links often go in <param> tags with values
 239         # that we don't really know.  You'd have to have knowledge about specific
 240         # kinds of plugins (probably keyed off classid), and match against those.
 241         ##object=?,
 242         # FIXME: not looking at the action currently, because it is more complex
 243         # than than -- if you keep the form, you should keep the form controls.
 244         ##form='action',
 245         a='href',
 246         )
 247
 248     def __call__(self, doc):
 249         """
 250         Cleans the document.
 251         """
 252         if hasattr(doc, 'getroot'):
 253             # ElementTree instance, instead of an element
 254             doc = doc.getroot()
 255         # convert XHTML to HTML
 256         xhtml_to_html(doc)
 257         # Normalize a case that IE treats <image> like <img>, and that
 258         # can confuse either this step or later steps.
 259         for el in doc.iter('image'):
 260             el.tag = 'img'
 261         if not self.comments:
 262             # Of course, if we were going to kill comments anyway, we don't
 263             # need to worry about this
 264             self.kill_conditional_comments(doc)
 265
 266         kill_tags = set(self.kill_tags or ())
 267         remove_tags = set(self.remove_tags or ())
 268         allow_tags = set(self.allow_tags or ())
 269
 270         if self.scripts:
 271             kill_tags.add('script')
 272         if self.safe_attrs_only:
 273             safe_attrs = set(self.safe_attrs)
 274             for el in doc.iter(etree.Element):
 275                 attrib = el.attrib
 276                 for aname in attrib.keys():
 277                     if aname not in safe_attrs:
 278                         del attrib[aname]
 279         if self.javascript:
 280             if not (self.safe_attrs_only and
 281                     self.safe_attrs == defs.safe_attrs):
 282                 # safe_attrs handles events attributes itself
 283                 for el in doc.iter(etree.Element):
 284                     attrib = el.attrib
 285                     for aname in attrib.keys():
 286                         if aname.startswith('on'):
 287                             del attrib[aname]
 288             doc.rewrite_links(self._remove_javascript_link,
 289                               resolve_base_href=False)
 290             # If we're deleting style then we don't have to remove JS links
 291             # from styles, otherwise...
 292             if not self.inline_style:
 293                 for el in _find_styled_elements(doc):
 294                     old = el.get('style')
 295                     new = _css_javascript_re.sub('', old)
 296                     new = _css_import_re.sub('', new)
 297                     if self._has_sneaky_javascript(new):
 298                         # Something tricky is going on...
 299                         del el.attrib['style']
 300                     elif new != old:
 301                         el.set('style', new)
 302             if not self.style:
 303                 for el in list(doc.iter('style')):
 304                     if el.get('type', '').lower().strip() == 'text/javascript':
 305                         el.drop_tree()
 306                         continue
 307                     old = el.text or ''
 308                     new = _css_javascript_re.sub('', old)
 309                     # The imported CSS can do anything; we just can't allow:
 310                     new = _css_import_re.sub('', old)
 311                     if self._has_sneaky_javascript(new):
 312                         # Something tricky is going on...
 313                         el.text = '/* deleted */'
 314                     elif new != old:
 315                         el.text = new
 316         if self.comments or self.processing_instructions:
 317             # FIXME: why either?  I feel like there's some obscure reason
 318             # because you can put PIs in comments...?  But I've already
 319             # forgotten it
 320             kill_tags.add(etree.Comment)
 321         if self.processing_instructions:
 322             kill_tags.add(etree.ProcessingInstruction)
 323         if self.style:
 324             kill_tags.add('style')
 325         if self.inline_style:
 326             etree.strip_attributes(doc, 'style')
 327         if self.links:
 328             kill_tags.add('link')
 329         elif self.style or self.javascript:
 330             # We must get rid of included stylesheets if Javascript is not
 331             # allowed, as you can put Javascript in them
 332             for el in list(doc.iter('link')):
 333                 if 'stylesheet' in el.get('rel', '').lower():
 334                     # Note this kills alternate stylesheets as well
 335                     if not self.allow_element(el):
 336                         el.drop_tree()
 337         if self.meta:
 338             kill_tags.add('meta')
 339         if self.page_structure:
 340             remove_tags.update(('head', 'html', 'title'))
 341         if self.embedded:
 342             # FIXME: is <layer> really embedded?
 343             # We should get rid of any <param> tags not inside <applet>;
 344             # These are not really valid anyway.
 345             for el in list(doc.iter('param')):
 346                 found_parent = False
 347                 parent = el.getparent()
 348                 while parent is not None and parent.tag not in ('applet', 'object'):
 349                     parent = parent.getparent()
 350                 if parent is None:
 351                     el.drop_tree()
 352             kill_tags.update(('applet',))
 353             # The alternate contents that are in an iframe are a good fallback:
 354             remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
 355         if self.frames:
 356             # FIXME: ideally we should look at the frame links, but
 357             # generally frames don't mix properly with an HTML
 358             # fragment anyway.
 359             kill_tags.update(defs.frame_tags)
 360         if self.forms:
 361             remove_tags.add('form')
 362             kill_tags.update(('button', 'input', 'select', 'textarea'))
 363         if self.annoying_tags:
 364             remove_tags.update(('blink', 'marquee'))
 365
 366         _remove = []
 367         _kill = []
 368         for el in doc.iter():
 369             if el.tag in kill_tags:
 370                 if self.allow_element(el):
 371                     continue
 372                 _kill.append(el)
 373             elif el.tag in remove_tags:
 374                 if self.allow_element(el):
 375                     continue
 376                 _remove.append(el)
 377
 378         if _remove and _remove[0] == doc:
 379             # We have to drop the parent-most tag, which we can't
 380             # do.  Instead we'll rewrite it:
 381             el = _remove.pop(0)
 382             el.tag = 'div'
 383             el.attrib.clear()
 384         elif _kill and _kill[0] == doc:
 385             # We have to drop the parent-most element, which we can't
 386             # do.  Instead we'll clear it:
 387             el = _kill.pop(0)
 388             if el.tag != 'html':
 389                 el.tag = 'div'
 390             el.clear()
 391
 392         _kill.reverse() # start with innermost tags
 393         for el in _kill:
 394             el.drop_tree()
 395         for el in _remove:
 396             el.drop_tag()
 397
 398         if self.remove_unknown_tags:
 399             if allow_tags:
 400                 raise ValueError(
 401                     "It does not make sense to pass in both allow_tags and remove_unknown_tags")
 402             allow_tags = set(defs.tags)
 403         if allow_tags:
 404             bad = []
 405             for el in doc.iter():
 406                 if el.tag not in allow_tags:
 407                     bad.append(el)
 408             if bad:
 409                 if bad[0] is doc:
 410                     el = bad.pop(0)
 411                     el.tag = 'div'
 412                     el.attrib.clear()
 413                 for el in bad:
 414                     el.drop_tag()
 415         if self.add_nofollow:
 416             for el in _find_external_links(doc):
 417                 if not self.allow_follow(el):
 418                     rel = el.get('rel')
 419                     if rel:
 420                         if ('nofollow' in rel
 421                                 and ' nofollow ' in (' %s ' % rel)):
 422                             continue
 423                         rel = '%s nofollow' % rel
 424                     else:
 425                         rel = 'nofollow'
 426                     el.set('rel', rel)
 427
 428     def allow_follow(self, anchor):
 429         """
 430         Override to suppress rel="nofollow" on some anchors.
 431         """
 432         return False
 433
 434     def allow_element(self, el):
 435         """
 436         Decide whether an element is configured to be accepted or rejected.
 437
 438         :param el: an element.
 439         :return: true to accept the element or false to reject/discard it.
 440         """
 441         if el.tag not in self._tag_link_attrs:
 442             return False
 443         attr = self._tag_link_attrs[el.tag]
 444         if isinstance(attr, (list, tuple)):
 445             for one_attr in attr:
 446                 url = el.get(one_attr)
 447                 if not url:
 448                     return False
 449                 if not self.allow_embedded_url(el, url):
 450                     return False
 451             return True
 452         else:
 453             url = el.get(attr)
 454             if not url:
 455                 return False
 456             return self.allow_embedded_url(el, url)
 457
 458     def allow_embedded_url(self, el, url):
 459         """
 460         Decide whether a URL that was found in an element's attributes or text
 461         if configured to be accepted or rejected.
 462
 463         :param el: an element.
 464         :param url: a URL found on the element.
 465         :return: true to accept the URL and false to reject it.
 466         """
 467         if self.whitelist_tags is not None and el.tag not in self.whitelist_tags:
 468             return False
 469         scheme, netloc, path, query, fragment = urlsplit(url)
 470         netloc = netloc.lower().split(':', 1)[0]
 471         if scheme not in ('http', 'https'):
 472             return False
 473         if netloc in self.host_whitelist:
 474             return True
 475         return False
 476
 477     def kill_conditional_comments(self, doc):
 478         """
 479         IE conditional comments basically embed HTML that the parser
 480         doesn't normally see.  We can't allow anything like that, so
 481         we'll kill any comments that could be conditional.
 482         """
 483         bad = []
 484         self._kill_elements(
 485             doc, lambda el: _conditional_comment_re.search(el.text),
 486             etree.Comment)
 487
 488     def _kill_elements(self, doc, condition, iterate=None):
 489         bad = []
 490         for el in doc.iter(iterate):
 491             if condition(el):
 492                 bad.append(el)
 493         for el in bad:
 494             el.drop_tree()
 495
 496     def _remove_javascript_link(self, link):
 497         # links like "j a v a s c r i p t:" might be interpreted in IE
 498         new = _substitute_whitespace('', unquote_plus(link))
 499         if _is_javascript_scheme(new):
 500             # FIXME: should this be None to delete?
 501             return ''
 502         return link
 503
 504     _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub
 505
 506     def _has_sneaky_javascript(self, style):
 507         """
 508         Depending on the browser, stuff like ``e x p r e s s i o n(...)``
 509         can get interpreted, or ``expre/* stuff */ssion(...)``.  This
 510         checks for attempt to do stuff like this.
 511
 512         Typically the response will be to kill the entire style; if you
 513         have just a bit of Javascript in the style another rule will catch
 514         that and remove only the Javascript from the style; this catches
 515         more sneaky attempts.
 516         """
 517         style = self._substitute_comments('', style)
 518         style = style.replace('\\', '')
 519         style = _substitute_whitespace('', style)
 520         style = style.lower()
 521         if 'javascript:' in style:
 522             return True
 523         if 'expression(' in style:
 524             return True
 525         return False
 526
 527     def clean_html(self, html):
 528         result_type = type(html)
 529         if isinstance(html, basestring):
 530             doc = fromstring(html)
 531         else:
 532             doc = copy.deepcopy(html)
 533         self(doc)
 534         return _transform_result(result_type, doc)
 535
 536 clean = Cleaner()
 537 clean_html = clean.clean_html
 538
 539 ############################################################
 540 ## Autolinking
 541 ############################################################
 542
 543 _link_regexes = [
 544     re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I),
 545     # This is conservative, but autolinking can be a bit conservative:
 546     re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_.-]+[a-z]))', re.I),
 547     ]
 548
 549 _avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']
 550
 551 _avoid_hosts = [
 552     re.compile(r'^localhost', re.I),
 553     re.compile(r'\bexample\.(?:com|org|net)$', re.I),
 554     re.compile(r'^127\.0\.0\.1$'),
 555     ]
 556
 557 _avoid_classes = ['nolink']
 558
 559 def autolink(el, link_regexes=_link_regexes,
 560              avoid_elements=_avoid_elements,
 561              avoid_hosts=_avoid_hosts,
 562              avoid_classes=_avoid_classes):
 563     """
 564     Turn any URLs into links.
 565
 566     It will search for links identified by the given regular
 567     expressions (by default mailto and http(s) links).
 568
 569     It won't link text in an element in avoid_elements, or an element
 570     with a class in avoid_classes.  It won't link to anything with a
 571     host that matches one of the regular expressions in avoid_hosts
 572     (default localhost and 127.0.0.1).
 573
 574     If you pass in an element, the element's tail will not be
 575     substituted, only the contents of the element.
 576     """
 577     if el.tag in avoid_elements:
 578         return
 579     class_name = el.get('class')
 580     if class_name:
 581         class_name = class_name.split()
 582         for match_class in avoid_classes:
 583             if match_class in class_name:
 584                 return
 585     for child in list(el):
 586         autolink(child, link_regexes=link_regexes,
 587                  avoid_elements=avoid_elements,
 588                  avoid_hosts=avoid_hosts,
 589                  avoid_classes=avoid_classes)
 590         if child.tail:
 591             text, tail_children = _link_text(
 592                 child.tail, link_regexes, avoid_hosts, factory=el.makeelement)
 593             if tail_children:
 594                 child.tail = text
 595                 index = el.index(child)
 596                 el[index+1:index+1] = tail_children
 597     if el.text:
 598         text, pre_children = _link_text(
 599             el.text, link_regexes, avoid_hosts, factory=el.makeelement)
 600         if pre_children:
 601             el.text = text
 602             el[:0] = pre_children
 603
 604 def _link_text(text, link_regexes, avoid_hosts, factory):
 605     leading_text = ''
 606     links = []
 607     last_pos = 0
 608     while 1:
 609         best_match, best_pos = None, None
 610         for regex in link_regexes:
 611             regex_pos = last_pos
 612             while 1:
 613                 match = regex.search(text, pos=regex_pos)
 614                 if match is None:
 615                     break
 616                 host = match.group('host')
 617                 for host_regex in avoid_hosts:
 618                     if host_regex.search(host):
 619                         regex_pos = match.end()
 620                         break
 621                 else:
 622                     break
 623             if match is None:
 624                 continue
 625             if best_pos is None or match.start() < best_pos:
 626                 best_match = match
 627                 best_pos = match.start()
 628         if best_match is None:
 629             # No more matches
 630             if links:
 631                 assert not links[-1].tail
 632                 links[-1].tail = text
 633             else:
 634                 assert not leading_text
 635                 leading_text = text
 636             break
 637         link = best_match.group(0)
 638         end = best_match.end()
 639         if link.endswith('.') or link.endswith(','):
 640             # These punctuation marks shouldn't end a link
 641             end -= 1
 642             link = link[:-1]
 643         prev_text = text[:best_match.start()]
 644         if links:
 645             assert not links[-1].tail
 646             links[-1].tail = prev_text
 647         else:
 648             assert not leading_text
 649             leading_text = prev_text
 650         anchor = factory('a')
 651         anchor.set('href', link)
 652         body = best_match.group('body')
 653         if not body:
 654             body = link
 655         if body.endswith('.') or body.endswith(','):
 656             body = body[:-1]
 657         anchor.text = body
 658         links.append(anchor)
 659         text = text[end:]
 660     return leading_text, links
 661
 662 def autolink_html(html, *args, **kw):
 663     result_type = type(html)
 664     if isinstance(html, basestring):
 665         doc = fromstring(html)
 666     else:
 667         doc = copy.deepcopy(html)
 668     autolink(doc, *args, **kw)
 669     return _transform_result(result_type, doc)
 670
 671 autolink_html.__doc__ = autolink.__doc__
 672
 673 ############################################################
 674 ## Word wrapping
 675 ############################################################
 676
 677 _avoid_word_break_elements = ['pre', 'textarea', 'code']
 678 _avoid_word_break_classes = ['nobreak']
 679
 680 def word_break(el, max_width=40,
 681                avoid_elements=_avoid_word_break_elements,
 682                avoid_classes=_avoid_word_break_classes,
 683                break_character=unichr(0x200b)):
 684     """
 685     Breaks any long words found in the body of the text (not attributes).
 686
 687     Doesn't effect any of the tags in avoid_elements, by default
 688     ``<textarea>`` and ``<pre>``
 689
 690     Breaks words by inserting &#8203;, which is a unicode character
 691     for Zero Width Space character.  This generally takes up no space
 692     in rendering, but does copy as a space, and in monospace contexts
 693     usually takes up space.
 694
 695     See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
 696     """
 697     # Character suggestion of &#8203 comes from:
 698     #   http://www.cs.tut.fi/~jkorpela/html/nobr.html
 699     if el.tag in _avoid_word_break_elements:
 700         return
 701     class_name = el.get('class')
 702     if class_name:
 703         dont_break = False
 704         class_name = class_name.split()
 705         for avoid in avoid_classes:
 706             if avoid in class_name:
 707                 dont_break = True
 708                 break
 709         if dont_break:
 710             return
 711     if el.text:
 712         el.text = _break_text(el.text, max_width, break_character)
 713     for child in el:
 714         word_break(child, max_width=max_width,
 715                    avoid_elements=avoid_elements,
 716                    avoid_classes=avoid_classes,
 717                    break_character=break_character)
 718         if child.tail:
 719             child.tail = _break_text(child.tail, max_width, break_character)
 720
 721 def word_break_html(html, *args, **kw):
 722     result_type = type(html)
 723     doc = fromstring(html)
 724     word_break(doc, *args, **kw)
 725     return _transform_result(result_type, doc)
 726
 727 def _break_text(text, max_width, break_character):
 728     words = text.split()
 729     for word in words:
 730         if len(word) > max_width:
 731             replacement = _insert_break(word, max_width, break_character)
 732             text = text.replace(word, replacement)
 733     return text
 734
 735 _break_prefer_re = re.compile(r'[^a-z]', re.I)
 736
 737 def _insert_break(word, width, break_character):
 738     orig_word = word
 739     result = ''
 740     while len(word) > width:
 741         start = word[:width]
 742         breaks = list(_break_prefer_re.finditer(start))
 743         if breaks:
 744             last_break = breaks[-1]
 745             # Only walk back up to 10 characters to find a nice break:
 746             if last_break.end() > width-10:
 747                 # FIXME: should the break character be at the end of the
 748                 # chunk, or the beginning of the next chunk?
 749                 start = word[:last_break.end()]
 750         result += start + break_character
 751         word = word[len(start):]
 752     result += word
 753     return result
 754