src/lxml/html/clean.py

   1 # cython: language_level=3str
   2
   3 """A cleanup tool for HTML.
   4
   5 Removes unwanted tags and content.  See the `Cleaner` class for
   6 details.
   7 """
   8
   9 from __future__ import absolute_import
  10
  11 import re
  12 import copy
  13 try:
  14     from urlparse import urlsplit
  15     from urllib import unquote_plus
  16 except ImportError:
  17     # Python 3
  18     from urllib.parse import urlsplit, unquote_plus
  19 from lxml import etree
  20 from lxml.html import defs
  21 from lxml.html import fromstring, XHTML_NAMESPACE
  22 from lxml.html import xhtml_to_html, _transform_result
  23
# Python 2/3 compatibility aliases.  Each bare name below is probed; when
# the lookup raises NameError (marked "Python 3" here) a substitute is bound
# so the rest of the module can use the Python 2 spelling.
try:
    unichr
except NameError:
    # Python 3
    unichr = chr
try:
    unicode
except NameError:
    # Python 3
    unicode = str
try:
    basestring
except NameError:
    # No common string base class available: use a tuple, which works as
    # the second argument of isinstance()
    basestring = (str, bytes)
  38
  39
  40 __all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
  41            'word_break', 'word_break_html']
  42
  43 # Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl
  44 #   Particularly the CSS cleaning; most of the tag cleaning is integrated now
  45 # I have multiple kinds of schemes searched; but should schemes be
  46 #   whitelisted instead?
  47 # max height?
  48 # remove images?  Also in CSS?  background attribute?
  49 # Some way to whitelist object, iframe, etc (e.g., if you want to
  50 #   allow *just* embedded YouTube movies)
  51 # Log what was deleted and why?
  52 # style="behavior: ..." might be bad in IE?
  53 # Should we have something for just <meta http-equiv>?  That's the worst of the
  54 #   metas.
  55 # UTF-7 detections?  Example:
  56 #     <HEAD><META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-7"> </HEAD>+ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4-
  57 #   you don't always have to have the charset set, if the page has no charset
  58 #   and there's UTF7-like code in it.
  59 # Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php
  60
  61
  62 # This is an IE-specific construct you can have in a stylesheet to
  63 # run some Javascript:
  64 _css_javascript_re = re.compile(
  65     r'expression\s*\(.*?\)', re.S|re.I)
  66
  67 # Do I have to worry about @\nimport?
  68 _css_import_re = re.compile(
  69     r'@\s*import', re.I)
  70
  71 # All kinds of schemes besides just javascript: that can cause
  72 # execution:
  73 _is_image_dataurl = re.compile(
  74     r'^data:image/.+;base64', re.I).search
  75 _is_possibly_malicious_scheme = re.compile(
  76     r'(?:javascript|jscript|livescript|vbscript|data|about|mocha):',
  77     re.I).search
  78 def _is_javascript_scheme(s):
  79     if _is_image_dataurl(s):
  80         return None
  81     return _is_possibly_malicious_scheme(s)
  82
  83 _substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub
  84 # FIXME: should data: be blocked?
  85
  86 # FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx
  87 _conditional_comment_re = re.compile(
  88     r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)
  89
# Selects the context element and all of its descendants that carry a
# ``style`` attribute.
_find_styled_elements = etree.XPath(
    "descendant-or-self::*[@style]")

# Selects <a> elements (un-namespaced or in the XHTML namespace) whose
# whitespace-normalized ``href`` is non-empty and does not start with '#'.
_find_external_links = etree.XPath(
    ("descendant-or-self::a  [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |"
     "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"),
    namespaces={'x':XHTML_NAMESPACE})
  97
  98
  99 class Cleaner(object):
 100     """
 101     Instances cleans the document of each of the possible offending
 102     elements.  The cleaning is controlled by attributes; you can
 103     override attributes in a subclass, or set them in the constructor.
 104
 105     ``scripts``:
 106         Removes any ``<script>`` tags.
 107
 108     ``javascript``:
 109         Removes any Javascript, like an ``onclick`` attribute. Also removes stylesheets
 110         as they could contain Javascript.
 111
 112     ``comments``:
 113         Removes any comments.
 114
 115     ``style``:
 116         Removes any style tags.
 117
 118     ``inline_style``
 119         Removes any style attributes.  Defaults to the value of the ``style`` option.
 120
 121     ``links``:
 122         Removes any ``<link>`` tags
 123
 124     ``meta``:
 125         Removes any ``<meta>`` tags
 126
 127     ``page_structure``:
 128         Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.
 129
 130     ``processing_instructions``:
 131         Removes any processing instructions.
 132
 133     ``embedded``:
 134         Removes any embedded objects (flash, iframes)
 135
 136     ``frames``:
 137         Removes any frame-related tags
 138
 139     ``forms``:
 140         Removes any form tags
 141
 142     ``annoying_tags``:
 143         Tags that aren't *wrong*, but are annoying.  ``<blink>`` and ``<marquee>``
 144
 145     ``remove_tags``:
 146         A list of tags to remove.  Only the tags will be removed,
 147         their content will get pulled up into the parent tag.
 148
 149     ``kill_tags``:
 150         A list of tags to kill.  Killing also removes the tag's content,
 151         i.e. the whole subtree, not just the tag itself.
 152
 153     ``allow_tags``:
 154         A list of tags to include (default include all).
 155
 156     ``remove_unknown_tags``:
 157         Remove any tags that aren't standard parts of HTML.
 158
 159     ``safe_attrs_only``:
 160         If true, only include 'safe' attributes (specifically the list
 161         from the feedparser HTML sanitisation web site).
 162
 163     ``safe_attrs``:
 164         A set of attribute names to override the default list of attributes
 165         considered 'safe' (when safe_attrs_only=True).
 166
 167     ``add_nofollow``:
 168         If true, then any <a> tags will have ``rel="nofollow"`` added to them.
 169
 170     ``host_whitelist``:
 171         A list or set of hosts that you can use for embedded content
 172         (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
 173         You can also implement/override the method
 174         ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
 175         implement more complex rules for what can be embedded.
 176         Anything that passes this test will be shown, regardless of
 177         the value of (for instance) ``embedded``.
 178
 179         Note that this parameter might not work as intended if you do not
 180         make the links absolute before doing the cleaning.
 181
 182         Note that you may also need to set ``whitelist_tags``.
 183
 184     ``whitelist_tags``:
 185         A set of tags that can be included with ``host_whitelist``.
 186         The default is ``iframe`` and ``embed``; you may wish to
 187         include other tags like ``script``, or you may want to
 188         implement ``allow_embedded_url`` for more control.  Set to None to
 189         include all tags.
 190
 191     This modifies the document *in place*.
 192     """
 193
 194     scripts = True
 195     javascript = True
 196     comments = True
 197     style = False
 198     inline_style = None
 199     links = True
 200     meta = True
 201     page_structure = True
 202     processing_instructions = True
 203     embedded = True
 204     frames = True
 205     forms = True
 206     annoying_tags = True
 207     remove_tags = None
 208     allow_tags = None
 209     kill_tags = None
 210     remove_unknown_tags = True
 211     safe_attrs_only = True
 212     safe_attrs = defs.safe_attrs
 213     add_nofollow = False
 214     host_whitelist = ()
 215     whitelist_tags = {'iframe', 'embed'}
 216
 217     def __init__(self, **kw):
 218         not_an_attribute = object()
 219         for name, value in kw.items():
 220             default = getattr(self, name, not_an_attribute)
 221             if (default is not None and default is not True and default is not False
 222                     and not isinstance(default, (frozenset, set, tuple, list))):
 223                 raise TypeError(
 224                     "Unknown parameter: %s=%r" % (name, value))
 225             setattr(self, name, value)
 226         if self.inline_style is None and 'inline_style' not in kw:
 227             self.inline_style = self.style
 228
 229         if kw.get("allow_tags"):
 230             if kw.get("remove_unknown_tags"):
 231                 raise ValueError("It does not make sense to pass in both "
 232                                  "allow_tags and remove_unknown_tags")
 233             self.remove_unknown_tags = False
 234
 235     # Used to lookup the primary URL for a given tag that is up for
 236     # removal:
 237     _tag_link_attrs = dict(
 238         script='src',
 239         link='href',
 240         # From: http://java.sun.com/j2se/1.4.2/docs/guide/misc/applet.html
 241         # From what I can tell, both attributes can contain a link:
 242         applet=['code', 'object'],
 243         iframe='src',
 244         embed='src',
 245         layer='src',
 246         # FIXME: there doesn't really seem like a general way to figure out what
 247         # links an <object> tag uses; links often go in <param> tags with values
 248         # that we don't really know.  You'd have to have knowledge about specific
 249         # kinds of plugins (probably keyed off classid), and match against those.
 250         ##object=?,
 251         # FIXME: not looking at the action currently, because it is more complex
 252         # than than -- if you keep the form, you should keep the form controls.
 253         ##form='action',
 254         a='href',
 255         )
 256
 257     def __call__(self, doc):
 258         """
 259         Cleans the document.
 260         """
 261         try:
 262             getroot = doc.getroot
 263         except AttributeError:
 264             pass  # Element instance
 265         else:
 266             doc = getroot()  # ElementTree instance, instead of an element
 267         # convert XHTML to HTML
 268         xhtml_to_html(doc)
 269         # Normalize a case that IE treats <image> like <img>, and that
 270         # can confuse either this step or later steps.
 271         for el in doc.iter('image'):
 272             el.tag = 'img'
 273         if not self.comments:
 274             # Of course, if we were going to kill comments anyway, we don't
 275             # need to worry about this
 276             self.kill_conditional_comments(doc)
 277
 278         kill_tags = set(self.kill_tags or ())
 279         remove_tags = set(self.remove_tags or ())
 280         allow_tags = set(self.allow_tags or ())
 281
 282         if self.scripts:
 283             kill_tags.add('script')
 284         if self.safe_attrs_only:
 285             safe_attrs = set(self.safe_attrs)
 286             for el in doc.iter(etree.Element):
 287                 attrib = el.attrib
 288                 for aname in attrib.keys():
 289                     if aname not in safe_attrs:
 290                         del attrib[aname]
 291         if self.javascript:
 292             if not (self.safe_attrs_only and
 293                     self.safe_attrs == defs.safe_attrs):
 294                 # safe_attrs handles events attributes itself
 295                 for el in doc.iter(etree.Element):
 296                     attrib = el.attrib
 297                     for aname in attrib.keys():
 298                         if aname.startswith('on'):
 299                             del attrib[aname]
 300             doc.rewrite_links(self._remove_javascript_link,
 301                               resolve_base_href=False)
 302             # If we're deleting style then we don't have to remove JS links
 303             # from styles, otherwise...
 304             if not self.inline_style:
 305                 for el in _find_styled_elements(doc):
 306                     old = el.get('style')
 307                     new = _css_javascript_re.sub('', old)
 308                     new = _css_import_re.sub('', new)
 309                     if self._has_sneaky_javascript(new):
 310                         # Something tricky is going on...
 311                         del el.attrib['style']
 312                     elif new != old:
 313                         el.set('style', new)
 314             if not self.style:
 315                 for el in list(doc.iter('style')):
 316                     if el.get('type', '').lower().strip() == 'text/javascript':
 317                         el.drop_tree()
 318                         continue
 319                     old = el.text or ''
 320                     new = _css_javascript_re.sub('', old)
 321                     # The imported CSS can do anything; we just can't allow:
 322                     new = _css_import_re.sub('', old)
 323                     if self._has_sneaky_javascript(new):
 324                         # Something tricky is going on...
 325                         el.text = '/* deleted */'
 326                     elif new != old:
 327                         el.text = new
 328         if self.comments:
 329             kill_tags.add(etree.Comment)
 330         if self.processing_instructions:
 331             kill_tags.add(etree.ProcessingInstruction)
 332         if self.style:
 333             kill_tags.add('style')
 334         if self.inline_style:
 335             etree.strip_attributes(doc, 'style')
 336         if self.links:
 337             kill_tags.add('link')
 338         elif self.style or self.javascript:
 339             # We must get rid of included stylesheets if Javascript is not
 340             # allowed, as you can put Javascript in them
 341             for el in list(doc.iter('link')):
 342                 if 'stylesheet' in el.get('rel', '').lower():
 343                     # Note this kills alternate stylesheets as well
 344                     if not self.allow_element(el):
 345                         el.drop_tree()
 346         if self.meta:
 347             kill_tags.add('meta')
 348         if self.page_structure:
 349             remove_tags.update(('head', 'html', 'title'))
 350         if self.embedded:
 351             # FIXME: is <layer> really embedded?
 352             # We should get rid of any <param> tags not inside <applet>;
 353             # These are not really valid anyway.
 354             for el in list(doc.iter('param')):
 355                 found_parent = False
 356                 parent = el.getparent()
 357                 while parent is not None and parent.tag not in ('applet', 'object'):
 358                     parent = parent.getparent()
 359                 if parent is None:
 360                     el.drop_tree()
 361             kill_tags.update(('applet',))
 362             # The alternate contents that are in an iframe are a good fallback:
 363             remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
 364         if self.frames:
 365             # FIXME: ideally we should look at the frame links, but
 366             # generally frames don't mix properly with an HTML
 367             # fragment anyway.
 368             kill_tags.update(defs.frame_tags)
 369         if self.forms:
 370             remove_tags.add('form')
 371             kill_tags.update(('button', 'input', 'select', 'textarea'))
 372         if self.annoying_tags:
 373             remove_tags.update(('blink', 'marquee'))
 374
 375         _remove = []
 376         _kill = []
 377         for el in doc.iter():
 378             if el.tag in kill_tags:
 379                 if self.allow_element(el):
 380                     continue
 381                 _kill.append(el)
 382             elif el.tag in remove_tags:
 383                 if self.allow_element(el):
 384                     continue
 385                 _remove.append(el)
 386
 387         if _remove and _remove[0] == doc:
 388             # We have to drop the parent-most tag, which we can't
 389             # do.  Instead we'll rewrite it:
 390             el = _remove.pop(0)
 391             el.tag = 'div'
 392             el.attrib.clear()
 393         elif _kill and _kill[0] == doc:
 394             # We have to drop the parent-most element, which we can't
 395             # do.  Instead we'll clear it:
 396             el = _kill.pop(0)
 397             if el.tag != 'html':
 398                 el.tag = 'div'
 399             el.clear()
 400
 401         _kill.reverse() # start with innermost tags
 402         for el in _kill:
 403             el.drop_tree()
 404         for el in _remove:
 405             el.drop_tag()
 406
 407         if self.remove_unknown_tags:
 408             if allow_tags:
 409                 raise ValueError(
 410                     "It does not make sense to pass in both allow_tags and remove_unknown_tags")
 411             allow_tags = set(defs.tags)
 412         if allow_tags:
 413             # make sure we do not remove comments/PIs if users want them (which is rare enough)
 414             if not self.comments:
 415                 allow_tags.add(etree.Comment)
 416             if not self.processing_instructions:
 417                 allow_tags.add(etree.ProcessingInstruction)
 418
 419             bad = []
 420             for el in doc.iter():
 421                 if el.tag not in allow_tags:
 422                     bad.append(el)
 423             if bad:
 424                 if bad[0] is doc:
 425                     el = bad.pop(0)
 426                     el.tag = 'div'
 427                     el.attrib.clear()
 428                 for el in bad:
 429                     el.drop_tag()
 430         if self.add_nofollow:
 431             for el in _find_external_links(doc):
 432                 if not self.allow_follow(el):
 433                     rel = el.get('rel')
 434                     if rel:
 435                         if ('nofollow' in rel
 436                                 and ' nofollow ' in (' %s ' % rel)):
 437                             continue
 438                         rel = '%s nofollow' % rel
 439                     else:
 440                         rel = 'nofollow'
 441                     el.set('rel', rel)
 442
 443     def allow_follow(self, anchor):
 444         """
 445         Override to suppress rel="nofollow" on some anchors.
 446         """
 447         return False
 448
 449     def allow_element(self, el):
 450         """
 451         Decide whether an element is configured to be accepted or rejected.
 452
 453         :param el: an element.
 454         :return: true to accept the element or false to reject/discard it.
 455         """
 456         if el.tag not in self._tag_link_attrs:
 457             return False
 458         attr = self._tag_link_attrs[el.tag]
 459         if isinstance(attr, (list, tuple)):
 460             for one_attr in attr:
 461                 url = el.get(one_attr)
 462                 if not url:
 463                     return False
 464                 if not self.allow_embedded_url(el, url):
 465                     return False
 466             return True
 467         else:
 468             url = el.get(attr)
 469             if not url:
 470                 return False
 471             return self.allow_embedded_url(el, url)
 472
 473     def allow_embedded_url(self, el, url):
 474         """
 475         Decide whether a URL that was found in an element's attributes or text
 476         if configured to be accepted or rejected.
 477
 478         :param el: an element.
 479         :param url: a URL found on the element.
 480         :return: true to accept the URL and false to reject it.
 481         """
 482         if self.whitelist_tags is not None and el.tag not in self.whitelist_tags:
 483             return False
 484         scheme, netloc, path, query, fragment = urlsplit(url)
 485         netloc = netloc.lower().split(':', 1)[0]
 486         if scheme not in ('http', 'https'):
 487             return False
 488         if netloc in self.host_whitelist:
 489             return True
 490         return False
 491
 492     def kill_conditional_comments(self, doc):
 493         """
 494         IE conditional comments basically embed HTML that the parser
 495         doesn't normally see.  We can't allow anything like that, so
 496         we'll kill any comments that could be conditional.
 497         """
 498         has_conditional_comment = _conditional_comment_re.search
 499         self._kill_elements(
 500             doc, lambda el: has_conditional_comment(el.text),
 501             etree.Comment)
 502
 503     def _kill_elements(self, doc, condition, iterate=None):
 504         bad = []
 505         for el in doc.iter(iterate):
 506             if condition(el):
 507                 bad.append(el)
 508         for el in bad:
 509             el.drop_tree()
 510
 511     def _remove_javascript_link(self, link):
 512         # links like "j a v a s c r i p t:" might be interpreted in IE
 513         new = _substitute_whitespace('', unquote_plus(link))
 514         if _is_javascript_scheme(new):
 515             # FIXME: should this be None to delete?
 516             return ''
 517         return link
 518
 519     _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub
 520
 521     def _has_sneaky_javascript(self, style):
 522         """
 523         Depending on the browser, stuff like ``e x p r e s s i o n(...)``
 524         can get interpreted, or ``expre/* stuff */ssion(...)``.  This
 525         checks for attempt to do stuff like this.
 526
 527         Typically the response will be to kill the entire style; if you
 528         have just a bit of Javascript in the style another rule will catch
 529         that and remove only the Javascript from the style; this catches
 530         more sneaky attempts.
 531         """
 532         style = self._substitute_comments('', style)
 533         style = style.replace('\\', '')
 534         style = _substitute_whitespace('', style)
 535         style = style.lower()
 536         if 'javascript:' in style:
 537             return True
 538         if 'expression(' in style:
 539             return True
 540         return False
 541
 542     def clean_html(self, html):
 543         result_type = type(html)
 544         if isinstance(html, basestring):
 545             doc = fromstring(html)
 546         else:
 547             doc = copy.deepcopy(html)
 548         self(doc)
 549         return _transform_result(result_type, doc)
 550
# Module-level cleaner with the default options, and its bound
# ``clean_html`` method exposed as a plain function.
clean = Cleaner()
clean_html = clean.clean_html
 553
 554 ############################################################
 555 ## Autolinking
 556 ############################################################
 557
 558 _link_regexes = [
 559     re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I),
 560     # This is conservative, but autolinking can be a bit conservative:
 561     re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_.-]+[a-z]))', re.I),
 562     ]
 563
 564 _avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']
 565
 566 _avoid_hosts = [
 567     re.compile(r'^localhost', re.I),
 568     re.compile(r'\bexample\.(?:com|org|net)$', re.I),
 569     re.compile(r'^127\.0\.0\.1$'),
 570     ]
 571
 572 _avoid_classes = ['nolink']
 573
 574 def autolink(el, link_regexes=_link_regexes,
 575              avoid_elements=_avoid_elements,
 576              avoid_hosts=_avoid_hosts,
 577              avoid_classes=_avoid_classes):
 578     """
 579     Turn any URLs into links.
 580
 581     It will search for links identified by the given regular
 582     expressions (by default mailto and http(s) links).
 583
 584     It won't link text in an element in avoid_elements, or an element
 585     with a class in avoid_classes.  It won't link to anything with a
 586     host that matches one of the regular expressions in avoid_hosts
 587     (default localhost and 127.0.0.1).
 588
 589     If you pass in an element, the element's tail will not be
 590     substituted, only the contents of the element.
 591     """
 592     if el.tag in avoid_elements:
 593         return
 594     class_name = el.get('class')
 595     if class_name:
 596         class_name = class_name.split()
 597         for match_class in avoid_classes:
 598             if match_class in class_name:
 599                 return
 600     for child in list(el):
 601         autolink(child, link_regexes=link_regexes,
 602                  avoid_elements=avoid_elements,
 603                  avoid_hosts=avoid_hosts,
 604                  avoid_classes=avoid_classes)
 605         if child.tail:
 606             text, tail_children = _link_text(
 607                 child.tail, link_regexes, avoid_hosts, factory=el.makeelement)
 608             if tail_children:
 609                 child.tail = text
 610                 index = el.index(child)
 611                 el[index+1:index+1] = tail_children
 612     if el.text:
 613         text, pre_children = _link_text(
 614             el.text, link_regexes, avoid_hosts, factory=el.makeelement)
 615         if pre_children:
 616             el.text = text
 617             el[:0] = pre_children
 618
 619 def _link_text(text, link_regexes, avoid_hosts, factory):
 620     leading_text = ''
 621     links = []
 622     last_pos = 0
 623     while 1:
 624         best_match, best_pos = None, None
 625         for regex in link_regexes:
 626             regex_pos = last_pos
 627             while 1:
 628                 match = regex.search(text, pos=regex_pos)
 629                 if match is None:
 630                     break
 631                 host = match.group('host')
 632                 for host_regex in avoid_hosts:
 633                     if host_regex.search(host):
 634                         regex_pos = match.end()
 635                         break
 636                 else:
 637                     break
 638             if match is None:
 639                 continue
 640             if best_pos is None or match.start() < best_pos:
 641                 best_match = match
 642                 best_pos = match.start()
 643         if best_match is None:
 644             # No more matches
 645             if links:
 646                 assert not links[-1].tail
 647                 links[-1].tail = text
 648             else:
 649                 assert not leading_text
 650                 leading_text = text
 651             break
 652         link = best_match.group(0)
 653         end = best_match.end()
 654         if link.endswith('.') or link.endswith(','):
 655             # These punctuation marks shouldn't end a link
 656             end -= 1
 657             link = link[:-1]
 658         prev_text = text[:best_match.start()]
 659         if links:
 660             assert not links[-1].tail
 661             links[-1].tail = prev_text
 662         else:
 663             assert not leading_text
 664             leading_text = prev_text
 665         anchor = factory('a')
 666         anchor.set('href', link)
 667         body = best_match.group('body')
 668         if not body:
 669             body = link
 670         if body.endswith('.') or body.endswith(','):
 671             body = body[:-1]
 672         anchor.text = body
 673         links.append(anchor)
 674         text = text[end:]
 675     return leading_text, links
 676
 677 def autolink_html(html, *args, **kw):
 678     result_type = type(html)
 679     if isinstance(html, basestring):
 680         doc = fromstring(html)
 681     else:
 682         doc = copy.deepcopy(html)
 683     autolink(doc, *args, **kw)
 684     return _transform_result(result_type, doc)
 685
 686 autolink_html.__doc__ = autolink.__doc__
 687
 688 ############################################################
 689 ## Word wrapping
 690 ############################################################
 691
 692 _avoid_word_break_elements = ['pre', 'textarea', 'code']
 693 _avoid_word_break_classes = ['nobreak']
 694
 695 def word_break(el, max_width=40,
 696                avoid_elements=_avoid_word_break_elements,
 697                avoid_classes=_avoid_word_break_classes,
 698                break_character=unichr(0x200b)):
 699     """
 700     Breaks any long words found in the body of the text (not attributes).
 701
 702     Doesn't effect any of the tags in avoid_elements, by default
 703     ``<textarea>`` and ``<pre>``
 704
 705     Breaks words by inserting &#8203;, which is a unicode character
 706     for Zero Width Space character.  This generally takes up no space
 707     in rendering, but does copy as a space, and in monospace contexts
 708     usually takes up space.
 709
 710     See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
 711     """
 712     # Character suggestion of &#8203 comes from:
 713     #   http://www.cs.tut.fi/~jkorpela/html/nobr.html
 714     if el.tag in _avoid_word_break_elements:
 715         return
 716     class_name = el.get('class')
 717     if class_name:
 718         dont_break = False
 719         class_name = class_name.split()
 720         for avoid in avoid_classes:
 721             if avoid in class_name:
 722                 dont_break = True
 723                 break
 724         if dont_break:
 725             return
 726     if el.text:
 727         el.text = _break_text(el.text, max_width, break_character)
 728     for child in el:
 729         word_break(child, max_width=max_width,
 730                    avoid_elements=avoid_elements,
 731                    avoid_classes=avoid_classes,
 732                    break_character=break_character)
 733         if child.tail:
 734             child.tail = _break_text(child.tail, max_width, break_character)
 735
 736 def word_break_html(html, *args, **kw):
 737     result_type = type(html)
 738     doc = fromstring(html)
 739     word_break(doc, *args, **kw)
 740     return _transform_result(result_type, doc)
 741
 742 def _break_text(text, max_width, break_character):
 743     words = text.split()
 744     for word in words:
 745         if len(word) > max_width:
 746             replacement = _insert_break(word, max_width, break_character)
 747             text = text.replace(word, replacement)
 748     return text
 749
 750 _break_prefer_re = re.compile(r'[^a-z]', re.I)
 751
 752 def _insert_break(word, width, break_character):
 753     orig_word = word
 754     result = ''
 755     while len(word) > width:
 756         start = word[:width]
 757         breaks = list(_break_prefer_re.finditer(start))
 758         if breaks:
 759             last_break = breaks[-1]
 760             # Only walk back up to 10 characters to find a nice break:
 761             if last_break.end() > width-10:
 762                 # FIXME: should the break character be at the end of the
 763                 # chunk, or the beginning of the next chunk?
 764                 start = word[:last_break.end()]
 765         result += start + break_character
 766         word = word[len(start):]
 767     result += word
 768     return result
 769